In [None]:
# Cyber Insurance ML Project (90+ Cells Version)

# --------------------------------------------
# Part 1: Setup & Dataset Generation (Cells 1-20)
# --------------------------------------------

# Install dependencies
!pip install xgboost flask-ngrok imbalanced-learn

import pandas as pd
import numpy as np
import random
import os

# Dataset generation
# The 20 industry labels for which synthetic datasets are generated. Each
# label is written verbatim into the `industry_type` column and, lower-cased
# with spaces replaced by underscores, into the CSV filename.
industry_types = [
    'Finance', 'Healthcare', 'Education', 'E-commerce', 'Energy',
    'Manufacturing', 'Transportation', 'Government', 'Small Business', 'Cloud Services',
    'Retail', 'Legal', 'Telecom', 'Insurance', 'Media',
    'Hospitality', 'Pharmaceuticals', 'Technology', 'Logistics', 'Construction'
]

# Severity labels; generate_dataset() picks one per row with random.choice,
# so each level is equally likely.
threat_levels = ['Low', 'Medium', 'High']

# Canned incident descriptions keyed by threat level. generate_dataset()
# picks one of the three strings for the row's threat level at random.
descriptions = {
    'Low': [
        "Minor phishing email detected and blocked.",
        "Non-critical system scan revealed outdated software.",
        "Unsuccessful login attempts recorded on admin panel."
    ],
    'Medium': [
        "Suspicious network traffic suggesting malware.",
        "Unauthorized access to a test server.",
        "Compromised credentials used for lateral movement."
    ],
    'High': [
        "Sensitive data exfiltrated via ransomware attack.",
        "Critical database breached through zero-day vulnerability.",
        "Widespread phishing led to multiple account takeovers."
    ]
}


def generate_dataset(industry, samples=100):
    """Build a synthetic cyber-incident table for a single industry.

    Every row is drawn independently with the ``random`` module:

    * ``company_size``: integer in [50, 1000] (both ends inclusive).
    * ``threat_level``: one entry of the module-level ``threat_levels``.
    * ``vulnerability_score``: uniform in [2.0, 9.5], rounded to 2 decimals.
    * ``incident_cost``: uniform in [5000, 150000], rounded to 2 decimals.
    * ``claim_approved``: 1 when the threat is 'High' or the cost exceeds
      30000; otherwise a coin flip between 0 and 1.
    * ``claim_amount``: the cost scaled by a uniform factor in [1.05, 1.5]
      (rounded to 2 decimals) for approved claims, else 0.
    * ``incident_description``: one of ``descriptions[threat_level]``.

    Args:
        industry: Label stored unchanged in the ``industry_type`` column.
        samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with the eight columns listed above.
    """
    column_names = [
        'incident_description', 'company_size', 'industry_type', 'threat_level',
        'vulnerability_score', 'incident_cost', 'claim_approved', 'claim_amount'
    ]

    rows = []
    for _ in range(samples):
        # The order of the random draws is kept fixed so that a seeded run
        # produces the same table as before.
        company_size = random.randint(50, 1000)
        level = random.choice(threat_levels)
        vulnerability = round(random.uniform(2.0, 9.5), 2)
        incident_cost = round(random.uniform(5000, 150000), 2)

        # Only cheap, non-'High' incidents are left to chance.
        if level != 'High' and incident_cost <= 30000:
            is_approved = random.choice([0, 1])
        else:
            is_approved = 1

        if is_approved:
            payout = round(incident_cost * random.uniform(1.05, 1.5), 2)
        else:
            payout = 0

        text = random.choice(descriptions[level])
        rows.append([
            text, company_size, industry, level,
            vulnerability, incident_cost, is_approved, payout
        ])

    return pd.DataFrame(rows, columns=column_names)

# Write one CSV per industry into the dataset directory, creating the
# directory first if it does not exist yet.
output_dir = "/content/cyber_datasets"
os.makedirs(output_dir, exist_ok=True)
for industry in industry_types:
    # File stem: lower-cased industry label with spaces turned into underscores.
    slug = "_".join(industry.lower().split(' '))
    filename = "cyber_incident_" + slug + ".csv"
    df = generate_dataset(industry)
    target_path = os.path.join(output_dir, filename)
    df.to_csv(target_path, index=False)


# --------------------------------------------
# Part 2: Data Preprocessing (Cells 21-30)
# --------------------------------------------

import glob
from sklearn.preprocessing import StandardScaler

# Load every per-industry CSV and stack them into one frame. glob returns
# paths in arbitrary (filesystem-dependent) order, so sort them: otherwise the
# row order, and therefore the seeded train/test split, differs between
# machines.
files = sorted(glob.glob("/content/cyber_datasets/*.csv"))
if not files:
    raise FileNotFoundError(
        "No CSV files found in /content/cyber_datasets; "
        "run the dataset generation step first."
    )
df_list = [pd.read_csv(f) for f in files]
data = pd.concat(df_list, ignore_index=True)

# One-hot encode the two categorical columns; the indicator columns are
# appended after the remaining original columns.
data = pd.get_dummies(data, columns=['industry_type', 'threat_level'])

# Drop nulls if any
data.dropna(inplace=True)

# Features: everything except the free-text description and the two targets.
X = data.drop(['incident_description', 'claim_approved', 'claim_amount'], axis=1)
y_class = data['claim_approved']  # classification target (0/1)
y_reg = data['claim_amount']      # regression target

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows influence the scaling statistics (mild leakage). The same
# fitted scaler is reused by the /predict endpoint.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# --------------------------------------------
# Part 3: ML Classification Models (Cells 31-60)
# --------------------------------------------

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# Hold out 20% of the rows for evaluation. Stratify on the label so both
# splits keep the same approved/rejected ratio (the classes are imbalanced,
# which is why SMOTE is applied below).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Balance data: oversample the minority class on the training split only.
# Seeded so the synthetic samples, and every metric below, are reproducible.
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Grid search RF (3-fold CV, optimising F1)
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), {
    'n_estimators': [100, 200],
    'max_depth': [10, None],
    'min_samples_split': [2, 5]
}, cv=3, scoring='f1')
grid_rf.fit(X_resampled, y_resampled)
best_rf = grid_rf.best_estimator_

# XGBoost. `use_label_encoder` is deprecated and ignored by current xgboost
# (it only triggers a "Parameters ... are not used" warning), so it is omitted.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_resampled, y_resampled)

# Logistic
log_clf = LogisticRegression(max_iter=300)
log_clf.fit(X_resampled, y_resampled)

# Evaluate each model on the untouched (non-resampled) test split.
for model, name in zip([best_rf, xgb_clf, log_clf], ['RF', 'XGB', 'LOG']):
    pred = model.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, pred)}")
    print(f"{name} F1: {f1_score(y_test, pred)}")


# --------------------------------------------
# Part 4: Regression Model (Claim Amount)
# --------------------------------------------

from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor()

# Train on the real training rows with their own claim amounts.
# X_train rows are in the same order as y_train, and y_train keeps the original
# DataFrame index, so y_reg.loc[y_train.index] yields the matching targets.
# (Tiling y_reg over the SMOTE-resampled matrix paired shuffled and synthetic
# feature rows with unrelated claim amounts; SMOTE produces no regression
# target for its synthetic rows, so they are not used here.)
y_reg_train = y_reg.loc[y_train.index]

# Train the regression model
rf_reg.fit(X_train, y_reg_train)



# --------------------------------------------
# Part 5: Monte Carlo Simulation (Cells 61-70)
# --------------------------------------------

def monte_carlo_simulation(model, X, iterations=1000, noise_std=0.01):
    """Average a model's predictions over noise-perturbed copies of ``X``.

    On every iteration, zero-mean Gaussian noise with standard deviation
    ``noise_std`` is added element-wise to ``X`` and ``model.predict`` is
    called on the perturbed input. The element-wise mean of all predictions
    is returned.

    Args:
        model: Any object exposing ``predict(X)``.
        X: Feature matrix (array-like) accepted by ``model.predict``.
        iterations: Number of perturbed predictions to average; must be >= 1.
        noise_std: Standard deviation of the added Gaussian noise.

    Returns:
        ``numpy.ndarray`` with the mean prediction for each row of ``X``.

    Raises:
        ValueError: If ``iterations`` is less than 1 (the mean of zero
            predictions would otherwise silently evaluate to NaN).
    """
    if iterations < 1:
        raise ValueError("iterations must be a positive integer")

    # np.shape also works for plain nested lists, not only ndarrays/DataFrames.
    shape = np.shape(X)
    predictions = [
        model.predict(X + np.random.normal(0, noise_std, shape))
        for _ in range(iterations)
    ]
    return np.mean(predictions, axis=0)


# --------------------------------------------
# Part 6: Flask Web App (Cells 71-90)
# --------------------------------------------

from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

# Create the Flask application that hosts the /predict endpoint.
app = Flask(__name__)
# Hand the app to flask-ngrok's helper. Presumably this makes app.run() also
# open a public ngrok tunnel — confirm against the flask-ngrok documentation.
run_with_ngrok(app)

@app.route('/predict', methods=['POST'])
def predict():
    """Predict claim approval and an estimated claim amount for one incident.

    Expects a JSON object with:
        company_size, vulnerability_score, incident_cost: numbers.
        industry_onehots, threat_onehots: lists of numbers, in the same
            column order as the one-hot columns the scaler was fitted on.

    Returns:
        200 with ``{'claim_approved': int, 'estimated_claim_amount': float}``,
        or 400 with ``{'error': str}`` when the payload is missing, malformed
        or has the wrong number of features.
    """
    # silent=True yields None instead of raising on a non-JSON body, so the
    # client gets a uniform 400 response below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    required = (
        'company_size', 'vulnerability_score', 'incident_cost',
        'industry_onehots', 'threat_onehots',
    )
    missing = [key for key in required if key not in payload]
    if missing:
        return jsonify({'error': f"Missing fields: {', '.join(missing)}"}), 400

    industry_onehots = payload['industry_onehots']
    threat_onehots = payload['threat_onehots']
    if not isinstance(industry_onehots, list) or not isinstance(threat_onehots, list):
        return jsonify({
            'error': 'industry_onehots and threat_onehots must be lists'
        }), 400

    # Feature order must match the training matrix: the three numeric columns
    # first, then the industry indicators, then the threat-level indicators.
    row = [
        payload['company_size'],
        payload['vulnerability_score'],
        payload['incident_cost'],
    ] + industry_onehots + threat_onehots
    try:
        features = np.array([row], dtype=float)
    except (TypeError, ValueError):
        return jsonify({'error': 'All feature values must be numeric'}), 400

    # Reject vectors of the wrong width up front instead of letting the
    # scaler fail with an internal server error.
    expected = getattr(scaler, 'n_features_in_', features.shape[1])
    if features.shape[1] != expected:
        return jsonify({
            'error': f"Expected {expected} features, got {features.shape[1]}"
        }), 400

    features_scaled = scaler.transform(features)
    claim = int(best_rf.predict(features_scaled)[0])
    amount = float(monte_carlo_simulation(rf_reg, features_scaled, 1000)[0])

    return jsonify({
        'claim_approved': claim,
        'estimated_claim_amount': round(amount, 2)
    })

# Start the server. The captured notebook output shows it serving on
# http://127.0.0.1:5000 until interrupted, followed by a ConnectionRefusedError
# raised in a background thread — presumably the ngrok tunnel lookup; verify
# that ngrok is installed and reachable in this environment.
app.run()


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


Parameters: { "use_label_encoder" } are not used.



RF Accuracy: 0.9475
RF F1: 0.9725490196078431
XGB Accuracy: 0.9525
XGB F1: 0.9750328515111695
LOG Accuracy: 0.93
LOG F1: 0.9622641509433962
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit
Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 493, in _make_reques