In [1]:
import pandas as pd
import random
from faker import Faker

# Initialize Faker and set seed for reproducibility
fake = Faker()
Faker.seed(0)
random.seed(0)

# Define possible values for some fields
vehicle_makes = ["Tesla", "Waymo", "Cruise", "Rivian", "Nuro", "Zoox"]
vehicle_models = {
    "Tesla": ["Model S", "Model X", "Model 3", "Model Y", "Cybertruck"],
    "Waymo": ["Jaguar I-Pace", "RoboTaxi", "Firefly"],
    "Cruise": ["Bolt", "Origin"],
    "Rivian": ["R1T", "R1S"],
    "Nuro": ["R2", "Delivery"],
    "Zoox": ["Autonomous"]
}
autonomy_levels = ["Level 3", "Level 4", "Level 5"]
coverage_types = ["Basic", "Collision", "Comprehensive"]
genders = ["Male", "Female"]
countries = ["China", "Singapore"]
fraud_prob = [0, 1]  # 0 = No Fraud, 1 = Fraud

# Helper function to generate random data with fraud column
def generate_policy_data_with_fraud(num_records=1000):
    data = []
    for i in range(num_records):
        policy_id = 10000 + i
        policyholder_name = fake.name()
        age = random.randint(25, 70)
        gender = random.choice(genders)
        policy_start_date = pd.to_datetime(fake.date_this_year(before_today=True, after_today=False))
        policy_end_date = policy_start_date + pd.DateOffset(years=1)
        country = random.choice(countries)
        vehicle_make = random.choice(vehicle_makes)
        vehicle_model = random.choice(vehicle_models[vehicle_make])
        vehicle_year = random.randint(2019, 2023)
        autonomy_level = random.choice(autonomy_levels)
        vin = fake.bothify(text='1HGCM82633A######', letters='0123456789')
        coverage_type = random.choice(coverage_types)
        annual_premium = random.randint(800, 1500)
        deductible = random.randint(300, 700)
        claim_history = random.randint(0, 3)
        last_claim_date = pd.to_datetime(fake.date_this_year()) if claim_history > 0 else pd.NaT
        claim_amount = random.randint(0, 5000) if claim_history > 0 else 0
        safety_score = random.randint(60, 100)
        num_accidents = random.randint(0, 5)
        iot_monitoring = random.choice(["Yes", "No"])
        past_fraud_record = random.choice(fraud_prob)  # Adding fraud record column
        
        # Determine policy status
        current_date = pd.Timestamp.now()
        policy_status = "Active" if policy_end_date > current_date else random.choice(["Expired", "Canceled"])

        data.append([
            policy_id, policyholder_name, age, gender, policy_start_date, policy_end_date,
            country, vehicle_make, vehicle_model, vehicle_year, autonomy_level, vin,
            coverage_type, annual_premium, deductible, claim_history, last_claim_date,
            claim_amount, safety_score, num_accidents, iot_monitoring, past_fraud_record, policy_status
        ])
    
    # Create DataFrame
    df = pd.DataFrame(data, columns=[
        "Policy_ID", "Policyholder_Name", "Age", "Gender", "Policy_Start_Date", "Policy_End_Date",
        "Country", "Vehicle_Make", "Vehicle_Model", "Vehicle_Year", "Autonomy_Level", "VIN",
        "Coverage_Type", "Annual_Premium", "Deductible", "Claim_History", "Last_Claim_Date",
        "Claim_Amount", "Safety_Score", "Num_Accidents", "IoT_Monitoring", "Past_Fraud_Record", "Policy_Status"
    ])
    
    return df

# Generate synthetic policy data
num_records = 1000
df_policy = generate_policy_data_with_fraud(num_records)

# Display the first few rows of the dataframe
print(df_policy.head())

# Save to CSV
# output_path = r"C:\Users\Soumya Haridas\Downloads\synthetic_policy_data_with_fraud.csv"
# df_policy.to_csv(output_path, index=False)

# print(f"CSV file saved as '{output_path}'")


   Policy_ID Policyholder_Name  Age  Gender Policy_Start_Date Policy_End_Date  \
0      10000      Norma Fisher   49  Female        2024-04-10      2025-04-10   
1      10001    Colleen Taylor   64  Female        2024-02-24      2025-02-24   
2      10002    Nicholas Nolan   60  Female        2024-09-22      2025-09-22   
3      10003        Susan Levy   45    Male        2024-07-04      2025-07-04   
4      10004         Ryan Page   44  Female        2024-09-16      2025-09-16   

     Country Vehicle_Make Vehicle_Model  Vehicle_Year  ... Annual_Premium  \
0      China       Cruise        Origin          2022  ...           1166   
1      China       Cruise          Bolt          2019  ...           1283   
2  Singapore         Nuro      Delivery          2019  ...            895   
3      China         Nuro            R2          2020  ...           1258   
4      China         Nuro      Delivery          2023  ...           1360   

  Deductible Claim_History  Last_Claim_Date  Claim

In [None]:
# ##OLD CODE THAT WORKS - RANDOMFOREST


# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import mean_squared_error

# # Load the synthetic policy data generated previously
# #df_policy = pd.read_csv(r"C:\Users\Soumya Haridas\Downloads\synthetic_policy_data_with_fraud.csv")

# # Step 2.1: Feature Engineering
# # Encode categorical variables
# label_encoders = {}
# for column in ["Gender", "Country", "Vehicle_Make", "Vehicle_Model", "Autonomy_Level", 
#                "Coverage_Type", "IoT_Monitoring", "Policy_Status"]:
#     le = LabelEncoder()
#     df_policy[column] = le.fit_transform(df_policy[column])
#     label_encoders[column] = le

# # Convert date columns to numeric (timestamp)
# df_policy['Policy_Start_Date'] = pd.to_datetime(df_policy['Policy_Start_Date']).astype('int64') // 10**9
# df_policy['Policy_End_Date'] = pd.to_datetime(df_policy['Policy_End_Date']).astype('int64') // 10**9

# # Handle 'Last_Claim_Date' column by replacing NaT with a default value before conversion
# df_policy['Last_Claim_Date'] = pd.to_datetime(df_policy['Last_Claim_Date'], errors='coerce')

# # Replace NaT with 0 (or a default timestamp, e.g., 0 for no claim history)
# df_policy['Last_Claim_Date'] = df_policy['Last_Claim_Date'].fillna(pd.Timestamp(0))
# df_policy['Last_Claim_Date'] = df_policy['Last_Claim_Date'].astype('int64') // 10**9

# # Select features and target for ML model
# features = ["Age", "Gender", "Country", "Vehicle_Make", "Vehicle_Model", "Vehicle_Year",
#             "Autonomy_Level", "Coverage_Type", "Annual_Premium", "Deductible", "Claim_History",
#             "Claim_Amount", "Safety_Score", "Num_Accidents", "IoT_Monitoring", "Past_Fraud_Record"]
# X = df_policy[features]
# y = df_policy["Safety_Score"]  # Using safety score as a proxy for initial ML-based risk score

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# # Step 2.2: Train an ML Model
# # Use Random Forest to predict risk score
# rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
# rf_model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = rf_model.predict(X_test)

# # Calculate the mean squared error for model evaluation
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error of the model: {mse:.4f}")

# # Step 2.3: Combine ML Risk Score and Rule-Based Criteria
# # Predict risk scores on the entire dataset
# df_policy["ML_Risk_Score"] = rf_model.predict(X)

# # Calculate dynamic risk score (DRS)
# def calculate_dynamic_risk_score(row):
#     risk_score = row["ML_Risk_Score"]
    
#     # Rule-based adjustments
#     if row["Num_Accidents"] >= 3:
#         risk_score += 10  # Increase risk for higher number of accidents
#     if row["Past_Fraud_Record"] == 1:
#         risk_score += 20  # Increase risk if there is a past fraud record

#     return risk_score

# df_policy["Dynamic_Risk_Score"] = df_policy.apply(calculate_dynamic_risk_score, axis=1)

# # Step 2.4: Categorize the Dynamic Risk Score
# def categorize_risk(drs):
#     if drs < 40:
#         return "LOW"
#     elif 40 <= drs < 60:
#         return "MEDIUM"
#     elif 60 <= drs < 70:
#         return "HIGH A"
#     elif 70 <= drs < 80:
#         return "HIGH B"
#     else:
#         return "VERY HIGH"

# df_policy["Risk_Category"] = df_policy["Dynamic_Risk_Score"].apply(categorize_risk)

# # Display the first few rows of the dataframe with DRS and Risk Category
# print(df_policy[["Policy_ID", "ML_Risk_Score", "Dynamic_Risk_Score", "Risk_Category"]].head())

# # Save the final dataset with Dynamic Risk Score
# output_path_final = r"C:\Users\Soumya Haridas\Downloads\policy_data_with_drs.csv"
# df_policy.to_csv(output_path_final, index=False)
# print(f"Final dataset with DRS saved as '{output_path_final}'")


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error



# Load and preprocess data (excluding 'Policy_Status' if it doesn't exist)
label_encoders = {}
for column in ["Gender", "Country", "Vehicle_Make", "Vehicle_Model", "Autonomy_Level", 
               "Coverage_Type", "IoT_Monitoring"]:
    le = LabelEncoder()
    df_policy[column] = le.fit_transform(df_policy[column])
    label_encoders[column] = le

# Continue with the existing preprocessing steps...
# Convert date columns to numeric (timestamp)
df_policy['Policy_Start_Date'] = pd.to_datetime(df_policy['Policy_Start_Date']).astype('int64') // 10**9
df_policy['Policy_End_Date'] = pd.to_datetime(df_policy['Policy_End_Date']).astype('int64') // 10**9
df_policy['Last_Claim_Date'] = pd.to_datetime(df_policy['Last_Claim_Date'], errors='coerce').fillna(pd.Timestamp(0))
df_policy['Last_Claim_Date'] = df_policy['Last_Claim_Date'].astype('int64') // 10**9

# Select features and target
features = ["Age", "Gender", "Country", "Vehicle_Make", "Vehicle_Model", "Vehicle_Year",
            "Autonomy_Level", "Coverage_Type", "Annual_Premium", "Deductible", "Claim_History",
            "Claim_Amount", "Safety_Score", "Num_Accidents", "IoT_Monitoring", "Past_Fraud_Record"]
X = df_policy[features]
y = df_policy["Safety_Score"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Continue with ensemble model creation...


### Step 2: Create an Ensemble Model

# Define base models
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=0)
lr_model = LinearRegression()

# Create a Voting Regressor
voting_regressor = VotingRegressor(estimators=[
    ('rf', rf_model), 
    ('gb', gb_model), 
    ('lr', lr_model)
])

# Create a Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('rf', rf_model), 
        ('gb', gb_model)
    ],
    final_estimator=LinearRegression()
)

# Fit the ensemble models on training data
voting_regressor.fit(X_train, y_train)
stacking_regressor.fit(X_train, y_train)

# Predict on the test set using the ensemble models
y_pred_voting = voting_regressor.predict(X_test)
y_pred_stacking = stacking_regressor.predict(X_test)

# Calculate mean squared error
mse_voting = mean_squared_error(y_test, y_pred_voting)
mse_stacking = mean_squared_error(y_test, y_pred_stacking)

print(f"Mean Squared Error of Voting Regressor: {mse_voting:.4f}")
print(f"Mean Squared Error of Stacking Regressor: {mse_stacking:.4f}")

### Step 3: Combine ML Risk Score and Rule-Based Criteria

# Use the Stacking Regressor's prediction as the ML risk score
df_policy["ML_Risk_Score"] = stacking_regressor.predict(X)

# Calculate dynamic risk score (DRS)
def calculate_dynamic_risk_score(row):
    risk_score = row["ML_Risk_Score"]
    # Rule-based adjustments
    if row["Num_Accidents"] >= 3:
        risk_score += 10
    if row["Past_Fraud_Record"] == 1:
        risk_score += 20
    return risk_score

df_policy["Dynamic_Risk_Score"] = df_policy.apply(calculate_dynamic_risk_score, axis=1)

# Categorize the Dynamic Risk Score
def categorize_risk(drs):
    if drs < 40:
        return "LOW"
    elif 40 <= drs < 60:
        return "MEDIUM"
    elif 60 <= drs < 70:
        return "HIGH A"
    elif 70 <= drs < 80:
        return "HIGH B"
    else:
        return "VERY HIGH"

df_policy["Risk_Category"] = df_policy["Dynamic_Risk_Score"].apply(categorize_risk)

# Display the final output
print(df_policy[["Policy_ID", "ML_Risk_Score", "Dynamic_Risk_Score", "Risk_Category"]].head())


Mean Squared Error of Voting Regressor: 0.0000
Mean Squared Error of Stacking Regressor: 0.0000
   Policy_ID  ML_Risk_Score  Dynamic_Risk_Score Risk_Category
0      10000      67.999857           67.999857        HIGH A
1      10001      82.000034           92.000034     VERY HIGH
2      10002      98.999914          108.999914     VERY HIGH
3      10003      79.999999           89.999999     VERY HIGH
4      10004      64.999839           94.999839     VERY HIGH


In [5]:
import pickle

# Save the Stacking Regressor as a .pkl file
with open('stacking_regressor_model.pkl', 'wb') as file:
    pickle.dump(stacking_regressor, file)

print("Model saved as 'stacking_regressor_model.pkl'")


Model saved as 'stacking_regressor_model.pkl'


   ML_Risk_Score  Dynamic_Risk_Score Risk_Category
0       85.00006            85.00006     VERY HIGH
