1. Policyholder Information: This includes demographic details such as age,
gender, occupation, marital status, and geographical location.
2. Claim History: Information regarding past insurance claims, including claim
amounts, types of claims (e.g., medical, automobile), frequency of claims, and
claim durations.
3. Policy Details: Details about the insurance policies held by the policyholders,
such as coverage type, policy duration, premium amount, and deductibles.
4. Risk Factors: Variables indicating potential risk factors associated with
policyholders, such as credit score, driving record (for automobile insurance),
health status (for medical insurance), and property characteristics (for home
insurance).
5. External Factors: Factors external to the policyholders that may influence claim
likelihood, such as economic indicators, weather conditions, and regulatory
changes.

## Data Preprocessing 

# Task 1: Data Cleaning and Initial Processing

In [None]:
#2nd test 

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.ensemble import IsolationForest
from mlxtend.frequent_patterns import apriori, association_rules
import plotly.express as px

# Load the dataset
# Reads 'data.csv' (path relative to the working directory) into `df`, the
# DataFrame that the rest of the notebook cleans and analyses in place.
df = pd.read_csv('data.csv')

# Function to extract numerical values from torque and power columns
def extract_number(value):
    """Return the figure in front of ``'Nm@'`` as a float.

    Args:
        value: A string such as ``"250Nm@2750rpm"``, a missing value, or a
            value that has already been converted to a number.

    Returns:
        The missing value unchanged, or the parsed/passed-through float.

    Raises:
        ValueError: If a string does not start with a parsable number.
    """
    if pd.isna(value):
        return value
    # Already-numeric input happens when the cleaning cell is re-run on a
    # column that was converted earlier; pass it through instead of crashing
    # on the missing .split attribute.
    if not isinstance(value, str):
        return float(value)
    return float(value.split('Nm@')[0])

def extract_power(value):
    """Return the figure in front of ``'bhp@'`` as a float.

    Args:
        value: A string such as ``"88.5bhp@6000rpm"``, a missing value, or a
            value that has already been converted to a number.

    Returns:
        The missing value unchanged, or the parsed/passed-through float.

    Raises:
        ValueError: If a string does not start with a parsable number.
    """
    if pd.isna(value):
        return value
    # Already-numeric input happens when the cleaning cell is re-run on a
    # column that was converted earlier; pass it through instead of crashing
    # on the missing .split attribute.
    if not isinstance(value, str):
        return float(value)
    return float(value.split('bhp@')[0])

# Derive numeric engine features from the raw max_power text, which is parsed
# here as "<bhp>bhp@<rpm>rpm". This has to run before max_power itself is
# overwritten with floats further down.
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
df['rpm'] = df['max_power'].str.extract(r'@(\d+)rpm', expand=False).astype(int)
# The decimal part is optional so a whole-number rating such as "40bhp" is
# parsed instead of silently becoming NaN; anchoring on "bhp" keeps the match
# on the power figure.
df["horse_power"] = df['max_power'].str.extract(r'(\d+(?:\.\d+)?)bhp', expand=False).astype(float)

# Clean max_torque and max_power columns: replace the raw strings with the
# leading numeric value (max_power ends up holding the same figure as horse_power).
df['max_torque'] = df['max_torque'].apply(extract_number)
df['max_power'] = df['max_power'].apply(extract_power)

# Report how many missing values each column contains
print("\nMissing values in each column:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Collect the string columns whose distinct values are exactly 'Yes' and 'No'
yes_no_values = {'Yes', 'No'}
boolean_columns = [
    name
    for name in df.columns
    if df[name].dtype == 'object' and set(df[name].unique()) == yes_no_values
]
print("Columns containing Yes/No values:", boolean_columns)

# Turn those columns into 1/0 flags
yes_no_to_flag = {'Yes': 1, 'No': 0}
for name in boolean_columns:
    df[name] = df[name].map(yes_no_to_flag)

# Columns that are standardized in place (zero mean, unit variance)
continuous_features = [
    'vehicle_age', 'customer_age',
    'displacement', 'turning_radius', 'length', 'width', 'gross_weight',
    'max_torque', 'max_power',  # numeric only after the extraction step above
]

scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[continuous_features])
df[continuous_features] = standardized_values

# Show summary statistics of the standardized columns as a sanity check
print("\nSummary statistics of standardized features:")
standardized_summary = df[continuous_features].describe()
print(standardized_summary)

# Define categorical features - these are columns with text or categorical values
categorical_features = ['region_code', 'segment', 'model', 'fuel_type', 'engine_type',
                        'airbags', 'rear_brakes_type', 'cylinder', 'transmission_type',
                        'steering_type']

# Encode categorical variables as integer codes.
# The fitted encoder of every column is kept so the codes can be translated
# back later (le.classes_ / le.inverse_transform) instead of being lost.
feature_encoders = {}
for column in categorical_features:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    feature_encoders[column] = le

# Preview the processed data
print("\nFirst few rows of processed dataset:")
processed_preview = df.head()
print(processed_preview)

# Descriptive statistics of the processed columns
print("\nProcessed dataset statistics:")
processed_stats = df.describe()
print(processed_stats)

# Persist the cleaned data (without the index column) for later use
df.to_csv('cleaned_insurance_claims.csv', index=False)
print("\nCleaned dataset saved successfully!")


# Task 2: Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overall Claims Distribution
plt.figure(figsize=(10, 6))
claims_dist = df['claim_status'].value_counts()
# Draw the bars once; a second, uncolored plt.bar call would paint over the
# colored bars.
plt.bar(claims_dist.index, claims_dist.values, color=['lightblue', 'lightcoral'])
plt.title('Overall Distribution of Claims')
plt.xlabel('Claim Status (0: No Claim, 1: Claim)')
plt.ylabel('Count')
# Label every bar that exists instead of assuming both classes are present
for status, count in claims_dist.items():
    plt.text(status, count, f'{count}', ha='center', va='bottom')
plt.grid(True, alpha=0.3)
plt.xticks([0, 1])
plt.show()

# Calculate claim rate percentage (0 when there is no claim at all)
claim_rate = (claims_dist.get(1, 0) / len(df)) * 100
print(f"\nOverall Claim Rate: {claim_rate:.2f}%")

In [None]:
# Reload the raw CSV so the human-readable segment names are available again
original_df = pd.read_csv('data.csv')

# unique() lists values in order of first appearance, so zipping the two lists
# pairs each code in df with the raw name at the same position.
# NOTE(review): this assumes df and original_df still have the same rows in
# the same order - confirm if rows are ever dropped or shuffled.
current_segments = df['segment'].unique()
raw_segments = original_df['segment'].unique()
segment_mapping = dict(zip(current_segments, raw_segments))
df['segment'] = df['segment'].map(segment_mapping)

# Claim rate (%) per vehicle segment
segment_claims = pd.crosstab(df['segment'], df['claim_status'], normalize='index') * 100
segment_claim_rate = segment_claims[1]

plt.figure(figsize=(12, 6))
segment_claim_rate.plot(kind='bar', color='lightblue')
plt.title('Claim Rate by Vehicle Segment', fontsize=12, pad=15)
plt.xlabel('Vehicle Segment')
plt.ylabel('Claim Rate (%)')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the same rates, highest first, for verification
print("\nClaim rates by vehicle segment:")
print(segment_claim_rate.sort_values(ascending=False))

In [None]:
# Claim rate (%) per fuel type, computed on the raw data
fuel_claims = pd.crosstab(
    original_df['fuel_type'],
    original_df['claim_status'],
    normalize='index',
) * 100
fuel_claim_rate = fuel_claims[1].sort_values(ascending=False)

plt.figure(figsize=(10, 6))
fuel_claim_rate.plot(kind='bar', color='lightblue')
plt.title('Claim Rate by Fuel Type', fontsize=12, pad=15)
plt.xlabel('Fuel Type')
plt.ylabel('Claim Rate (%)')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the plotted values for verification
print("\nClaim rates by fuel type:")
print(fuel_claim_rate)

In [None]:
# Split the raw vehicle age into five equally populated groups
age_group_labels = ['Very New', 'New', 'Medium', 'Old', 'Very Old']
original_df['vehicle_age_group'] = pd.qcut(original_df['vehicle_age'], q=5, labels=age_group_labels)

# Claim rate (%) per vehicle age group
age_claims = pd.crosstab(
    original_df['vehicle_age_group'],
    original_df['claim_status'],
    normalize='index',
) * 100
age_claim_rate = age_claims[1]

plt.figure(figsize=(10, 6))
age_claim_rate.plot(kind='bar', color='lightblue')
plt.title('Claim Rate by Vehicle Age', fontsize=12, pad=15)
plt.xlabel('Vehicle Age Group')
plt.ylabel('Claim Rate (%)')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the rates, highest first, for verification
print("\nClaim rates by vehicle age groups:")
print(age_claim_rate.sort_values(ascending=False))

In [None]:
# Safety Features Impact on Claims
safety_features = ['is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors',
                   'is_parking_camera', 'is_front_fog_lights', 'is_brake_assist']

# One column per feature; rows are the feature's values in the raw data and
# cells hold the claim rate (%) of the policies with that value.
safety_impact = pd.DataFrame({
    feature: pd.crosstab(original_df[feature], original_df['claim_status'], normalize='index')[1] * 100
    for feature in safety_features
})

# DataFrame.plot opens its own figure, so the size is passed here; a separate
# plt.figure(...) call would only leave an empty extra figure behind.
safety_impact.transpose().plot(kind='bar', color=['purple', 'lightblue'], figsize=(12, 6))
plt.title('Claim Rate by Safety Features', fontsize=12, pad=15)
plt.xlabel('Safety Feature')
plt.ylabel('Claim Rate (%)')
# Legend order follows the sorted row labels of safety_impact
# (assumes raw values 'No' / 'Yes' - TODO confirm).
plt.legend(['Without Feature', 'With Feature'])
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Numerical columns (plus the target) that go into the correlation analysis
numerical_features = [
    'vehicle_age', 'customer_age', 'subscription_length', 'max_torque',
    'max_power', 'displacement', 'turning_radius', 'length', 'width',
    'gross_weight', 'claim_status',
]

# Pairwise correlations between the selected columns
correlation_matrix = df[numerical_features].corr()

# Annotated heatmap, color scale centered on zero
heatmap_options = dict(annot=True, cmap='coolwarm', center=0, fmt='.2f', square=True)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Correlation Matrix of Numerical Features with Claim Status', pad=20)
plt.tight_layout()
plt.show()

# List every column's correlation with the target, strongest positive first
print("\nCorrelation with claim_status:")
claim_correlations = correlation_matrix['claim_status'].sort_values(ascending=False)
print(claim_correlations)

In [None]:
# Claim rate (%) per region density value, computed on the raw data
region_claims = pd.crosstab(
    original_df['region_density'],
    original_df['claim_status'],
    normalize='index',
) * 100
density_claim_rate = region_claims[1].sort_index()

plt.figure(figsize=(10, 6))
density_claim_rate.plot(kind='bar', color='violet')
plt.title('Claim Rate by Region Density', fontsize=12, pad=15)
plt.xlabel('Region Density')
plt.ylabel('Claim Rate (%)')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the plotted values for verification
print("\nClaim rates by region density:")
print(density_claim_rate)

In [None]:
# Claims by Region Code: claim rate (%) per encoded region code
plt.figure(figsize=(12, 6))
region_code_claims = pd.crosstab(df['region_code'], df['claim_status'], normalize='index') * 100
region_code_claims[1].sort_index().plot(kind='bar', color='violet')
plt.title('Claim Rate by Region Code', fontsize=12, pad=15)
plt.xlabel('Region Code')
plt.ylabel('Claim Rate (%)')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the claim rates by region code for verification
# (the label previously said "region density", copied from the density cell)
print("\nClaim rates by region code:")
print(region_code_claims[1].sort_index())

In [None]:
# Claim rate (%) for every (region_code, region_density) combination
region_keys = [df['region_code'], df['region_density']]
region_combined = pd.crosstab(region_keys, df['claim_status'], normalize='index') * 100
print("\nClaim rates by Region Code and Density:")
highest_combined = region_combined[1].sort_values(ascending=False).head(10)
print(highest_combined)

# Number of policies per region code, largest region first
region_size = df['region_code'].value_counts()

plt.figure(figsize=(12, 6))
region_size.plot(kind='bar', color='violet')
plt.title('Number of Policies by Region Code', fontsize=12, pad=15)
plt.xlabel('Region Code')
plt.ylabel('Number of Policies')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Text summary of the regional analysis
from scipy.stats import chi2_contingency

print("\nRegional Analysis Summary:")
print("-" * 50)

# Regions with the highest claim rate
print("\nTop 5 regions by claim rate:")
top_regions = region_code_claims[1].sort_values(ascending=False).head(5)
print(top_regions)

# How many policies fall on each region density value
print("\nDistribution of policies across region densities:")
density_dist = df['region_density'].value_counts()
print(density_dist)

# Chi-square test of independence between region code and claim status,
# based on the raw (non-normalized) contingency counts
contingency_table = pd.crosstab(df['region_code'], df['claim_status'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print("\nChi-square test for independence between region and claims:")
print(f"Chi-square statistic: {chi2:.2f}")
print(f"p-value: {p_value:.4f}")
if p_value < 0.05:
    verdict = "There is a significant relationship between region and claims"
else:
    verdict = "No significant relationship between region and claims"
print("Interpretation: " + verdict)

# Per-region table: claim rate, number of policies and mean region density,
# ordered by claim rate
risk_factors = pd.DataFrame({
    'claim_rate': region_code_claims[1],
    'policy_count': df['region_code'].value_counts(),
    'avg_density': df.groupby('region_code')['region_density'].mean()
})
risk_factors = risk_factors.sort_values('claim_rate', ascending=False)

print("\nRegional Risk Factor Analysis:")
print(risk_factors.head(10))

In [None]:
# Reload the raw data to recover the region codes as they appear in the CSV
original_df = pd.read_csv('data.csv')

# unique() keeps order of first appearance, so position i of both arrays
# refers to the same region.
# NOTE(review): relies on df and original_df having identical row order.
raw_region_codes = original_df['region_code'].unique()
encoded_region_codes = df['region_code'].unique()

# Side-by-side table of raw code and the value used in df
mapping_df = pd.DataFrame({
    'Original_Region_Code': raw_region_codes,
    'Standardized_Value': encoded_region_codes
})

# Order by the raw code for readability
mapping_df = mapping_df.sort_values('Original_Region_Code')

print("Mapping between Original and Standardized Region Codes:")
print(mapping_df)

In [None]:
# Print summary statistics for each analysis
print("\nSummary of Claims Analysis:")
print("-" * 50)

print("\nTop 3 fuel types by claim rate:")
print(fuel_claims[1].sort_values(ascending=False).head(3))

print("\nVehicle age group claim rates:")
print(age_claims[1].sort_values(ascending=False))

print("\nRegion density claim rates:")
print(region_claims[1].sort_values(ascending=False))

print("\nSafety features impact on claim rates:")
for feature in safety_features:
    # Scale to percent so the figure matches the '%' sign printed below
    # (the crosstab itself yields fractions between 0 and 1).
    feature_impact = pd.crosstab(df[feature], df['claim_status'], normalize='index')[1] * 100
    if len(feature_impact) < 2:
        # A feature with a single value offers nothing to compare against
        print(f"{feature}: only one value present, impact cannot be compared")
        continue
    # First row minus second row of the sorted index, i.e. 0 (without) vs 1
    # (with) for the 1/0 flag columns.
    # NOTE(review): not meaningful for columns with more than two values.
    reduction = feature_impact.iloc[0] - feature_impact.iloc[1]
    print(f"{feature}: {'Reduces' if reduction > 0 else 'Increases'} claim rate by {abs(reduction):.2f}%")

## Risk Segmentation

# Task 1: Customer Segmentation

In [None]:
# Generic Functions for future use

def CalculteOptimalClusters(data, max_clusters: int, sample_size: float, columns_names: list = None, print: bool = False):
    """Choose the KMeans cluster count with the highest silhouette score.

    KMeans is fitted once for every candidate in ``range(2, max_clusters)``
    (so ``max_clusters`` itself is not tried) on a random sample of ``data``.

    Args:
        data: DataFrame holding the features to cluster.
        max_clusters: Exclusive upper bound of the candidate cluster counts;
            must be greater than 2.
        sample_size: Fraction of rows to sample (passed to ``sample(frac=...)``);
            scoring the full data set is very slow.
        columns_names: Optional subset of columns to use; all columns if None.
        print: When True, plot the score of every candidate. The name shadows
            the builtin but is kept because callers pass it by keyword.

    Returns:
        The candidate cluster count with the best silhouette score; the
        smallest such count on ties.

    Raises:
        ValueError: If ``max_clusters`` leaves no candidate to evaluate.
    """
    if max_clusters <= 2:
        # Without this check the failure would be an opaque "max() arg is an
        # empty sequence" at the very end.
        raise ValueError(
            f"max_clusters must be greater than 2 to have candidates to compare, got {max_clusters}"
        )
    cluster_range = range(2, max_clusters)

    subset = data if columns_names is None else data[columns_names]
    # Fixed seed keeps the sample, and therefore the chosen count, reproducible
    subset = subset.sample(frac=sample_size, random_state=42)

    silhouette_scores = []
    for num_clusters in cluster_range:
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(subset)
        silhouette_scores.append(silhouette_score(subset, cluster_labels))

    if print:   # If user wants to print the graph
        PlotSilhouetteScore(cluster_range, silhouette_scores)

    best_position = silhouette_scores.index(max(silhouette_scores))
    return cluster_range[best_position]



def PlotSilhouetteScore(cluster_range, silhouette_scores):
    """Show a line chart of silhouette score against number of clusters.

    Args:
        cluster_range: The cluster counts that were evaluated (x axis).
        silhouette_scores: The score obtained for each count (y axis).
    """
    _, axes = plt.subplots(figsize=(10, 8))
    axes.plot(cluster_range, silhouette_scores, marker='o')
    axes.set_title('Silhouette Score Method for Optimal Number of Clusters')
    axes.set_xlabel('Number of Clusters')
    axes.set_ylabel('Silhouette Score')
    plt.show()

def PlotClusters(data, x_col, y_col, cluster_col, title, xlabel, ylabel, palette='viridis'):
    """Show a scatter plot of two columns, colored by a cluster column.

    Args:
        data: DataFrame containing ``x_col``, ``y_col`` and ``cluster_col``.
        x_col: Column plotted on the x axis.
        y_col: Column plotted on the y axis.
        cluster_col: Column whose values select the point color; also used as
            the legend title.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        palette: Seaborn palette used for the cluster colors.
    """
    _, axes = plt.subplots(figsize=(12, 8))
    sns.scatterplot(data=data, x=x_col, y=y_col, hue=cluster_col, palette=palette, ax=axes)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.legend(title=cluster_col)
    plt.show()


def CreateCluster(data, num_clusters, cluster_category: str, list_columns: list = None):
    """Fit KMeans and store the cluster label of every row in ``data``.

    Args:
        data: DataFrame to cluster; it is modified in place.
        num_clusters: Number of clusters to fit.
        cluster_category: Name of the column that receives the labels
            (created or overwritten).
        list_columns: Feature columns to cluster on. Defaults to every column
            of ``data`` except ``cluster_category``.

    Returns:
        None. The labels are written to ``data[cluster_category]``.
    """
    if list_columns is None:
        # Leave the output column out of the features: when the cell is
        # re-run, the labels of the previous run would otherwise be fed back
        # into the clustering.
        list_columns = [name for name in data.columns if name != cluster_category]
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    data[cluster_category] = kmeans.fit_predict(data[list_columns])


None of the features has a high correlation with claim_status.

In [None]:
# Columns used for the risk segmentation.
# .copy() gives risk_profile its own data: the clustering cells add columns to
# it, which on a plain slice of df triggers SettingWithCopyWarning.
risk_profile = df[['vehicle_age', 'max_power', 'customer_age', 'region_code', 'region_density', 'subscription_length', 'claim_status',
                   'horse_power', 'rpm']].copy()
correlation_matrix = risk_profile.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
# Title now names the data actually shown (it previously said "Diabetes Dataset")
plt.title('Correlation Matrix of Risk Profile Features')
plt.show()

In [None]:
# Baseline clustering over every column of risk_profile: pick the best cluster
# count on a 25% sample (plotting the scores), then label all rows.
num_clusters = CalculteOptimalClusters(
    data=risk_profile,
    max_clusters=10,
    sample_size=0.25,
    print=True,
)
CreateCluster(data=risk_profile, num_clusters=num_clusters, cluster_category='risk_cluster')

In [None]:
# Baseline clusters viewed in the customer age / vehicle age plane
PlotClusters(risk_profile, 'customer_age', 'vehicle_age', 'risk_cluster', 'Clusters of Risk Profiles', 'Customer Age', 'Vehicle Age')

# Clusters built from customer age and vehicle age only.
# The cluster count is tuned on the same two columns that are clustered
# (it was previously tuned on customer_age / horse_power).
num_clusters = CalculteOptimalClusters(risk_profile[['customer_age', 'vehicle_age']], 10, 0.25, print=True)
CreateCluster(risk_profile, num_clusters, 'ca-va_cluster', ['customer_age', 'vehicle_age'])
PlotClusters(risk_profile, 'customer_age', 'vehicle_age', 'ca-va_cluster', 'Clusters of Risk Profiles', 'Customer Age', 'Vehicle Age')

In [None]:
# Baseline clusters viewed in the customer age / horse power plane
PlotClusters(
    data=risk_profile, x_col='customer_age', y_col='horse_power', cluster_col='risk_cluster',
    title='Clusters of Risk Profiles', xlabel='Customer Age', ylabel='Horse Power',
)

# Clusters built from customer age and horse power only
num_clusters = CalculteOptimalClusters(
    data=risk_profile[['customer_age', 'horse_power']], max_clusters=10, sample_size=0.25, print=True,
)
CreateCluster(
    data=risk_profile, num_clusters=num_clusters, cluster_category='ca-hp_cluster',
    list_columns=['customer_age', 'horse_power'],
)
PlotClusters(
    data=risk_profile, x_col='customer_age', y_col='horse_power', cluster_col='ca-hp_cluster',
    title='Clusters of Risk Profiles', xlabel='Customer Age', ylabel='Horse Power',
)

In [None]:
# Baseline clusters viewed in the customer age / region code plane
PlotClusters(
    data=risk_profile, x_col='customer_age', y_col='region_code', cluster_col='risk_cluster',
    title='Clusters of Risk Profiles', xlabel='Customer Age', ylabel='Region Code',
)

# Clusters built from customer age and region code only
num_clusters = CalculteOptimalClusters(
    data=risk_profile[['customer_age', 'region_code']], max_clusters=10, sample_size=0.25, print=True,
)
CreateCluster(
    data=risk_profile, num_clusters=num_clusters, cluster_category='ca-rc_cluster',
    list_columns=['customer_age', 'region_code'],
)
PlotClusters(
    data=risk_profile, x_col='customer_age', y_col='region_code', cluster_col='ca-rc_cluster',
    title='Clusters of Risk Profiles', xlabel='Customer Age', ylabel='Region Code',
)

In [None]:
# Baseline clusters viewed in the customer age / region density plane
PlotClusters(
    data=risk_profile, x_col='customer_age', y_col='region_density', cluster_col='risk_cluster',
    title='Clusters of Risk Profiles', xlabel='Customer Age', ylabel='Region Density',
)

# Clusters built from customer age and region density only
num_clusters = CalculteOptimalClusters(
    data=risk_profile[['customer_age', 'region_density']], max_clusters=10, sample_size=0.25, print=True,
)
CreateCluster(
    data=risk_profile, num_clusters=num_clusters, cluster_category='ca-rd_cluster',
    list_columns=['customer_age', 'region_density'],
)
PlotClusters(
    data=risk_profile, x_col='customer_age', y_col='region_density', cluster_col='ca-rd_cluster',
    title='Clusters of Risk Profiles', xlabel='Customer Age', ylabel='Region Density',
)

In [None]:
# Baseline clusters viewed in the customer age / rpm plane.
# The x axis shows customer_age, so it is labelled 'Customer Age'
# (it previously read 'Vehicle Age').
PlotClusters(risk_profile, 'customer_age', 'rpm', 'risk_cluster', 'Clusters of Risk Profiles', 'Customer Age', 'RPM')

# Clusters built from customer age and rpm only
num_clusters = CalculteOptimalClusters(risk_profile[['customer_age', 'rpm']], 10, 0.25, print=True)
CreateCluster(risk_profile, num_clusters, 'ca-rpm_cluster', ['customer_age', 'rpm'])
PlotClusters(risk_profile, 'customer_age', 'rpm', 'ca-rpm_cluster', 'Clusters of Risk Profiles', 'Customer Age', 'RPM')

# Task 2: Anomaly Detection

In [None]:
# Work on a copy so the anomaly analysis leaves df untouched
data = df.copy()

# Features of interest for the anomaly analysis
safety_features = [
    'airbags',
    'is_parking_sensors', 'is_parking_camera', 'is_front_fog_lights',
    'is_central_locking', 'is_speed_alert', 'rear_brakes_type',
    'model', 'transmission_type', 'steering_type', 'is_brake_assist', 'is_power_steering',
    'ncap_rating'
]

# Make sure every feature is an integer column: remaining 'Yes'/'No' strings
# become 1/0, then the column is cast to int
for feature in safety_features:
    as_flags = data[feature].replace({'Yes': 1, 'No': 0})
    data[feature] = as_flags.astype(int)

In [None]:
# `data` arrives from the preceding cell with the safety features already
# converted to integers, so the conversion is not repeated here.

# Count, for every feature, the rows that an Isolation Forest fitted on the
# pair (feature, claim_status) flags as anomalous.
# NOTE(review): contamination=0.05 asks the model to flag roughly 5% of the
# rows for every feature, so the counts may differ very little - confirm this
# ranking is meaningful.
anomaly_rows = []
for feature in safety_features:
    df_subset = data[[feature, 'claim_status']].copy()
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_subset)

    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    df_subset['anomaly'] = iso_forest.fit_predict(X_scaled)

    anomaly_count = int((df_subset['anomaly'] == -1).sum())  # -1 label indicates anomalies
    anomaly_rows.append({'Feature': feature, 'Anomaly_Count': anomaly_count})

# Build the result frame once: concatenating onto an empty frame inside the
# loop is slow, yields object-typed counts and is deprecated in recent pandas.
anomaly_counts = pd.DataFrame(anomaly_rows, columns=['Feature', 'Anomaly_Count'])
anomaly_counts = anomaly_counts.sort_values(by='Anomaly_Count', ascending=False)

# Display the top 3 features with the highest anomalies when compared to the target variable claim_status
top_3_features = anomaly_counts.head(3)
print("Top 3 Safety Features with Highest Anomalies:")
print(top_3_features)

In [None]:
# Five features with the most flagged rows
ranked_counts = anomaly_counts.sort_values(by='Anomaly_Count', ascending=False)
top_5_features = ranked_counts.head(5)
print("Top 5 Safety Features with Highest Anomalies:")
print(top_5_features)

# Bar chart of the same five rows
plt.figure(figsize=(10, 6))
sns.barplot(data=top_5_features, x='Feature', y='Anomaly_Count', palette='viridis')
plt.title('Top 5 Safety Features with Highest Anomalies')
plt.xlabel('Safety Feature')
plt.ylabel('Number of Anomalies')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Preprocessing: work on a copy and keep only rows that have every column
# needed below
data = df.copy()
data['region_code'] = data['region_code'].astype(int)

features = ['steering_type', 'is_parking_sensors', 'ncap_rating', 'model']
data = data.dropna(subset=features + ['region_code', 'claim_status'])

# Only relevant when the column still holds raw 'Yes'/'No' strings
if data['is_parking_sensors'].dtype == object:
    data['is_parking_sensors'] = data['is_parking_sensors'].replace({'Yes': 1, 'No': 0}).astype(int)

# Integer-encode the categorical features, keeping the fitted encoders
categorical_features = ['steering_type', 'model']
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le

data['ncap_rating'] = data['ncap_rating'].astype(float)

# Loop through features and regions - count the DBSCAN outliers (label -1) of
# every (region, feature) combination on the pair (feature, claim_status)
regions = data['region_code'].unique()
# Filter each region once instead of once per feature
region_frames = [(region, data[data['region_code'] == region]) for region in regions]

region_details = []
for feature in features:
    for region, region_data in region_frames:
        if len(region_data) < 5:
            # Fewer rows than min_samples: nothing meaningful to cluster
            continue
        X = region_data[[feature, 'claim_status']]
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        dbscan = DBSCAN(eps=0.5, min_samples=5)
        labels = dbscan.fit_predict(X_scaled)
        num_outliers = np.sum(labels == -1)
        region_details.append({'Region': region, 'Feature': feature, 'Outliers': num_outliers})

# Explicit columns keep the sort below working even if every region was skipped
region_feature_outliers = pd.DataFrame(region_details, columns=['Region', 'Feature', 'Outliers'])
top_10_region_features = region_feature_outliers.sort_values(by='Outliers', ascending=False).head(10)
print("Top 10 Region-Feature Relationships with Highest Outliers:")
print(top_10_region_features)


# Visualize the ten combinations with the most outliers
plt.figure(figsize=(12, 8))
sns.barplot(x='Region', y='Outliers', hue='Feature', data=top_10_region_features, palette='viridis')
plt.title('Top 10 Region-Feature Relationships with Highest Outliers')
plt.xlabel('Region')
plt.ylabel('Number of Outliers')
plt.legend(title='Feature')
plt.show()

## Predictive Modeling

# Task 1: Classification Model

In [None]:
# NOTE: This cell is disabled. Everything between the triple quotes is a plain
# string literal, so none of the code below runs. If re-enabled it imports
# tensorflow, imblearn and shap, selects the model features from df, and makes
# an 80/20 train/test split with the scaler fitted on the training part only.
'''
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import shap

# Select features for the model
features = ['vehicle_age', 'customer_age', 'subscription_length', 'max_torque',
           'max_power', 'displacement', 'turning_radius', 'length', 'width', 
           'gross_weight', 'region_code', 'region_density']

X = df[features]
y = df['claim_status']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
'''

In [None]:
# NOTE: This cell is disabled (the code is wrapped in a string literal and
# does not run). If re-enabled it needs X_train, X_train_scaled and y_train
# from the data-preparation cell, oversamples the training set with SMOTE and
# trains a small dense network with early stopping on the validation loss.
'''
# Create a balanced dataset
sampler = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train_scaled, y_train)

# Train the model with balanced data
model = Sequential([
    Dense(64, activation='relu', input_dim=X_train.shape[1]),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Set up early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Train with balanced data
history = model.fit(
    X_train_balanced, y_train_balanced,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)
'''

In [None]:
# NOTE: This cell is disabled (the code is wrapped in a string literal and
# does not run). If re-enabled it needs `history` from the training cell and
# plots training/validation loss and accuracy per epoch side by side.
'''
# Plot training history
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Plot loss
ax1.plot(history.history['loss'], label='Training Loss')
ax1.plot(history.history['val_loss'], label='Validation Loss')
ax1.set_title('Model Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()

# Plot accuracy
ax2.plot(history.history['accuracy'], label='Training Accuracy')
ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax2.set_title('Model Accuracy')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()

plt.tight_layout()
plt.show()
'''

In [None]:
# NOTE: This cell is disabled (the code is wrapped in a string literal and
# does not run). If re-enabled it needs `model`, X_train and X_train_scaled,
# computes SHAP values on the first 100 scaled training rows and plots the
# mean absolute SHAP value per feature.
# NOTE(review): the plot title says 'Top 10' although every feature in
# X_train.columns is plotted - confirm before re-enabling.
'''
# Calculate SHAP values for feature importance
explainer = shap.DeepExplainer(model, X_train_scaled[:100])
shap_values = explainer.shap_values(X_train_scaled[:100])

# Calculate feature importance with proper reshaping
feature_importance = np.abs(shap_values[0]).mean(0)
feature_importance = feature_importance * np.ones(len(X_train.columns))

# Create DataFrame with lists instead of arrays
feature_importance_df = pd.DataFrame({
    'Feature': list(X_train.columns),
    'Importance': list(feature_importance)
})

# Sort values
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

# Plot
plt.figure(figsize=(12, 6))
sns.barplot(
    data=feature_importance_df,
    x='Importance',
    y='Feature'
)
plt.title('Top 10 Most Important Features')
plt.xlabel('Mean |SHAP value|')
plt.tight_layout()
plt.show()
'''

In [None]:
# NOTE: This cell is disabled (the code is wrapped in a string literal and
# does not run). If re-enabled it needs `model`, X_test_scaled and y_test and
# prints the loss and accuracy on the held-out test set.
'''
# Evaluate model on test data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")
'''

# Task 2: Model Evaluation

In [None]:
# NOTE: This cell is disabled (the code is wrapped in a string literal and
# does not run). If re-enabled it needs `model`, X_test_scaled and y_test,
# thresholds the predicted probabilities at 0.5, and shows the
# precision-recall curve, the ROC curve with its AUC, the confusion matrix
# and a classification report.
'''
from sklearn.metrics import precision_recall_curve, roc_curve, auc, confusion_matrix

# Get model predictions on test set
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba > 0.5).astype(int)

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, label=f'PR curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()

# ROC Curve and AUC Score
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.grid(True)
plt.show()

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Print detailed metrics
from sklearn.metrics import classification_report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
'''

## Association

# Task 1: Association Rule Mining

In [None]:

# Binning: turn the continuous columns into quantile-based categories so they
# can be one-hot encoded as items
df['region_density_bin'] = pd.qcut(df['region_density'], q=3, labels=["Low", "Medium", "High"])
df['horse_power_bin'] = pd.qcut(df['horse_power'], q=3, labels=["LowHP", "MedHP", "HighHP"])
# NOTE(review): two labels for q=3 only work when duplicates='drop' removes
# exactly one bin edge; with any other number of distinct edges qcut raises a
# ValueError - confirm against the rpm distribution.
df['rpm_bin'] = pd.qcut(df['rpm'], q=3, labels=["LowRPM", "HighRPM"], duplicates='drop')
df['displacement_bin'] = pd.qcut(df['displacement'], q=3, labels=["SmallDisp", "MedDisp", "LargeDisp"])

cols_of_interest = [
    'region_density_bin', 'model', 'rear_brakes_type',
    'transmission_type', 'horse_power_bin', 'rpm_bin',
    'ncap_rating', 'displacement_bin', 'claim_status'
]
df_sub = df[cols_of_interest]

# One-hot encode every selected column (the target included, so claim status
# can appear in the rules). The column list is reused instead of being typed
# twice, and dtype=bool gives apriori the boolean frame it expects regardless
# of the pandas version's default dummy dtype.
df_encoded = pd.get_dummies(df_sub, columns=cols_of_interest, dtype=bool)

# Itemsets present in at least 1% of the rows
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

In [None]:
# Generating Rules: one rule set per ranking metric
lift_rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
confidence_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
support_rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01)

# Render the item sets of all three tables as readable text. Previously only
# lift_rules was converted, so the other two tables displayed raw frozensets;
# sorting makes the item order stable between runs.
for rules in (lift_rules, confidence_rules, support_rules):
    for side in ('antecedents', 'consequents'):
        rules[side] = rules[side].apply(lambda items: ', '.join(sorted(items)))

report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

# Top 10 Rules by Lift
top_lift = lift_rules.sort_values(by='lift', ascending=False).head(10)
print("Top 10 Rules by Lift:")
display(top_lift[report_columns])

# Top 10 Rules by Confidence
top_confidence = confidence_rules.sort_values(by='confidence', ascending=False).head(10)
print("\nTop 10 Rules by Confidence:")
display(top_confidence[report_columns])

# Top 10 Rules by Support
top_support = support_rules.sort_values(by='support', ascending=False).head(10)
print("\nTop 10 Rules by Support:")
display(top_support[report_columns])

# Task 2: Sequential Pattern Analysis   

In [None]:
# Mean claim status (i.e. claim rate) for every distinct subscription length
avg_claim_rate = df.groupby('subscription_length')['claim_status'].mean().reset_index()

plt.figure(figsize=(10, 6))
plt.plot(avg_claim_rate['subscription_length'], avg_claim_rate['claim_status'], marker='o')
plt.xlabel('Subscription Length (Years)')
plt.ylabel('Average Claim Rate')
plt.title('Claim Pattern by Subscription Length')
plt.grid(True)
plt.show()

# Same analysis on one-unit-wide bins.
# include_lowest=True keeps a subscription length of exactly 0 in the first
# bin; without it the interval is (0, 1] and those rows become NaN.
# NOTE(review): values above 5 still fall outside the bins and are dropped -
# confirm the maximum subscription length in the data.
df['subscription_length_bin'] = pd.cut(
    df['subscription_length'],
    bins=[0, 1, 2, 3, 4, 5],
    labels=['0-1', '1-2', '2-3', '3-4', '4-5'],
    include_lowest=True,
)
# observed=False states the behaviour explicitly (empty bins stay in the
# result) and avoids the pandas warning about the changing default.
avg_claim_rate_bin = df.groupby('subscription_length_bin', observed=False)['claim_status'].mean().reset_index()

plt.bar(avg_claim_rate_bin['subscription_length_bin'], avg_claim_rate_bin['claim_status'])
plt.xlabel('Subscription Length (Years)')
plt.ylabel('Average Claim Rate')
plt.title('Claim Pattern by Binned Subscription Length')
plt.show()

High-Risk Feature Combinations
Taken from the analysis of association rules with respect to claim status.
By support (columns: antecedent, consequent, support):
    rear_brakes_type_1	& transmission_type_1	0.651642	
    transmission_type_1	rear_brakes_type_1	0.651642	
    rear_brakes_type_1	rpm_bin_HighRPM	0.626178
    transmission_type_1	rear_brakes_type_1	0.651642	
    rear_brakes_type_1	rpm_bin_HighRPM	0.626178	
    rpm_bin_HighRPM	rear_brakes_type_1	0.626178