# Import

In [1]:
import sys
import os
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler



# Import custom file

In [2]:
import sys
sys.path.append(os.path.abspath('../scripts'))
import data_processing as dp
import model as m

# Data loading & cleaning

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
file_path = r"C:\Users\user\Desktop\Project\alpha-care-insurance\data\machine.csv"
df = pd.read_csv(file_path, sep="\t", engine="python")
print("Columns:", df.columns.tolist())


Columns: ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims']


# missing value

In [13]:
df.isnull().sum()

UnderwrittenCoverID               0
PolicyID                      26716
TransactionMonth              26716
IsVATRegistered               26716
Citizenship                   24947
LegalType                     26716
Title                         26716
Language                      26716
Bank                         168236
AccountType                   65451
MaritalStatus                 34975
Gender                        36252
Country                       26716
Province                      26716
PostalCode                    26716
MainCrestaZone                26716
SubCrestaZone                 26716
ItemType                      26716
mmcode                        27268
VehicleType                   27268
RegistrationYear              26716
make                          27268
Model                         27268
Cylinders                     27268
cubiccapacity                 27268
kilowatts                     27268
bodytype                      27268
NumberOfDoors               

In [14]:
df.columns = df.columns.str.strip().str.replace('\n','').str.replace('\r','').str.replace(' ', '_')
print("Columns:", df.columns.tolist())
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'], errors='coerce')
df = df.dropna(subset=['TransactionMonth'])

monthly = df.groupby(df['TransactionMonth'].dt.to_period('M')).agg({
    "TotalClaims": "sum",
    "TotalPremium": "sum"
}).reset_index()

monthly['TransactionMonth'] = monthly['TransactionMonth'].dt.to_timestamp()

Columns: ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims']


# Encoding

In [22]:
df.columns = df.columns.str.strip().str.replace('\n','').str.replace('\r','').str.replace(' ', '_')


In [33]:
# CLEAN COLUMN NAMES
df.columns = (
    df.columns
    .str.strip()
    .str.replace('\n', '')
    .str.replace('\r', '')
    .str.replace(' ', '_')
)

# AUTO-DETECT CATEGORICAL COLUMNS
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# SPLIT INTO SMALL AND LARGE CATEGORICAL
small_cat = [col for col in categorical_cols if df[col].nunique() <= 50]
large_cat = [col for col in categorical_cols if df[col].nunique() > 50]

print("Small Category Columns:", small_cat)
print("Large Category Columns:", large_cat)

# LABEL ENCODE LARGE CARDINALITY COLUMNS
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df_enc = df.copy()

for col in large_cat:
    df_enc[col] = le.fit_transform(df_enc[col].astype(str))

# ONE-HOT ENCODE SMALL COLUMNS
df_final = pd.get_dummies(df_enc, columns=small_cat, drop_first=True)

print("Encoding completed successfully!")
print("Final shape:", df_final.shape)


Small Category Columns: ['IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'VehicleType', 'make', 'bodytype', 'AlarmImmobiliser', 'TrackingDevice', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'TermFrequency', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType']
Large Category Columns: ['UnderwrittenCoverID', 'Model', 'VehicleIntroDate']
Encoding completed successfully!
Final shape: (973382, 245)


# Scaling

In [35]:
columns_scaler = ['TotalClaims']
df_scaled = dp.scaler('minMaxScaler', df_label, columns_scaler)
df_scaled

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,31158,12827.0,2015-03-01,1,2,0,2,0,2,0,...,10,24,19,5,1,2,0,0,21.929825,0.029629
1,31158,12827.0,2015-05-01,1,2,0,2,0,2,0,...,10,24,19,5,1,2,0,0,21.929825,0.029629
2,31158,12827.0,2015-07-01,1,2,0,2,0,2,0,...,10,24,19,5,1,2,0,0,0.000000,0.029629
3,31161,12827.0,2015-05-01,1,2,0,2,0,2,0,...,1,16,12,5,1,2,0,0,512.848070,0.029629
4,31161,12827.0,2015-07-01,1,2,0,2,0,2,0,...,1,16,12,5,1,2,0,0,0.000000,0.029629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000093,78811,389.0,2015-04-01,0,4,1,2,0,0,1,...,12,21,17,5,1,1,0,0,347.235175,0.029629
1000094,78811,389.0,2015-06-01,0,4,1,2,0,0,1,...,12,21,17,5,1,1,0,0,347.235175,0.029629
1000095,78811,389.0,2015-08-01,0,4,1,2,0,0,1,...,12,21,17,5,1,1,0,0,347.235175,0.029629
1000096,78810,389.0,2014-07-01,0,4,1,2,0,0,1,...,12,17,13,5,1,1,0,0,2.315000,0.029629


# train model

In [52]:
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(random_state=42)
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse')

models = [lr_model, dt_model, rfr_model, xgb_model]
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'XGBoost']

# Train & test

In [51]:
X = df_scaled.drop('TotalClaims', axis=1)
y = df_scaled['TotalClaims']

X_train, X_test, y_train, y_test = m.split_data(X, y)
X_train


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CalculatedPremiumPerTerm,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium
18489,52352,15781.0,2015-04-01,0,2,1,2,0,11,3,...,203.4992,12,7,6,6,0,1,0,0,0.000000
615208,78090,814.0,2014-06-01,0,0,1,2,0,11,0,...,3.4045,12,13,11,5,1,1,0,0,2.986404
19787,65098,21425.0,2015-05-01,0,2,1,2,0,11,3,...,1020.6811,7,14,12,5,1,1,0,0,317.699267
169564,97937,6392.0,2014-12-01,0,0,1,2,0,0,1,...,0.7728,12,19,15,5,1,1,0,0,0.677895
15136,56664,19181.0,2015-07-01,0,2,1,2,0,11,3,...,25.0000,11,25,19,5,1,1,0,0,21.929825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269398,64035,21289.0,2015-05-01,0,0,1,2,0,2,0,...,2.7555,12,19,15,5,1,1,0,0,1.247538
377644,29359,12450.0,2015-05-01,0,0,1,2,0,2,0,...,3.7909,12,13,11,5,1,1,0,0,3.325351
137046,85717,4655.0,2014-12-01,0,0,1,2,0,0,1,...,4.1442,12,13,11,5,1,1,0,0,3.635263
691813,58615,20414.0,2014-11-01,0,0,1,2,0,10,1,...,2.5097,12,17,13,5,1,1,0,0,2.201491


# Evalution

In [49]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(random_state=42)
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse')

models = [lr_model, dt_model, rfr_model, xgb_model]
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'XGBoost']


    
    

In [66]:
df.columns = df.columns.str.strip().str.replace('\n','').str.replace('\r','').str.replace(' ', '_')


# Separate numeric and categorical columns


In [83]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [84]:
y = df['TotalClaims']


# test

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# train model

In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------
# Load your dataset
# -----------------------------
r"C:\Users\user\Desktop\Project\alpha-care-insurance\data\machine.csv"
print(df.head())

# -----------------------------
# Define target and features
# -----------------------------
target_column = 'TotalClaims'  
X = df.drop(columns=[TotalClaims])
y = df[TotalClaims]

# -----------------------------
# Identify categorical and numeric columns
# -----------------------------
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# -----------------------------
# Preprocessing: handle NaNs and encode
# -----------------------------
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_imputer, numeric_cols),
        ('cat', Pipeline([
            ('imputer', categorical_imputer),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)

# -----------------------------
# Split data
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# Define models
# -----------------------------
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse')
}

# -----------------------------
# Train models and evaluate
# -----------------------------
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[name] = {'MAE': mae, 'MSE': mse, 'R2': r2}
    print(f"{name} trained successfully.")

# -----------------------------
# Print evaluation results
# -----------------------------
for name, metrics in results.items():
    print(f"\n{name} Evaluation:")
    print(f" - MAE: {metrics['MAE']}")
    print(f" - MSE: {metrics['MSE']}")
    print(f" - R2: {metrics['R2']}")


NameError: name 'df' is not defined

In [None]:
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(random_state=42)
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse')

models = [lr_model, dt_model, rfr_model, xgb_model]
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'XGBoost']
for model in models:
    model.fit(X_train, y_train)


# evaluate

In [None]:
mae_scores, mse_scores, r2_scores = [], [], []

for model in models:
    y_pred = model.predict(X_test)
    mae_scores.append(mean_absolute_error(y_test, y_pred))
    mse_scores.append(mean_squared_error(y_test, y_pred))
    r2_scores.append(r2_score(y_test, y_pred))

# Print results
for i, name in enumerate(model_names):
    print(f"Evaluation results for {name}:")
    print(f" - Mean Absolute Error (MAE): {mae_scores[i]:.4f}")
    print(f" - Mean Squared Error (MSE): {mse_scores[i]:.4f}")
    print(f" - R-squared (R2) Score: {r2_scores[i]:.4f}\n")

# bar chart 

In [None]:
m.plot_metrics(models, mae_scores, mse_scores, r2_scores)

# decision tree visualization

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 10))
plot_tree(decision_tree=dt_model, feature_names=X_train.columns, filled=True, rounded=True)
plt.show()

# random forest visualization

In [None]:
tree_index = 0
single_tree = rfr_model.estimators_[tree_index]

plt.figure(figsize=(20, 10))
plot_tree(single_tree, feature_names=X_train.columns, filled=True, rounded=True)
plt.show()

# demonstrates model diversity

In [None]:
# Shows first 3 trees → demonstrates model diversity.
for tree_index in range(3):
    single_tree = rfr_model.estimators_[tree_index]
    plt.figure(figsize=(20, 10))
    plot_tree(single_tree, feature_names=X_train.columns, filled=True, rounded=True)
    plt.title(f"Decision Tree {tree_index} from Random Forest")
    plt.show()


# feature 

In [None]:
def plot_feature_importance(model, feature_names, model_name):
    feature_importance = pd.DataFrame(model.feature_importances_, index=feature_names, columns=["Importance"])
    feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

    plt.figure(figsize=(10, 6))
    feature_importance.plot(kind='bar', legend=False, color='skyblue')
    plt.title(f'Feature Importance for {model_name}')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xticks(rotation=45)
    plt.show()

# Compares which features matter most across different models

In [None]:
plot_feature_importance(dt_model, X_train.columns, "Decision Tree")
plot_feature_importance(rfr_model, X_train.columns, "Random Forest")
plot_feature_importance(xgb_model, X_train.columns, "XGBoost")

# Hyperparameter model learns.

In [None]:

from sklearn.model_selection import GridSearchCV
# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],          # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],          # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],          # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],            # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]                # Whether bootstrap samples are used when building trees
}

# RandomForestRegressor


In [None]:
# Set up the grid search
grid_search = GridSearchCV(estimator=rfr_model, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='r2')
# Fit the grid search model to the training data
grid_search.fit(X_train, y_train)
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best R-squared Score: {grid_search.best_score_}")

#  best parameter

In [None]:
# Train the model using the best parameters
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

best_rfr_model = grid_search.best_estimator_

# Fit the best model on the training data
best_rfr_model.fit(X_train, y_train)

# Make predictions and evaluate the performance
y_pred_best = best_rfr_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

# Print the results
print(f"MAE: {mae_best}, MSE: {mse_best}, R2: {r2_best}")


# Visualizes one decision tree inside the tuned Random Forest

In [None]:
plt.figure(figsize=(20, 10))
plot_tree(single_tree, feature_names=X_train.columns, filled=True, rounded=True)
plt.show()

# PRINT DECISION RULES

In [None]:
from sklearn.tree import _tree

# Extract the best Random Forest model from GridSearchCV
best_rfr_model = grid_search.best_estimator_

# Extract a single decision tree from the Random Forest
tree_index = 0  # Index of the tree to visualize
single_tree = best_rfr_model.estimators_[tree_index]

# Function to display the decision tree criteria
def print_tree_criteria(tree, feature_names):
    # Access the tree structure
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    
    def recurse(node):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print(f"Node {node}: {name} <= {threshold:.2f}")
            recurse(tree_.children_left[node])
            print(f"Node {node}: {name} > {threshold:.2f}")
            recurse(tree_.children_right[node])
        else:
            print(f"Leaf node {node}: Predicted value {tree_.value[node]}")

    recurse(0)  # Start from the root node

# Print the criteria for each split in the selected tree
print_tree_criteria(single_tree, X_train.columns)