In [1]:
import os 
os.chdir("../")

### Load the data

In [None]:
import pandas as pd
df = pd.read_csv("data/dataset.csv")

### Take 10% of the data

In [3]:
# cd = df
# df = df.sample(frac=0.1, random_state=42)  


In [4]:
import pandas as pd

# Load the downloaded postal code data
postal_data = pd.read_csv("data/ZA.txt", sep='\t', header=None, names=['CountryCode', 'PostalCode', 'PlaceName', 'Admin1Name', 'Admin1Code', 'Admin2Name', 'Admin2Code', 'Admin3Name', 'Admin3Code', 'Latitude', 'Longitude', 'Accuracy'])

# Create a mapping of postal codes to city names
postal_code_to_city = dict(zip(postal_data['PostalCode'], postal_data['PlaceName']))
# Add a new column in your dataset by mapping the postal code to city names
df['CityName'] = df['PostalCode'].map(postal_code_to_city)
value_counts = df['CityName'].value_counts()
cities_to_replace = value_counts[value_counts < 2000].index.tolist()
df['CityName'] = df['CityName'].apply(lambda x: 'Other' if x in cities_to_replace else x)


### Selecting important columns

In [5]:
from scripts.data_preparation import feature_engineering


df = feature_engineering(df)

In [6]:

selected_columns = ['CityName','RiskFactor','VehicleAge', 'MainCrestaZone', 'SubCrestaZone', 'mmcode', 'Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType','TotalPremium', 'TotalClaims']

df = df[selected_columns]

### Data Preparation: 

In [7]:
from scripts.preprocessing import  replace_missing_with_mean, replace_missing_with_mode
 


numerical_cols = df.select_dtypes(include=['int64', 'float64'])
categorical_cols = df.select_dtypes(include='object')

df = replace_missing_with_mode(df,categorical_cols)
df = replace_missing_with_mean(df,numerical_cols.columns)



In [8]:
# Remove outliers
from scripts.preprocessing import remove_outliers


df = remove_outliers(df)




#### Encoding Categorical Data:
* Convert categorical data into a numeric format using one-hot encoding or label encoding to make it suitable for modeling.

In [9]:

selected_columns = ['CityName','RiskFactor','VehicleAge', 'Cylinders', 'kilowatts', 'CustomValueEstimate', 'CapitalOutstanding', 'NewVehicle', 'Rebuilt', 'Converted', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'Section', 'Product', 'StatutoryRiskType','TotalPremium', 'TotalClaims']

df = df[selected_columns]

In [10]:
from scripts.data_preparation import encode_categorical_data


df = encode_categorical_data(df)


#### Train-Test Split:
* Divide the data into a training set (for building the model) and a test set (for validating the model) using a 70:30 


In [11]:
from scripts.data_preparation import train_test_splitting


target_cols = ['TotalPremium', 'TotalClaims']
X_train, X_test, y_train, y_test = train_test_splitting(df, target_cols)

### Model Building
* Implement Linear Regression, Random Forests, and XGBoost models


In [None]:

# from scripts.models import train_and_evaluate_models


# results = train_and_evaluate_models(X_train, X_test, y_train['TotalPremium'], y_test['TotalPremium'])
# # Print results for each model
# for model_name, metrics in results.items():
#     print(f"{model_name}: MSE = {metrics['MSE']}, R2 = {metrics['R2']}")

In [37]:
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error, r2_score
import numpy as np
def evaluate_model(model, X_test, y_test):
    """
    Evaluate the model using Mean Squared Error and R-squared score.
    """

    y_pred = model.predict(X_test)

    # Convert one-hot encoded labels to class labels
    y_pred_labels = np.argmax(y_pred, axis=1)  # Predicted class labels
    y_true_labels = np.argmax(y_test, axis=1)     # True class labels

    # Now calculate the mse, r2, precision, recall, and F1 score with these class labels
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    precision = precision_score(y_true_labels, y_pred_labels, average='macro')
    recall = recall_score(y_true_labels, y_pred_labels, average='macro')
    f1 = f1_score(y_true_labels, y_pred_labels, average='macro')


    
    return mse, r2, precision, recall, f1

In [16]:
from scripts.models import build_linear_regression
lr_model = build_linear_regression(X_train, y_train)

In [20]:
y_pred_lr = lr_model.predict(X_test)
y_true = y_test

In [None]:
from sklearn.linear_model import LinearRegression

feature_names = ['CityName','RiskFactor','VehicleAge', 'Cylinders', 'kilowatts', 'CustomValueEstimate', 'CapitalOutstanding', 'NewVehicle', 'Rebuilt', 'Converted', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'Section', 'Product', 'StatutoryRiskType','TotalPremium', 'TotalClaims']

# Get the coefficients
coefficients = lr_model.coef_

# Print feature importance
for feature, coef in zip(feature_names, coefficients):
    print(f'{feature}: {coef}')


In [17]:
from scripts.models import build_decision_tree


dt_model= build_decision_tree(X_train, y_train)

In [24]:
y_pred_lr = lr_model.predict(X_test)
y_true = y_test

In [18]:
from scripts.models import build_random_forest


rf_model = build_random_forest(X_train, y_train)

In [None]:
y_pred_lr = lr_model.predict(X_test)
y_true = y_test

In [52]:
import matplotlib.pyplot import plt
# Example with Random Forest
importances = rf_model.feature_importances_
sorted_indices = np.argsort(importances)[::-1]

# Plot feature importance
plt.barh(np.array(feature_names)[sorted_indices], importances[sorted_indices])
plt.title('Feature Importance')
plt.show()


ImportError: cannot import name 'plt' from 'matplotlib.pyplot' (c:\Users\windows 10\Desktop\Insurance_Analatics\Insurance_Analytics\.venv\Lib\site-packages\matplotlib\pyplot.py)

In [19]:
from scripts.models import build_xgboost


xgb_model = build_xgboost(X_train, y_train)

In [31]:
y_pred_lr = lr_model.predict(X_test)
y_true = y_test

In [None]:
mse, r2, precision, recall, f1 = evaluate_model(xgb_model,X_test,y_test)
print(mse, r2, precision, recall, f1)

In [None]:
from scripts.visualize imort visualize
visualize()