In [1]:
import os

# Step up one directory so that relative paths such as "data/..." and the
# `scripts` package used in later cells resolve from the parent folder.
# Caution: every re-run of this cell moves one level higher again.
os.chdir(os.pardir)

### Load the data

In [None]:
import pandas as pd

# Read the raw dataset; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data/dataset.csv")

In [39]:
# Keep a reproducible 10% random sample (fixed seed) to speed up the workflow.
# Caution: `df` is overwritten, so re-running this cell samples the sample.
df = df.sample(
    frac=0.1,
    random_state=42,
)

In [40]:
# Print the column names, non-null counts and dtypes of the sampled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100010 entries, 57024 to 206822
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                100010 non-null  int64  
 1   UnderwrittenCoverID       100010 non-null  int64  
 2   PolicyID                  100010 non-null  int64  
 3   TransactionMonth          100010 non-null  object 
 4   IsVATRegistered           100010 non-null  bool   
 5   Citizenship               100010 non-null  object 
 6   LegalType                 100010 non-null  object 
 7   Title                     100010 non-null  object 
 8   Language                  100010 non-null  object 
 9   Bank                      85337 non-null   object 
 10  AccountType               95959 non-null   object 
 11  MaritalStatus             99192 non-null   object 
 12  Gender                    99077 non-null   object 
 13  Country                   100010 non-null  ob

In [26]:
import pandas as pd

# Column layout of the header-less, tab-separated postal code file.
postal_columns = [
    'CountryCode', 'PostalCode', 'PlaceName',
    'Admin1Name', 'Admin1Code',
    'Admin2Name', 'Admin2Code',
    'Admin3Name', 'Admin3Code',
    'Latitude', 'Longitude', 'Accuracy',
]
postal_data = pd.read_csv("data/ZA.txt", sep='\t', header=None, names=postal_columns)

# Lookup table: postal code -> place name.
# If a postal code occurs on several rows, the last row's place name is kept.
postal_code_to_city = dict(
    zip(postal_data['PostalCode'], postal_data['PlaceName'])
)

# Attach the place name for every record; codes without a match become NaN.
df['CityName'] = df['PostalCode'].map(postal_code_to_city)


### Selecting important columns

In [27]:
from scripts.data_preparation import feature_engineering

# Run the project's feature-engineering step on the frame.
# Presumably this adds derived columns such as RiskFactor and VehicleAge,
# which are selected in the next cell — confirm in scripts/data_preparation.
df = df.pipe(feature_engineering)

In [28]:

# Columns kept for modelling: location, vehicle, cover/product descriptors,
# plus the two target columns (TotalPremium, TotalClaims) at the end.
selected_columns = [
    'CityName', 'RiskFactor', 'VehicleAge',
    'MainCrestaZone', 'SubCrestaZone',
    'mmcode', 'Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors',
    'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice',
    'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted',
    'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected',
    'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product',
    'StatutoryClass', 'StatutoryRiskType',
    'TotalPremium', 'TotalClaims',
]

# Restrict the frame to exactly these columns, in this order.
df = df.loc[:, selected_columns]

### Data Preparation: 

In [29]:
from scripts.preprocessing import replace_missing_with_mean, replace_missing_with_mode

# Split the frame by dtype: 64-bit numeric columns vs. object (text) columns.
# Both names stay bound to DataFrames, as before.
numerical_cols = df.select_dtypes(include=['int64', 'float64'])
categorical_cols = df.select_dtypes(include='object')

# Pass column *labels* to both helpers so the two calls follow the same
# convention (the categorical call previously received a whole DataFrame).
# Presumably the helpers fill gaps with the column mode / mean, as their
# names suggest — confirm in scripts/preprocessing.
# NOTE(review): the numeric set includes the targets (TotalPremium,
# TotalClaims) and the mmcode identifier — confirm that imputing them with
# the mean is intended.
df = replace_missing_with_mode(df, categorical_cols.columns)
df = replace_missing_with_mean(df, numerical_cols.columns)



In [31]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale. The list contains both target columns
# (TotalPremium, TotalClaims) in addition to the features.
columns_to_scale = [
    'RiskFactor',
    'VehicleAge',
    'mmcode',
    'Cylinders',
    'cubiccapacity',
    'kilowatts',
    'NumberOfDoors',
    'CustomValueEstimate',
    'CapitalOutstanding',
    'SumInsured',
    'CalculatedPremiumPerTerm',
    'TotalPremium',
    'TotalClaims',
]

# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split further down, so test rows influence the learned min/max. Because the
# targets are scaled too, downstream metrics are in scaled units — confirm
# that both are intended.
scaler = MinMaxScaler()
scaler.fit(df[columns_to_scale])

# Overwrite the selected columns with their min-max scaled values.
df[columns_to_scale] = scaler.transform(df[columns_to_scale])


#### Encoding Categorical Data:
* Convert categorical data into a numeric format using one-hot encoding or label encoding to make it suitable for modeling.

In [32]:
from scripts.data_preparation import encode_categorical_data

# Replace the frame with the output of the project's categorical-encoding helper.
# Which encoding scheme is used is not visible here — see scripts/data_preparation.
df = df.pipe(encode_categorical_data)


#### Train-Test Split:
* Divide the data into a training set (for building the model) and a test set (for validating the model) using a 70:30 ratio.


In [33]:
from scripts.data_preparation import train_test_splitting

# Both premium and claims are passed as target columns.
# Presumably the helper treats all remaining columns as features and applies
# the split ratio itself — confirm in scripts/data_preparation.
target_cols = ['TotalPremium', 'TotalClaims']
X_train, X_test, y_train, y_test = df.pipe(train_test_splitting, target_cols)

### Model Building
* Implement Linear Regression, Random Forests, and XGBoost models


In [None]:

from scripts.models import train_and_evaluate_models

# Only the TotalPremium target is modelled in this run; TotalClaims is not used here.
premium_train = y_train['TotalPremium']
premium_test = y_test['TotalPremium']
results = train_and_evaluate_models(X_train, X_test, premium_train, premium_test)

# `results` maps each model name to its metrics; print one summary line per model.
for model_name, metrics in results.items():
    print(f"{model_name}: MSE = {metrics['MSE']}, R2 = {metrics['R2']}")