# Exploratory Data Analysis (EDA)

## Import Dependencies and Custom Modules

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
# Put the parent directory of the working directory on the import path so the
# local `scripts` package (imported just below) can be found.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from scripts.data_loader import DataLoader
from scripts.data_analysis import DataAnalysis
from scripts.data_visualize import DataVisualize

In [25]:
# Load the processed CSV through the project's DataLoader helper.
dataLoader = DataLoader()
df = dataLoader.load_csv('../data/cleaned/ProcessedMachineLearningRating_v3.csv')

# Build the project helpers from the loaded frame; later cells call
# dataAnalysis methods (drop_columns, fill_with_not_specified, ...) for cleaning.
dataAnalysis = DataAnalysis(df)
dataVisualize = DataVisualize(df)

  data = pd.read_csv(csv_path)


Loaded data from ../data/cleaned/ProcessedMachineLearningRating_v3.csv


## Data Analysis

### Columns about the client

In [26]:
# Columns that hold client information
client_columns = [
    'IsVATRegistered',
    'Citizenship',
    'LegalType',
    'Title',
    'Language',
    'Bank',
    'AccountType',
    'MaritalStatus',
    'Gender',
]

In [27]:
# Replace the two-space placeholder in 'Citizenship' with 'Not specified'
blank_citizenship = df['Citizenship'].eq('  ')
df.loc[blank_citizenship, 'Citizenship'] = 'Not specified'

In [28]:
# Drop 'Language': every row holds the same value, so the column carries no
# information. The frame returned by the project helper becomes the working `df`.
df = dataAnalysis.drop_columns('Language')

In [29]:
# Fill missing values in 'Bank' and 'AccountType' with 'Not specified', then
# report for each column how many missing values remain.
dataAnalysis.fill_with_not_specified('Bank')
print(f"Remaining missing bank values: {dataAnalysis.calculate_missing_value('Bank')}")

dataAnalysis.fill_with_not_specified('AccountType')
# The label names the column actually being checked (AccountType, not Bank).
print(f"Remaining missing account type values: {dataAnalysis.calculate_missing_value('AccountType')}")

Remaining missing bank values: 0
Remaining missing bank values: 0


In [30]:
# Define title-maritalStatus mappings: titles that imply a marital status
title_maritalStatus_map = {
    'Mrs': 'Married',
    'Miss': 'Single'
}

missing_before = dataAnalysis.calculate_missing_value('MaritalStatus')
not_specified_before = dataAnalysis.length_column_value('MaritalStatus', 'Not specified')

# Apply mappings where MaritalStatus is missing/unspecified
for title, maritalStatus in title_maritalStatus_map.items():
    df.loc[(df['Title'] == title) &
           (df['MaritalStatus'].isna() | (df['MaritalStatus'] == 'Not specified')),
           'MaritalStatus'] = maritalStatus

missing_after = dataAnalysis.calculate_missing_value('MaritalStatus')
not_specified_after = dataAnalysis.length_column_value('MaritalStatus', 'Not specified')

print(f"Filled {missing_before - missing_after} missing maritalStatus values")
print(f"Filled {not_specified_before - not_specified_after} Not specified maritalStatus values")

# Fill remaining missing values in 'MaritalStatus' with 'Not specified'
dataAnalysis.fill_with_not_specified('MaritalStatus')

missing_after_filling = dataAnalysis.calculate_missing_value('MaritalStatus')
# Report what this fill changed: the missing count before it minus the count after it.
print(f"Filled {missing_after - missing_after_filling} missing maritalStatus values")
print(f"Remaining missing maritalStatus values: {missing_after_filling}")

Filled 0 missing maritalStatus values
Filled 51740 Not specified maritalStatus values
Filled 8259 missing maritalStatus values
Remaining missing maritalStatus values: 0


In [31]:
# Title -> gender lookup used to infer a gender that is absent
title_gender_map = {'Mr': 'Male', 'Mrs': 'Female', 'Miss': 'Female', 'Ms': 'Female'}

missing_before = dataAnalysis.calculate_missing_value('Gender')
not_specified_before = dataAnalysis.length_column_value('Gender', 'Not specified')

# Fill only rows whose gender is NaN or 'Not specified' AND whose title is in
# the lookup; every other row is left untouched.
inferred_gender = df['Title'].map(title_gender_map)
gender_unknown = df['Gender'].isna() | df['Gender'].eq('Not specified')
fillable = gender_unknown & inferred_gender.notna()
# to_numpy() makes the assignment positional, so no index alignment is involved
df.loc[fillable, 'Gender'] = inferred_gender[fillable].to_numpy()

missing_after = dataAnalysis.calculate_missing_value('Gender')
not_specified_after = dataAnalysis.length_column_value('Gender', 'Not specified')

print(f"Filled {missing_before - missing_after} missing gender values")
print(f"Filled {not_specified_before - not_specified_after} Not specified gender values")
print(f"Remaining missing gender values: {missing_after}")

Filled 9536 missing gender values
Filled 940180 Not specified gender values
Remaining missing gender values: 0


The **Client Information** columns are IsVATRegistered, Citizenship, LegalType, Title, Language,
       Bank, AccountType, MaritalStatus and Gender.
- **Citizenship**: '  ' filled with 'Not specified'
- **Language**: Dropped because it has the same value which is English.
- **Bank**, **AccountType**: Filled missing values with 'Not specified'
- **MaritalStatus**: Where the value was missing or 'Not specified', set it to 'Married' for the title Mrs and 'Single' for the title Miss; the remaining missing values were filled with 'Not specified'
- **Gender**: Where the value was missing or 'Not specified', set it to 'Female' for the titles Mrs, Miss and Ms, and to 'Male' for the title Mr

### Client Location

In [32]:
# Columns that hold client location information
client_location_columns = [
    'Country',
    'Province',
    'PostalCode',
    'MainCrestaZone',
    'SubCrestaZone',
]

In [33]:
# Drop 'Country': every row holds the same value, so the column carries no
# information. The frame returned by the project helper becomes the working `df`.
df = dataAnalysis.drop_columns('Country')

The **Client Location** columns are 'Country', 'Province', 'PostalCode', 'MainCrestaZone' and 'SubCrestaZone'.
- **Country**: Dropped because it has only **1** unique value (South Africa).


### Car Insured

In [34]:
# Columns that hold car insured information (first group)
car_insured_columns1 = [
    'ItemType',
    'mmcode',
    'VehicleType',
    'RegistrationYear',
    'make',
    'Model',
    'Cylinders',
    'cubiccapacity',
    'kilowatts',
    'bodytype',
    'NumberOfDoors',
]

In [35]:
# Drop 'ItemType' via the project helper; the returned frame becomes the working `df`.
df = dataAnalysis.drop_columns('ItemType')

In [36]:
# Remove leading and trailing whitespace (str.strip trims both ends)
df['make'] = df['make'].str.strip()
df['Model'] = df['Model'].str.strip()

# Show the distinct makes after cleaning
df['make'].unique()

array(['MERCEDES-BENZ', 'VOLKSWAGEN', 'RENAULT', 'FORD', 'BMW', 'AUDI',
       'VOLVO', 'PROTON', 'TOYOTA', 'NISSAN/DATSUN', 'CMC', 'C.A.M',
       'POLARSUN', 'NISSAN', 'IVECO', 'FIAT', 'JINBEI', 'HYUNDAI',
       'GOLDEN JOURNEY', 'KIA', 'MITSUBISHI', 'TATA', 'MAZDA', 'CITROEN',
       'FOTON', 'B.A.W', 'PEUGEOT', 'SUZUKI', 'CHERY', 'SAIC', 'JMC',
       'HINO', 'HONDA', 'CHEVROLET', nan, 'OPEL', 'MAHINDRA', 'DAIHATSU',
       'GEELY', 'SCANIA', 'LEXUS', 'HUMMER', 'MARCOPOLO'], dtype=object)

In [37]:
# Vehicle-detail columns that are checked together for missing values
cols_552 = [
    'mmcode', 'VehicleType', 'make', 'Model', 'Cylinders',
    'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate'
]

# Count the rows in which every one of these columns is missing at once
missing_together = df[cols_552].isnull().all(axis=1)
print("Number of rows with all 552-value columns missing:", missing_together.sum())

# Drop rows with a missing value in ANY of these columns (dropna defaults to how='any').
# NOTE(review): this rebinds `df` to a new frame, while `dataAnalysis` was built from
# the earlier one - confirm its helpers still operate on the current data.
df = df.dropna(subset=cols_552)


Number of rows with all 552-value columns missing: 552


The first half of the **Car Insured** columns are 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', and 'VehicleIntroDate'.
- **ItemType**: Dropped due to having one unique value.
- **make**, **Model**: Removed leading and trailing whitespace.
- Dropped the **552 rows** in which the vehicle-detail columns are all missing together, because vehicle information is important.

In [None]:
# Columns that hold car insured information (second group)
car_insured_columns2 = [
    'CustomValueEstimate',
    'AlarmImmobiliser',
    'TrackingDevice',
    'CapitalOutstanding',
    'NewVehicle',
    'WrittenOff',
    'Rebuilt',
    'Converted',
    'CrossBorder',
    'NumberOfVehiclesInFleet',
]


In [None]:
# Impute missing custom value estimates with the column median. The result is
# assigned back instead of calling fillna(inplace=True) on the selected column:
# an in-place call on `df[col]` is chained assignment, which pandas warns about
# and which does not update `df` when Copy-on-Write is enabled.
df['CustomValueEstimate'] = df['CustomValueEstimate'].fillna(df['CustomValueEstimate'].median())

# Missing values in these columns are replaced with the label 'Not specified'
flag_columns = ['WrittenOff', 'Rebuilt', 'Converted']
df[flag_columns] = df[flag_columns].fillna('Not specified')


In [69]:
# Frequency of each NewVehicle value; dropna=False keeps missing values in the count
print(df['NewVehicle'].value_counts(dropna=False))


NewVehicle
More than 6 months    845223
NaN                   153295
Less than 6 months      1580
Name: count, dtype: int64


In [72]:
# Summary statistics of RegistrationYear within each NewVehicle category
print(df.groupby('NewVehicle')['RegistrationYear'].describe())


                     count unique         top    freq
NewVehicle                                           
Less than 6 months    1580      4  2014-01-01     825
More than 6 months  845223     25  2014-01-01  144957


In [73]:
# Registration-year counts for each NewVehicle category, ordered by year
for vehicle_age in ('Less than 6 months', 'More than 6 months'):
    year_counts = df[df['NewVehicle'] == vehicle_age]['RegistrationYear'].value_counts()
    print(year_counts.sort_index())


RegistrationYear
2007-01-01     50
2012-01-01     90
2014-01-01    825
2015-01-01    615
Name: count, dtype: int64
RegistrationYear
1987-01-01         3
1988-01-01         1
1992-01-01         1
1994-01-01        51
1995-01-01       344
1996-01-01       865
1997-01-01      1039
1998-01-01      1243
1999-01-01      1531
2000-01-01      1316
2001-01-01      5641
2002-01-01      5975
2003-01-01      9604
2004-01-01     15274
2005-01-01     27270
2006-01-01     37378
2007-01-01     72343
2008-01-01     86594
2009-01-01     54050
2010-01-01     77563
2011-01-01     69788
2012-01-01    106598
2013-01-01     98820
2014-01-01    144957
2015-01-01     26974
Name: count, dtype: int64


In [52]:
# Convert 'CapitalOutstanding' to numbers in one pass:
#   1. cast to str so the .str accessor is safe on every value,
#   2. swap the comma for a dot,
#   3. parse as numeric, turning anything unparseable into NaN.
df['CapitalOutstanding'] = pd.to_numeric(
    df['CapitalOutstanding'].astype(str).str.replace(',', '.'),
    errors='coerce',
)


In [42]:
# Drop 'CrossBorder' (almost no values) and 'NumberOfVehiclesInFleet' (no values).
# The columns are dropped from the working frame `df` itself: earlier cells rebound
# and modified `df` directly (dropna, fillna, numeric conversion), so taking the
# frame back from `dataAnalysis`, which was built before those steps, could
# discard that work.
df = df.drop(columns=['CrossBorder', 'NumberOfVehiclesInFleet'])


The **Car Insured** columns are 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder' and 'NumberOfVehiclesInFleet'.
- **ItemType**: Dropped because it has only **1** unique value (Mobility - Motor).
- **Province**: **9** unique values, with Gauteng having the highest count.
- The columns are of **object** dtype, except **PostalCode**, which is an integer.
- **Missing values** in columns:
  - All values are **missing** in column **NumberOfVehiclesInFleet**, so it was dropped.

In [43]:
# Save the cleaned frame to CSV; index=False keeps the row index out of the file
csvfile = '../data/cleaned/CleanedMachineLearningRating_v3.csv'

df.to_csv(csvfile, index=False)
print(f"Data saved as {csvfile}")

Data saved as ../data/cleaned/CleanedMachineLearningRating_v3.csv
