# **1. Importing the Libraries**

In [933]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgbm
from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# **2. Loading the Data**

In [934]:
train = pd.read_csv('/content/drive/MyDrive/Competitions/IndabaX Ghana AutoInland Vehicle Insurance Claim/IndabaX_Train.csv')
test = pd.read_csv('/content/drive/MyDrive/Competitions/IndabaX Ghana AutoInland Vehicle Insurance Claim/IndabaX_Test.csv')
SS = pd.read_csv('/content/drive/MyDrive/Competitions/IndabaX Ghana AutoInland Vehicle Insurance Claim/SampleSubmission.csv')
StateName = pd.read_csv('/content/drive/MyDrive/Competitions/IndabaX Ghana AutoInland Vehicle Insurance Claim/NigerianStateNames.csv')

In [935]:
train.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_0040R73,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,Black,TOYOTA,,,Car Classic,0
1,ID_0046BNK,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,Grey,TOYOTA,,,Car Classic,1
2,ID_005QMC3,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,Red,TOYOTA,,,Car Classic,0
3,ID_0079OHW,2010-08-21,2011-08-20,Male,2,2010-08-21,1,,,,,,CarSafe,0
4,ID_00BRP63,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,,Lagos,Lagos,Muuve,1


# **3. Exploring and Transforming Data**

In [936]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1202 entries, 0 to 1201
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      1202 non-null   object
 1   Policy Start Date       1202 non-null   object
 2   Policy End Date         1202 non-null   object
 3   Gender                  1161 non-null   object
 4   Age                     1202 non-null   int64 
 5   First Transaction Date  1202 non-null   object
 6   No_Pol                  1202 non-null   int64 
 7   Car_Category            830 non-null    object
 8   Subject_Car_Colour      505 non-null    object
 9   Subject_Car_Make        954 non-null    object
 10  LGA_Name                546 non-null    object
 11  State                   546 non-null    object
 12  ProductName             1202 non-null   object
dtypes: int64(2), object(11)
memory usage: 122.2+ KB


In [937]:
# Combine train and test set
ntrain = train.shape[0] # to be used to split train and test set from the combined dataframe

all_data = pd.concat((train, test)).reset_index(drop=True)

In [938]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13281 entries, 0 to 13280
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      13281 non-null  object 
 1   Policy Start Date       13281 non-null  object 
 2   Policy End Date         13281 non-null  object 
 3   Gender                  12881 non-null  object 
 4   Age                     13281 non-null  int64  
 5   First Transaction Date  13281 non-null  object 
 6   No_Pol                  13281 non-null  int64  
 7   Car_Category            9171 non-null   object 
 8   Subject_Car_Colour      5622 non-null   object 
 9   Subject_Car_Make        10557 non-null  object 
 10  LGA_Name                6149 non-null   object 
 11  State                   6137 non-null   object 
 12  ProductName             13281 non-null  object 
 13  target                  12079 non-null  float64
dtypes: float64(1), int64(2), object(11)
me

In [939]:
all_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,13281.0,42.375574,93.170951,-6099.0,35.0,41.0,50.0,320.0
No_Pol,13281.0,1.302763,0.723195,1.0,1.0,1.0,1.0,10.0
target,12079.0,0.120457,0.325509,0.0,0.0,0.0,0.0,1.0


In [940]:
all_data.loc[all_data['Age']<10,'Age'] = np.NaN

In [941]:
all_data.loc[all_data['Age']>75,'Age'] = np.NaN

In [942]:
for col in all_data.columns:
  print(f'The number of unique values of {col}: {len(train[col].unique())}')

The number of unique values of ID: 12079
The number of unique values of Policy Start Date: 376
The number of unique values of Policy End Date: 372
The number of unique values of Gender: 8
The number of unique values of Age: 110
The number of unique values of First Transaction Date: 376
The number of unique values of No_Pol: 8
The number of unique values of Car_Category: 17
The number of unique values of Subject_Car_Colour: 46
The number of unique values of Subject_Car_Make: 75
The number of unique values of LGA_Name: 259
The number of unique values of State: 112
The number of unique values of ProductName: 9
The number of unique values of target: 2


In [943]:
all_data['Car_Category'].value_counts().to_frame()

Unnamed: 0,Car_Category
Saloon,6633
JEEP,2223
Truck,108
Bus,56
Mini Bus,45
Pick Up,32
Motorcycle,18
Sedan,14
Mini Van,13
Wagon,10


## **3.1. Dealing With LGA_Name and States**

In [944]:
all_data[(all_data.LGA_Name.isnull()) & (~all_data.State.isnull())]

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
9380,ID_S1O477F,2010-12-20,2011-12-19,Male,38.0,2010-12-20,1,,Black,TOYOTA,,N-A,Car Classic,0.0


In [945]:
all_data.loc[all_data['State']=='N-A','State'] = np.NaN

In [946]:
all_data[(~all_data.LGA_Name.isnull()) & (all_data.State.isnull())]

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
297,ID_0X9FT1B,2010-08-02,2011-01-23,Female,41.0,2010-08-02,1,Saloon,Black,TOYOTA,LGA,,Car Classic,0.0
568,ID_1S28LSA,2010-08-01,2011-07-31,Male,30.0,2010-08-01,1,Saloon,White,Honda,LGA,,Car Classic,0.0
1024,ID_3699IFV,2010-01-19,2011-01-18,Joint Gender,35.0,2010-01-19,1,,Silver,Lexus,LGA,,Car Plus,0.0
1168,ID_3LERA4V,2010-11-18,2011-11-17,Female,35.0,2010-11-18,1,,,,LGA,,Car Classic,0.0
1262,ID_3UVSEFO,2010-11-22,2011-11-21,Female,40.0,2010-11-22,2,,,,LGA,,Car Plus,0.0
2544,ID_7L9SS9R,2010-07-31,2011-07-30,Female,42.0,2010-07-31,1,Saloon,Grey,TOYOTA,LGA,,Car Classic,1.0
3583,ID_AO08COK,2010-04-12,2011-04-11,Male,52.0,2010-04-12,1,JEEP,Green,TOYOTA,LGA,,Car Classic,0.0
4717,ID_E63UTB8,2010-07-31,2011-07-30,Female,42.0,2010-07-31,1,Saloon,Grey,TOYOTA,LGA,,Car Classic,0.0
7321,ID_M0XED7B,2010-12-20,2011-12-19,Female,42.0,2010-12-20,1,Saloon,,TOYOTA,LGA,,CarSafe,0.0
7608,ID_MZK0QIU,2010-01-04,2011-01-03,Male,51.0,2010-01-04,1,JEEP,,Honda,LGA,,CarSafe,0.0


In [947]:
all_data.loc[all_data['LGA_Name']=='LGA','LGA_Name'] = np.NaN

In [948]:
StateName.head()

Unnamed: 0,LGA,State
0,Abadam,Borno State
1,Abaji,Federal Capital Territory
2,Abak,Akwa Ibom State
3,Abakaliki,Ebonyi State
4,Aba-North,Abia State


In [949]:
StateName['LGA'] = StateName.LGA.str.replace('--',' ').str.replace('-',' ').str.title()


In [950]:
all_data['LGA_Name'] = all_data.LGA_Name.str.replace('-',' ').str.replace('/',' ').str.title()
all_data['LGA_Name'] = all_data.LGA_Name.str.replace('.',' ').str.replace(',','').str.strip()


In [951]:
LGA_not_found = []
for lga_name in all_data.LGA_Name:
  if lga_name not in StateName.LGA.unique():
    LGA_not_found.append(lga_name)

len(list(set(LGA_not_found)))

14

In [952]:
list(set(LGA_not_found))

[nan,
 'Ifako Ijaye',
 'AjegunleLagos  State',
 'Ogudu',
 'Ilesha West',
 'Olamabolo',
 'Ogba',
 'Obia Akpor',
 'Ogbmosho South',
 'Somolu',
 'Calabar Municipality',
 'Ilesha East',
 'Ovia Southwest',
 'Bekwara']

In [953]:
for lga in StateName.LGA.unique():
  if  (type(lga) != float):
    if ('Shomolu' in lga):
      print(lga)

Shomolu


In [954]:
correct_LGA = {
               'Obia Akpor':'Obio Akpor',
               'Ogbmosho South':'Ogbomosho South',
               'Ovia Southwest':'Ovia South West',
               'Ovia Southwest':'Ovia South West',
               'Ilesha East':'Ilesa East',
               'Ilesha West':'Ilesa West',
               'Ogudu':'Ojota',
               'Olamabolo':'Olamaboro',
                'Calabar Municipality':'Calabar Municipal',
               'Calabar Municipality':'Calabar Municipal',
               'Ifako Ijaye':'Ifako Ijaiye',
               'AjegunleLagos  State':'Ajegunle Lagos State',
               'Bekwara':'Bekwarra',
               'Ogba':'Ogbadibo',
               'Somolu':'Shomolu'
               }

all_data['LGA_Name'] = all_data['LGA_Name'].replace(correct_LGA)

In [955]:
all_data['State'] = all_data['LGA_Name'].replace(dict(zip(StateName.LGA,StateName.State)))

##**3.2. Dealing With Subject_Car_Colour**

In [956]:
all_data.Subject_Car_Colour.value_counts().to_frame()

Unnamed: 0,Subject_Car_Colour
Black,2057
Silver,605
Grey,565
As Attached,555
Blue,398
White,321
Red,274
Green,259
Gold,192
Ash,142


In [957]:
modified_colors = {'Ash':'Grey',
                   'Wine':'Red',
                   'Cream':'White',
                   'Champagne':'Beige',
                   'Burgundy':'Red'}

all_data['Subject_Car_Colour'] = all_data.Subject_Car_Colour.replace(modified_colors)

In [958]:
Colors = ['Black','Silver','As Attached','Blue','Red','White',
          'Green','Gold','Brown','Yellow','Orange','Purple','Beige']
        
def which_color(x,color):
  if type(x) != float:  
    if color in x:
      return 1
    else:
      return 0

for Color in Colors:
  all_data['Car_Color_'+Color] = all_data['Subject_Car_Colour'].apply(lambda x: which_color(x,Color))

def Gray_Or_Grey(x):
  if type(x) != float:
    if ('Gray' in x) or ('Grey' in x):
      return 1
    else:
      return 0

all_data['Car_Color_Gray'] = all_data['Subject_Car_Colour'].apply(lambda x: Gray_Or_Grey(x))

In [959]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13281 entries, 0 to 13280
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      13281 non-null  object 
 1   Policy Start Date       13281 non-null  object 
 2   Policy End Date         13281 non-null  object 
 3   Gender                  12881 non-null  object 
 4   Age                     12223 non-null  float64
 5   First Transaction Date  13281 non-null  object 
 6   No_Pol                  13281 non-null  int64  
 7   Car_Category            9171 non-null   object 
 8   Subject_Car_Colour      5622 non-null   object 
 9   Subject_Car_Make        10557 non-null  object 
 10  LGA_Name                6136 non-null   object 
 11  State                   13281 non-null  object 
 12  ProductName             13281 non-null  object 
 13  target                  12079 non-null  float64
 14  Car_Color_Black         5622 non-null 

In [960]:
all_data.drop(['First Transaction Date'],axis=1,inplace=True)

In [961]:
# all_data['Subject_Car_Colour'].fillna('Unknown',inplace=True)
# all_data['LGA_Name'].fillna('Unknown',inplace= True)
# all_data['Car_Category'].fillna('Unknown',inplace = True)
# all_data['Subject_Car_Make'].fillna('Unknown', inplace = True)
# all_data['State'].fillna('Unknown', inplace = True)
# all_data['Gender'].fillna('Unknown', inplace = True)

In [962]:
all_data['Gender'].value_counts()

Male            8356
Female          3679
Entity           300
Joint Gender     238
NOT STATED       190
NO GENDER         76
SEX               42
Name: Gender, dtype: int64

In [963]:
all_data['Gender'] = all_data['Gender'].replace({'Entity': 'Other',
                                                'Joint Gender':'Other',
                                                'NOT STATED':'Other',
                                                'NO GENDER': 'Other',
                                                'SEX':'Other'})
all_data['Gender'].value_counts()

Male      8356
Female    3679
Other      846
Name: Gender, dtype: int64

In [964]:
date_cols = [col for col in all_data.columns if 'Date' in col]
num_cols = ['Age', 'No_Pol']
cat_cols = [col for col in all_data.columns if col not in date_cols+num_cols+['ID', 'target']]

for col in all_data.columns:
  if col in date_cols:
    all_data[col] = pd.to_datetime(all_data[col])
  elif col in cat_cols:
    all_data[col] = all_data[col].astype('category')

# Confirm whether the changes have been applied successfully
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13281 entries, 0 to 13280
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ID                     13281 non-null  object        
 1   Policy Start Date      13281 non-null  datetime64[ns]
 2   Policy End Date        13281 non-null  datetime64[ns]
 3   Gender                 12881 non-null  category      
 4   Age                    12223 non-null  float64       
 5   No_Pol                 13281 non-null  int64         
 6   Car_Category           9171 non-null   category      
 7   Subject_Car_Colour     5622 non-null   category      
 8   Subject_Car_Make       10557 non-null  category      
 9   LGA_Name               6136 non-null   category      
 10  State                  13281 non-null  category      
 11  ProductName            13281 non-null  category      
 12  target                 12079 non-null  float64       
 13  C

In [965]:
# Fill in missing values
# For cat cols and date cols fill in with mode and for num cols fill in with 9999
for col in all_data.columns:
  if col in date_cols+cat_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
  elif col in num_cols:
    all_data[col] = all_data[col].fillna(all_data[col].fillna(round(all_data[col].mean())))

# Confirm that there aren't any missing values
all_data[all_data.columns.difference(['target'])].isna().sum()

Age                      0
Car_Category             0
Car_Color_As Attached    0
Car_Color_Beige          0
Car_Color_Black          0
Car_Color_Blue           0
Car_Color_Brown          0
Car_Color_Gold           0
Car_Color_Gray           0
Car_Color_Green          0
Car_Color_Orange         0
Car_Color_Purple         0
Car_Color_Red            0
Car_Color_Silver         0
Car_Color_White          0
Car_Color_Yellow         0
Gender                   0
ID                       0
LGA_Name                 0
No_Pol                   0
Policy End Date          0
Policy Start Date        0
ProductName              0
State                    0
Subject_Car_Colour       0
Subject_Car_Make         0
dtype: int64

In [966]:
for col in date_cols:
  for date_feature in ['day','dayofweek','dayofyear','year', 'month','quarter','weekday','weekofyear']:
    all_data[col+' '+date_feature] = getattr(all_data[col].dt, date_feature)
    # cat_cols.append(col+' '+date_feature) 
    
all_data.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,Car_Color_Black,Car_Color_Silver,Car_Color_As Attached,Car_Color_Blue,Car_Color_Red,Car_Color_White,Car_Color_Green,Car_Color_Gold,Car_Color_Brown,Car_Color_Yellow,Car_Color_Orange,Car_Color_Purple,Car_Color_Beige,Car_Color_Gray,Policy Start Date day,Policy Start Date dayofweek,Policy Start Date dayofyear,Policy Start Date year,Policy Start Date month,Policy Start Date quarter,Policy Start Date weekday,Policy Start Date weekofyear,Policy End Date day,Policy End Date dayofweek,Policy End Date dayofyear,Policy End Date year,Policy End Date month,Policy End Date quarter,Policy End Date weekday,Policy End Date weekofyear
0,ID_0040R73,2010-05-14,2011-05-13,Male,30.0,1,Saloon,Black,TOYOTA,Victoria Island,Unknown,Car Classic,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14,4,134,2010,5,2,4,19,13,4,133,2011,5,2,4,19
1,ID_0046BNK,2010-11-29,2011-11-28,Female,42.0,1,JEEP,Grey,TOYOTA,Victoria Island,Unknown,Car Classic,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,29,0,333,2010,11,4,0,48,28,0,332,2011,11,4,0,48
2,ID_005QMC3,2010-03-21,2011-03-20,Male,43.0,1,Saloon,Red,TOYOTA,Victoria Island,Unknown,Car Classic,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,6,80,2010,3,1,6,11,20,6,79,2011,3,1,6,11
3,ID_0079OHW,2010-08-21,2011-08-20,Male,42.0,1,Saloon,Black,TOYOTA,Victoria Island,Unknown,CarSafe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,5,233,2010,8,3,5,33,20,5,232,2011,8,3,5,33
4,ID_00BRP63,2010-08-29,2010-12-31,Other,20.0,3,Saloon,Black,TOYOTA,Lagos,State-Lagos,Muuve,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29,6,241,2010,8,3,6,34,31,4,365,2010,12,4,4,52


In [967]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
for col in cat_cols:
  all_data[col] = encoder.fit_transform(all_data[col])

In [968]:
# mapper_df = train_df.copy()
# for col in cat_cols:
#   train_df[col] = train_df[col].map(mapper_df.groupby(col)['target'].mean())
#   test_df[col] = train_df[col].map(mapper_df.groupby(col)['target'].mean()) 

# **4. Buinding Model**

In [969]:
# Separate train and test data from the combined dataframe
train_df = all_data[:ntrain]
test_df = all_data[ntrain:]

# Check the shapes of the split dataset
train_df.shape, test_df.shape

((12079, 43), (1202, 43))

In [970]:
# Select main columns to be used in training
features = train_df.columns.difference(['ID','target','Policy End Date','Policy Start Date',
                                       'Policy End Date month','Policy Start Date month','Policy End Date year'])
                                 

X = train_df[features]
y = train_df.target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=123)

In [971]:
train_df[train_df['target']==0].shape[0]/train_df.shape[0]

0.8795430085271959

In [972]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [974]:
lgbm = LGBMClassifier(n_estimators=800,
                       num_leaves=20,
                       min_child_samples=11,
                       class_weight={0:0.2, 1:0.8})

rf = RandomForestClassifier(n_estimators=100,
                               max_depth=16,
                               class_weight={0:0.14,1:0.86},
                               random_state=0)

xgb = XGBClassifier(n_estimators=1000,
                    max_depth=8,
                    scale_pos_weight=7)

model = VotingClassifier(estimators=[('lgbm',lgbm),('rf',rf),('xgb',xgb)])

In [975]:
w = 0.14

# model = LGBMClassifier(n_estimators=800,
#                        num_leaves=20,
#                        min_child_samples=11,
#                        class_weight={0:w, 1:1-w})                 

# model = RandomForestClassifier(n_estimators=100,
#                                max_depth=16,
#                                class_weight={0:w,1:1-w},
#                                random_state=0)

# Create a StratifiedKFold object
str_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

fold_metrics = []
n_fold = 1

for train_index, test_index in str_kf.split(X_train,y_train):
  cv_train, cv_test = X.iloc[train_index], X.iloc[test_index]
  y_train_cv, y_test_cv = y[train_index], y[test_index]


# Train a model
  model.fit(cv_train,y_train_cv)
# Make predictions
  predictions = model.predict(cv_test)
# Calculate the metric
  metric = f1_score(y_test_cv, predictions)
  fold_metrics.append(metric)
  print(f'fold {n_fold}: Done')
  n_fold += 1

np.mean(fold_metrics), np.std(fold_metrics)

fold 1: Done
fold 2: Done
fold 3: Done
fold 4: Done
fold 5: Done


(0.3171473034162367, 0.03304912629014588)

In [976]:
Local_val = round(np.mean(fold_metrics),4)

In [977]:
f1_score(y_test,model.predict(X_test))

0.7317073170731707

In [978]:
0.7335423197492164

0.7335423197492164

In [979]:
# ax=lgbm.plot_importance(model, figsize=(15,10))
# plt.show()

In [980]:
# Make prediction on the test set
test_df = test_df[features]
  
predictions = model.predict(test_df)

# Create a submission file
sub_file = SS.copy()
sub_file.target = predictions

In [981]:
# Create a csv file and upload to zindi 
sub_file.to_csv(f'IndabaX_Ghana_Sub({Local_val}).csv', index = False)