In [77]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings

In [55]:
df = pd.read_csv('data/Training Data.csv')

In [56]:
df.columns

Index(['Id', 'Income', 'Age', 'Experience', 'Married/Single',
       'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE',
       'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag'],
      dtype='object')

### Preparing Dataset

In [59]:
# Create Column Transformer with 3 types of transformers
num_features = df.select_dtypes(exclude="object").columns
cat_features = df.select_dtypes(include="object").columns

In [60]:
cat_features

Index(['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession',
       'CITY', 'STATE'],
      dtype='object')

In [61]:
# Instantiate the LabelEncoder
label_encoder = LabelEncoder()

In [62]:
# Fit and transform the categorical data
df['marital_status'] = label_encoder.fit_transform(df['Married/Single'])
df['House_Ownership_encode'] = label_encoder.fit_transform(df['House_Ownership'])
df['Car_Ownership_encode'] = label_encoder.fit_transform(df['Car_Ownership'])
df['Profession_encode'] = label_encoder.fit_transform(df['Profession'])
df['CITY_encode'] = label_encoder.fit_transform(df['CITY'])
df['STATE_encode'] = label_encoder.fit_transform(df['STATE'])

In [63]:
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,marital_status,House_Ownership_encode,Car_Ownership_encode,Profession_encode,CITY_encode,STATE_encode
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0,1,2,0,33,251,13
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0,1,2,0,43,227,14
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0,0,2,0,47,8,12
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1,1,2,1,43,54,17
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1,1,2,0,11,296,22


In [64]:
# Instantiate the StandardScaler
scaler = StandardScaler()

In [65]:
# Fit and transform the numerical column
scaled_data = scaler.fit_transform(df[['Income']])
df['Scaled_income'] = scaled_data

In [66]:
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,marital_status,House_Ownership_encode,Car_Ownership_encode,Profession_encode,CITY_encode,STATE_encode,Scaled_income
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0,1,2,0,33,251,13,-1.283145
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0,1,2,0,43,227,14,0.895457
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0,0,2,0,47,8,12,-0.349269
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1,1,2,1,43,54,17,0.437526
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1,1,2,0,11,296,22,0.268128


In [67]:
df.columns

Index(['Id', 'Income', 'Age', 'Experience', 'Married/Single',
       'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE',
       'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag', 'marital_status',
       'House_Ownership_encode', 'Car_Ownership_encode', 'Profession_encode',
       'CITY_encode', 'STATE_encode', 'Scaled_income'],
      dtype='object')

In [68]:
selected_columns = ['Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'marital_status',
       'House_Ownership_encode', 'Car_Ownership_encode', 'Profession_encode',
       'CITY_encode', 'STATE_encode', 'Scaled_income']

In [69]:
Y = df['Risk_Flag']
X = df[selected_columns]

In [70]:
X.shape

(252000, 11)

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((201600, 11), (50400, 11))

### Model training and evaluation

In [72]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [93]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}
model_list = []
accuracy_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    precision_train = precision_score(y_train, y_train_pred)
    recall_train = recall_score(y_train, y_train_pred)
    f1_train = f1_score(y_train, y_train_pred)

    precision_test = precision_score(y_test, y_test_pred)
    recall_test = recall_score(y_test, y_test_pred)
    f1_test = f1_score(y_test, y_test_pred)
    accuracy_test = accuracy_score(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Precision: {:.4f}".format(precision_train))
    print("- Recall: {:.4f}".format(recall_train))
    print("- F1 score: {:.4f}".format(f1_train))
    print("- Accuracy Score: {:.4f}".format(accuracy_score(y_train, y_train_pred)))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Precision: {:.4f}".format(precision_test))
    print("- Recall: {:.4f}".format(recall_test))
    print("- F1 score: {:.4f}".format(f1_test))
    print("- Accuracy Score: {:.4f}".format(accuracy_score(y_test, y_test_pred)))
    accuracy_list.append(accuracy_test)
    
    print('='*35)
    print('\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression
Model performance for Training set
- Precision: 0.0000
- Recall: 0.0000
- F1 score: 0.0000
- Accuracy Score: 0.8773
----------------------------------
Model performance for Test set
- Precision: 0.0000
- Recall: 0.0000
- F1 score: 0.0000
- Accuracy Score: 0.8759


K-Neighbors Classifier
Model performance for Training set
- Precision: 0.6035
- Recall: 0.5269
- F1 score: 0.5626
- Accuracy Score: 0.8994
----------------------------------
Model performance for Test set
- Precision: 0.5520
- Recall: 0.4887
- F1 score: 0.5184
- Accuracy Score: 0.8874


Decision Tree
Model performance for Training set
- Precision: 0.7194
- Recall: 0.7900
- F1 score: 0.7531
- Accuracy Score: 0.9364
----------------------------------
Model performance for Test set
- Precision: 0.5148
- Recall: 0.5688
- F1 score: 0.5405
- Accuracy Score: 0.8800


Random Forest Classifier
Model performance for Training set
- Precision: 0.7138
- Recall: 0.8042
- F1 score: 0.7563
- Accuracy Score: 0.9364
-------

In [94]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'Accuracy_Score']).sort_values(by=["Accuracy_Score"],ascending=False)

Unnamed: 0,Model Name,Accuracy_Score
3,Random Forest Classifier,0.898651
5,CatBoosting Classifier,0.888591
1,K-Neighbors Classifier,0.887361
4,XGBClassifier,0.887242
2,Decision Tree,0.87998
6,AdaBoost Classifier,0.876032
0,Logistic Regression,0.875933
