
# Import Python libraries.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression as lr

from sklearn.ensemble import GradientBoostingClassifier

In [2]:
def get_state_dict():
    # This encoding is done based on per-capita-income-rating

    state_dict = { 33: "Andaman and Nicobar Islands",17 : "Andhra Pradesh",18 : "Arunachal Pradesh",28 : "Assam",
    32 : "Bihar",4 : "Chandigarh",25 : "Chhattisgarh",3 : "Delhi",1 : "Goa",10 : "Gujarat",5 : "Haryana",
    14 : "Himachal Pradesh",23 : "Jammu and Kashmir",30 : "Jharkhand",6 : "Karnataka",8 : "Kerala",26 : "Madhya Pradesh",
    12 : "Maharashtra",29 : "Manipur",27 : "Meghalaya",15 : "Mizoram",19 : "Nagaland",22 : "Odisha",7 : "Puducherry",
    16 : "Punjab",21 : "Rajasthan",2 : "Sikkim",13 : "Tamil Nadu",9 : "Telangana",20 : "Tripura",31 : "Uttar Pradesh",
    11 : "Uttarakhand",24 : "West Bengal" }
    
    state_dict = dict([(value, int(key)) for key, value in state_dict.items()])
    return state_dict


def get_cities_dict():
    # Ranking goes top to Bottom

    cities_list_ranked = ["Bengaluru","Pune","Ahmedabad","Chennai","Surat","Navi Mumbai","Coimbatore","Vadodara","Indore",
          "Greater Mumbai","Thane","Kalyan Dombivali","New Delhi","Noida","Ludhiana","Visakhapatnam","Pimpri Chinchwad",
    "Solapur","Raipur","Bhopal","Rajkot","Jodhpur","Madurai","Jaipur","Hyderabad","Nagpur","Lucknow","Varanasi",
    "Kanpur","Chandigarh","Ghaziabad","Gwalior","Prayagraj","Patna","Aurangabad","Agra","Meerut","Hubli Dharwad",
    "Nashik","Vasai Virar","Faridabad","Vijayawada","Ranchi","Jabalpur","Kota","Amritsar","Guwahati","Barielly",
    "Dhanbad","Srinagar" ]

    # The cities which are not present will be added at last, and one feature would be if they are present or not

    bool_city = list(np.ones((len(cities_list_ranked)),dtype = np.int64))
    
    for i in full_data.city.values:
        i = i.split("[")[0]
        if i not in cities_list_ranked:
            bool_city.append(0)
            cities_list_ranked.append(i)

    # Create a dictionary for Mapping

    cities_dict = {}
    for i,city in enumerate(cities_list_ranked):
        cities_dict[city] = i

    return cities_dict


In [3]:
def get_profess_dict(method):
    
    if method == 1 :
        # Based of Avg Salary
        
        profess_mean = []

        for i in full_data.profession.unique():
            inc = full_data[full_data.profession == i]["income"].mean()
            profess_mean.append([i,inc])

        profess_mean = np.array(profess_mean)[np.array(profess_mean)[:,1].argsort()]

        profess_dict = {}

        for i,prof in enumerate(profess_mean):
            profess_dict[prof[0]] = i
        
        return profess_dict
    
    elif method == 2 :
        # Based on risk flag

        profess_mean = []

        for i in full_data.profession.unique():
            inc = full_data[full_data.profession == i]["risk_flag"].sum()
            profess_mean.append([i,inc])

        profess_mean = np.array(profess_mean)[np.array(profess_mean)[:,1].argsort()]

        profess_dict = {}

        for i,prof in enumerate(profess_mean):
            profess_dict[prof[0]] = i

        return profess_dict

# Import Datasets

In [4]:
train_data = pd.read_csv("Training Data.csv")
test_data = pd.read_csv("Test Data.csv")
full_data = pd.concat([train_data,test_data],axis=0)

# test_data = train_data[:50000].reset_index(drop=True)
# train_data = train_data[50000:].reset_index(drop=True)

In [5]:
# Process profession --- >

profess = train_data.profession.unique()
new_profess = []

for i in profess:
    i = i.replace("_", " ")
    new_profess.append(i)
    
for i, j in zip(profess,new_profess):
    train_data.profession = train_data.profession.replace(i,j)
    
profess = test_data.profession.unique()
new_profess = []

for i in profess:
    i = i.replace("_", " ")
    new_profess.append(i)
    
for i, j in zip(profess,new_profess):
    test_data.profession = test_data.profession.replace(i,j)
    
for j,i in enumerate(train_data.city.values):
    train_data.city.values[j] = i.split("[")[0]
    
for j,i in enumerate(test_data.city.values):
    test_data.city.values[j] = i.split("[")[0]

In [6]:
# Process state  --- >

state = train_data.state.unique()
new_state = []

for i in state:
    i = i.replace("_", " ")
    new_state.append(i)
    
for i, j in zip(state,new_state):
    train_data.state = train_data.state.replace(i,j)
    
state = test_data.state.unique()
new_state = []

for i in state:
    i = i.replace("_", " ")
    new_state.append(i)
    
for i, j in zip(state,new_state):
    test_data.state = test_data.state.replace(i,j)

train_data.state = train_data.state.replace("Uttar Pradesh[5]","Uttar Pradesh")
test_data.state = test_data.state.replace("Uttar Pradesh[5]","Uttar Pradesh")

# Check how train dataset looks like.

In [7]:
train_data

Unnamed: 0,Id,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag
0,1,1303835,23,3,single,rented,no,Mechanical engineer,Rewa,Madhya Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil servant,Tiruchirappalli,Tamil Nadu,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,251996,8154883,43,13,single,rented,no,Surgeon,Kolkata,West Bengal,6,11,0
251996,251997,2843572,26,10,single,rented,no,Army officer,Rewa,Madhya Pradesh,6,11,0
251997,251998,4522448,46,7,single,rented,no,Design Engineer,Kalyan-Dombivli,Maharashtra,7,12,0
251998,251999,6507128,45,0,single,rented,no,Graphic Designer,Pondicherry,Puducherry,0,10,0


# Convert datatype of selected fields.

In [8]:
profession_sort = 2 # 1 or 2

train_data.state = train_data.state.map(get_state_dict()).astype(np.int64)
train_data.city = train_data.city.map(get_cities_dict())
train_data.profession = train_data.profession.map(get_profess_dict(profession_sort))

test_data.state = test_data.state.map(get_state_dict())
test_data.city = test_data.city.map(get_cities_dict())
test_data.profession = test_data.profession.map(get_profess_dict(profession_sort)) 


train_data["married"]=pd.factorize(train_data.married)[0]
train_data["house_ownership"]=pd.factorize(train_data.house_ownership)[0]
train_data["car_ownership"]=pd.factorize(train_data.car_ownership)[0]

test_data["married"]=pd.factorize(test_data.married)[0]
test_data["house_ownership"]=pd.factorize(test_data.house_ownership)[0]
test_data["car_ownership"]=pd.factorize(test_data.car_ownership)[0]

# Drop the dependent variable from the train/test dataset. 

In [9]:
xtrain=train_data.drop("risk_flag",axis=1).drop("Id",axis=1)
ytrain=train_data["risk_flag"]
test_data=test_data.drop("id",axis=1)
# ytest=test_data["risk_flag"]

# Drop unnamed field from train and test dataset.

In [10]:
# xtrain_Unnamed  = xtrain.pop("Unnamed: 0")
# xtest_Unnamed  = xtest.pop("Unnamed: 0")
xtrain = np.array(xtrain)
# xtest  = np.array(xtest)

# Train your model

In [25]:
def train(model,xtrain,ytrain,xval,yval):
    model.fit(xtrain,ytrain)
    y_pred = model.predict_proba(xtrain)[:,1]
    auc = roc_auc_score(ytrain,y_pred)
    print('Train ROC AUC: %f' % auc)
    y_pred = model.predict_proba(xval)[:,1]
    auc = roc_auc_score(yval,y_pred)
    print('Val ROC AUC: %f' % auc)
    return model,y_pred

def get_pred(model,xtest):
    return model.predict(xtest)

In [30]:
folds = 5

y_test_oof = []
y_pred_oof = []

for train_index,test_index in skf(folds, random_state= 0, shuffle=True).split(xtrain,ytrain):
    print(f"Train size {len(train_index)} | Val size {len(test_index)}")
    print()
    X_train, X_test = xtrain[train_index], xtrain[test_index]
    y_train, y_test = ytrain[train_index], ytrain[test_index]
    
    DTClassifier = lr(max_iter = 1000,C =1e-5 ,random_state=0)
    DTClassifier,y_pred = train(DTClassifier,X_train,y_train,X_test,y_test)
    
    y_test_oof.extend(y_test)
    y_pred_oof.extend(y_pred)
    
    print()
    
oof = roc_auc_score(y_test_oof, y_pred_oof)
print('OOF ROC AUC: %f' % oof)

Train size 201600 | Val size 50400

Train ROC AUC: 0.503859
Val ROC AUC: 0.498123

Train size 201600 | Val size 50400

Train ROC AUC: 0.503363
Val ROC AUC: 0.500085

Train size 201600 | Val size 50400

Train ROC AUC: 0.501798
Val ROC AUC: 0.506359

Train size 201600 | Val size 50400

Train ROC AUC: 0.501785
Val ROC AUC: 0.506367

Train size 201600 | Val size 50400

Train ROC AUC: 0.502739
Val ROC AUC: 0.502607

OOF ROC AUC: 0.502701


In [29]:
get_pred(DTClassifier,test_data)

array([0, 0, 0, ..., 0, 0, 0])

In [37]:
cols = np.array(train_data.columns)[1:-1]
impt = DTClassifier.feature_importances_

df = pd.DataFrame([cols,impt]).T.rename({0:"Feature",1:"Importance"},axis=1)

In [35]:
df

Unnamed: 0,Feature,Importance
0,income,0.214767
1,age,0.138036
2,experience,0.070886
3,married,0.015081
4,house_ownership,0.006309
5,car_ownership,0.021425
6,profession,0.145212
7,city,0.176877
8,state,0.080567
9,current_job_years,0.064787


In [38]:
df

Unnamed: 0,Feature,Importance
0,income,0.228955
1,age,0.134147
2,experience,0.072921
3,married,0.012425
4,house_ownership,0.008608
5,car_ownership,0.017705
6,profession,0.144256
7,city,0.173796
8,state,0.084212
9,current_job_years,0.064295
