
# Import Python libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.tree import DecisionTreeClassifier

# from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostRegressor

In [None]:
def get_state_dict():
    # This encoding is done based on per-capita-income-rating

    state_dict = { 33: "Andaman and Nicobar Islands",17 : "Andhra Pradesh",18 : "Arunachal Pradesh",28 : "Assam",
    32 : "Bihar",4 : "Chandigarh",25 : "Chhattisgarh",3 : "Delhi",1 : "Goa",10 : "Gujarat",5 : "Haryana",
    14 : "Himachal Pradesh",23 : "Jammu and Kashmir",30 : "Jharkhand",6 : "Karnataka",8 : "Kerala",26 : "Madhya Pradesh",
    12 : "Maharashtra",29 : "Manipur",27 : "Meghalaya",15 : "Mizoram",19 : "Nagaland",22 : "Odisha",7 : "Puducherry",
    16 : "Punjab",21 : "Rajasthan",2 : "Sikkim",13 : "Tamil Nadu",9 : "Telangana",20 : "Tripura",31 : "Uttar Pradesh",
    11 : "Uttarakhand",24 : "West Bengal" }
    
    state_dict = dict([(value, int(key)) for key, value in state_dict.items()])
    return state_dict


def get_cities_dict():
    # Ranking goes top to Bottom

    cities_list_ranked = ["Bengaluru","Pune","Ahmedabad","Chennai","Surat","Navi Mumbai","Coimbatore","Vadodara","Indore",
          "Greater Mumbai","Thane","Kalyan Dombivali","New Delhi","Noida","Ludhiana","Visakhapatnam","Pimpri Chinchwad",
    "Solapur","Raipur","Bhopal","Rajkot","Jodhpur","Madurai","Jaipur","Hyderabad","Nagpur","Lucknow","Varanasi",
    "Kanpur","Chandigarh","Ghaziabad","Gwalior","Prayagraj","Patna","Aurangabad","Agra","Meerut","Hubli Dharwad",
    "Nashik","Vasai Virar","Faridabad","Vijayawada","Ranchi","Jabalpur","Kota","Amritsar","Guwahati","Barielly",
    "Dhanbad","Srinagar" ]

    # The cities which are not present will be added at last, and one feature would be if they are present or not

    bool_city = list(np.ones((len(cities_list_ranked)),dtype = np.int64))
    
    for i in full_data.city.values:
        i = i.split("[")[0]
        if i not in cities_list_ranked:
            bool_city.append(0)
            cities_list_ranked.append(i)

    # Create a dictionary for Mapping

    cities_dict = {}
    for i,city in enumerate(cities_list_ranked):
        cities_dict[city] = i

    return cities_dict


In [None]:
def get_profess_dict(method):
    
    if method == 1 :
        # Based of Avg Salary
        
        profess_mean = []

        for i in full_data.profession.unique():
            inc = full_data[full_data.profession == i]["income"].mean()
            profess_mean.append([i,inc])

        profess_mean = np.array(profess_mean)[np.array(profess_mean)[:,1].argsort()]

        profess_dict = {}

        for i,prof in enumerate(profess_mean):
            profess_dict[prof[0]] = i
        
        return profess_dict
    
    elif method == 2 :
        # Based on risk flag

        profess_mean = []

        for i in full_data.profession.unique():
            inc = full_data[full_data.profession == i]["risk_flag"].sum()
            profess_mean.append([i,inc])

        profess_mean = np.array(profess_mean)[np.array(profess_mean)[:,1].argsort()]

        profess_dict = {}

        for i,prof in enumerate(profess_mean):
            profess_dict[prof[0]] = i

        return profess_dict

# Import Datasets

In [None]:
train_data = pd.read_csv("Training Data.csv")
test_data = pd.read_csv("Test Data.csv")
full_data = pd.concat([train_data,test_data],axis=0)

# test_data = train_data[:50000].reset_index(drop=True)
# train_data = train_data[50000:].reset_index(drop=True)

In [None]:
# Process profession --- >

profess = train_data.profession.unique()
new_profess = []

for i in profess:
    i = i.replace("_", " ")
    new_profess.append(i)
    
for i, j in zip(profess,new_profess):
    train_data.profession = train_data.profession.replace(i,j)
    
profess = test_data.profession.unique()
new_profess = []

for i in profess:
    i = i.replace("_", " ")
    new_profess.append(i)
    
for i, j in zip(profess,new_profess):
    test_data.profession = test_data.profession.replace(i,j)
    
for j,i in enumerate(train_data.city.values):
    train_data.city.values[j] = i.split("[")[0]
    
for j,i in enumerate(test_data.city.values):
    test_data.city.values[j] = i.split("[")[0]

In [None]:
# Process state  --- >

state = train_data.state.unique()
new_state = []

for i in state:
    i = i.replace("_", " ")
    new_state.append(i)
    
for i, j in zip(state,new_state):
    train_data.state = train_data.state.replace(i,j)
    
state = test_data.state.unique()
new_state = []

for i in state:
    i = i.replace("_", " ")
    new_state.append(i)
    
for i, j in zip(state,new_state):
    test_data.state = test_data.state.replace(i,j)

train_data.state = train_data.state.replace("Uttar Pradesh[5]","Uttar Pradesh")
test_data.state = test_data.state.replace("Uttar Pradesh[5]","Uttar Pradesh")

# Check how train dataset looks like.

In [None]:
train_data

# Convert datatype of selected fields.

In [None]:
profession_sort = 2 # 1 or 2

train_data.state = train_data.state.map(get_state_dict()).astype(np.int64)
train_data.city = train_data.city.map(get_cities_dict())
train_data.profession = train_data.profession.map(get_profess_dict(profession_sort))

test_data.state = test_data.state.map(get_state_dict())
test_data.city = test_data.city.map(get_cities_dict())
test_data.profession = test_data.profession.map(get_profess_dict(profession_sort)) 


train_data["married"]=pd.factorize(train_data.married)[0]
train_data["house_ownership"]=pd.factorize(train_data.house_ownership)[0]
train_data["car_ownership"]=pd.factorize(train_data.car_ownership)[0]

test_data["married"]=pd.factorize(test_data.married)[0]
test_data["house_ownership"]=pd.factorize(test_data.house_ownership)[0]
test_data["car_ownership"]=pd.factorize(test_data.car_ownership)[0]

# Drop the dependent variable from the train/test dataset. 

In [None]:
xtrain=train_data.drop("risk_flag",axis=1).drop("Id",axis=1)
ytrain=train_data["risk_flag"]
test_data=test_data.drop("id",axis=1)
# ytest=test_data["risk_flag"]

# Drop unnamed field from train and test dataset.

In [None]:
# xtrain_Unnamed  = xtrain.pop("Unnamed: 0")
# xtest_Unnamed  = xtest.pop("Unnamed: 0")
xtrain = np.array(xtrain)
# xtest  = np.array(xtest)

# Train your model

In [None]:
def train(model,xtrain,ytrain,xval,yval):
    model.fit(xtrain,ytrain,eval_set=(xval, yval),verbose_eval= True,use_best_model=True)
    y_pred = model.predict(xtrain)
    auc = roc_auc_score(ytrain, y_pred)
    print('Train ROC AUC: %f' % auc)
    y_pred = model.predict(xval)
    auc = roc_auc_score(yval, y_pred)
    print('Val ROC AUC: %f' % auc)
    return model,y_pred,auc

def get_pred(model,xtest):
    return model.predict(xtest)

In [None]:
folds = 5

y_test_oof = []
y_pred_oof = []

all_preds = {}

for itr,(train_index,test_index) in enumerate(skf(folds, random_state= 0, shuffle=True).split(xtrain,ytrain)):
    print(f"Train size {len(train_index)} | Val size {len(test_index)}")
    print()
    X_train, X_test = xtrain[train_index], xtrain[test_index]
    y_train, y_test = ytrain[train_index], ytrain[test_index]
    
    model = CatBoostRegressor(iterations=3000, depth=10, learning_rate=0.1 ,l2_leaf_reg=3, loss_function='RMSE',verbose=False)
    model,y_pred,auc = train(model,X_train,y_train,X_test,y_test)
    
    print(f"Doing Prediction for auc {auc}")
    all_preds[str(auc) + str(itr)] = get_pred(model,test_data)
    
    y_test_oof.extend(y_test)
    y_pred_oof.extend(y_pred)
    
    print()
    
oof = roc_auc_score(y_test_oof, y_pred_oof)
print('OOF ROC AUC: %f' % oof)

In [None]:
all_preds

In [None]:
"""
With Classifier :
OOF ROC AUC: 0.843004 : (iterations=15, depth=3, learning_rate=0.5, loss_function='CrossEntropy',verbose=False)
OOF ROC AUC: 0.874740 : (iterations=14, depth=3, learning_rate=0.5, loss_function='Logloss',verbose=False)

With Regressor :

OOF ROC AUC: 0.890554 : (iterations=1000, depth=5, learning_rate=1.0, loss_function='RMSE',verbose=False)
OOF ROC AUC: 0.930738 : (iterations=3000, depth=10, learning_rate=0.1 ,l2_leaf_reg=3, loss_function='RMSE',verbose=False)




"""

In [None]:
# RMSE, MultiRMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom objective object

In [None]:
get_pred(DTClassifier,test_data)

In [None]:
cols = np.array(train_data.columns)[1:-1]
impt = DTClassifier.feature_importances_

df = pd.DataFrame([cols,impt]).T.rename({0:"Feature",1:"Importance"},axis=1)

In [74]:
def rank_data(sub):
    sub = pd.DataFrame(base)[0]
    sub = sub.rank() / sub.rank().max()
    return sub

In [75]:
# grid = {'learning_rate': [0.03, 0.1,1.0],
#     'depth': [4, 6, 10],
#     'l2_leaf_reg': [1, 3, 5, 7, 9]}

# model = CatBoostRegressor(iterations=2000,loss_function='RMSE',verbose=False)
# grid_search_result = model.grid_search(grid, 
#                                    X=xtrain, 
#                                    y=ytrain, 
#                                    stratified = True,
#                                    cv = 5,
#                                    plot=True)

In [80]:
sample_sub = pd.read_csv("Sample Prediction Dataset.csv")
base = np.zeros(len(test_data))
for val in all_preds.values():
    base += np.array(val)

base/= folds
sample_sub

Unnamed: 0,id,risk_flag
0,1,0
1,2,0
2,3,1
3,4,0
4,5,0
...,...,...
27995,27996,0
27996,27997,1
27997,27998,0
27998,27999,0


In [83]:
sample_sub['risk_flag'] = (rank_data(base) >= 0.5)*1

In [84]:
sample_sub.to_csv("ranked.csv",index=False)

In [85]:
sample_sub

Unnamed: 0,id,risk_flag
0,1,0
1,2,1
2,3,0
3,4,1
4,5,0
...,...,...
27995,27996,0
27996,27997,0
27997,27998,0
27998,27999,1


In [66]:
rank_data(base)

0        0.418935
1        0.507519
2        0.118999
3        0.744356
4        0.149039
           ...   
27995    0.334959
27996    0.448225
27997    0.427615
27998    0.796721
27999    0.832351
Name: 0, Length: 28000, dtype: float64

0        0.007795
1        0.013241
2       -0.001260
3        0.164691
4       -0.000410
           ...   
27995    0.004356
27996    0.009124
27997    0.008128
27998    0.237952
27999    0.329370
Name: 0, Length: 28000, dtype: float64