In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

#### Data reading

In [182]:
data_path = 'D:/My Works KG/Interview_preparation/Take_home_assessment/Data/Quality_Data/'

In [183]:
dfs = []

for file_name in os.listdir(data_path):
    if file_name.endswith('.csv') and '2019' not in file_name:
        file_path = os.path.join(data_path, file_name)
        
        df = pd.read_csv(file_path)
        
        df.columns = df.columns.str.lower().str.replace(' ', '_')
        
        df = df.rename(columns={'calldrop_category' : 'call_drop_category','inout_travelling' : 'in_out_travelling'})

        df['file_name'] = file_name

        dfs.append(df)


df_1 = pd.concat(dfs, ignore_index=True)



In [184]:
df_1.head()

Unnamed: 0,operator,in_out_travelling,network_type,rating,call_drop_category,latitude,longitude,state_name,file_name
0,Vodafone,Outdoor,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv
1,Vodafone,Travelling,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv
2,Vodafone,Outdoor,2G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv
3,Vodafone,Indoor,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv
4,Vodafone,Travelling,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv


In [185]:
def rename_file(input_filename):
    match = re.search(r'MyCall_Data_([a-zA-Z]+)_(\d{4})\.csv', input_filename)

    if match:
        month, year = match.groups()
        new_filename = f'{month}_{input_filename.split("_")[0]}_{year}.csv'
        
        return new_filename
    else:
        return input_filename

In [186]:
df_1['file_name'] = df_1['file_name'].apply(rename_file)

In [187]:
def extract_month_year(file_name):
    match = re.match(r'(\w+)_(\w+)_(\d{4}).csv', file_name)
    if match:
        month, _, year = match.groups()
        return f"{month}_{year}"
    else:
        return None

In [188]:
df_1['month_year'] = df_1['file_name'].apply(extract_month_year)

In [189]:
df_1['year'] = df_1['month_year'].apply(lambda x : x.split('_')[1])

In [190]:
df_1['month'] = df_1['month_year'].apply(lambda x : x.split('_')[0])

#### Data preparation

In [191]:
df_1.head()

Unnamed: 0,operator,in_out_travelling,network_type,rating,call_drop_category,latitude,longitude,state_name,file_name,month_year,year,month
0,Vodafone,Outdoor,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv,April_2021,2021,April
1,Vodafone,Travelling,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv,April_2021,2021,April
2,Vodafone,Outdoor,2G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv,April_2021,2021,April
3,Vodafone,Indoor,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv,April_2021,2021,April
4,Vodafone,Travelling,3G,5,Satisfactory,-1.0,-1.0,,April_MyCall_2021.csv,April_2021,2021,April


In [192]:
df_1['network_type'].unique()

array(['3G', '2G', 'Unknown', '4G', nan], dtype=object)

Impute network type with Unknown

In [193]:
df_1['network_type'] = df_1['network_type'].fillna('Unknown')

In [194]:
df_1['network_type'].unique()

array(['3G', '2G', 'Unknown', '4G'], dtype=object)

Remove columns not useful to modelling 

In [195]:
cols_to_remove = ['rating','state_name','file_name','month_year','year']

In [196]:
df_1 = df_1.drop(columns=cols_to_remove)

In [197]:
df_1.head()

Unnamed: 0,operator,in_out_travelling,network_type,call_drop_category,latitude,longitude,month
0,Vodafone,Outdoor,3G,Satisfactory,-1.0,-1.0,April
1,Vodafone,Travelling,3G,Satisfactory,-1.0,-1.0,April
2,Vodafone,Outdoor,2G,Satisfactory,-1.0,-1.0,April
3,Vodafone,Indoor,3G,Satisfactory,-1.0,-1.0,April
4,Vodafone,Travelling,3G,Satisfactory,-1.0,-1.0,April


Categorical columns handling

In [198]:
df_1['operator'].unique()

array(['Vodafone', 'Airtel', 'BSNL', 'RJio', 'Idea', 'MTNL', 'VI',
       'Other', 'Tata', 'Telenor'], dtype=object)

In [199]:
op_levels = df_1['operator'].value_counts()[:5].index.tolist()

In [200]:
df_1['operator'] = df_1['operator'].apply(lambda x : x if x in op_levels else 'Other')

In [201]:
df_1['operator'].value_counts()

RJio        67515
Airtel      56278
Idea        33637
Vodafone    33136
BSNL        17992
Other       10376
Name: operator, dtype: int64

In [202]:
cols_to_dummy = ['operator','in_out_travelling','network_type','month']

In [203]:
dummy_df = pd.get_dummies(df_1[cols_to_dummy])

In [204]:
dummy_df.head()

Unnamed: 0,operator_Airtel,operator_BSNL,operator_Idea,operator_Other,operator_RJio,operator_Vodafone,in_out_travelling_Indoor,in_out_travelling_Outdoor,in_out_travelling_Travelling,network_type_2G,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [205]:
df_1 = pd.concat([df_1, dummy_df],axis=1)

In [206]:
df_1.head()

Unnamed: 0,operator,in_out_travelling,network_type,call_drop_category,latitude,longitude,month,operator_Airtel,operator_BSNL,operator_Idea,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,Vodafone,Outdoor,3G,Satisfactory,-1.0,-1.0,April,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Vodafone,Travelling,3G,Satisfactory,-1.0,-1.0,April,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Vodafone,Outdoor,2G,Satisfactory,-1.0,-1.0,April,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Vodafone,Indoor,3G,Satisfactory,-1.0,-1.0,April,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Vodafone,Travelling,3G,Satisfactory,-1.0,-1.0,April,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [207]:
df_1 = df_1.drop(columns=cols_to_dummy)

Call drop category will be our target variable. We will assume satisfactory as one class and combine call drop and poor voice quality into another class.

In [208]:
df_1['call_drop_category'].unique()

array(['Satisfactory', 'Call Dropped', 'Poor Voice Quality'], dtype=object)

In [209]:
df_1['voice_call_quality'] = df_1['call_drop_category'].map(lambda x : 0 if x == 'Satisfactory' else 1)

In [210]:
df_1['voice_call_quality'].value_counts()

0    149512
1     69422
Name: voice_call_quality, dtype: int64

In [211]:
df_1['call_drop_category'].value_counts()

Satisfactory          149512
Poor Voice Quality     46129
Call Dropped           23293
Name: call_drop_category, dtype: int64

In [212]:
df_1 = df_1.drop(columns='call_drop_category')

In [242]:
df_1.head()

Unnamed: 0,latitude,longitude,operator_Airtel,operator_BSNL,operator_Idea,operator_Other,operator_RJio,operator_Vodafone,in_out_travelling_Indoor,in_out_travelling_Outdoor,...,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September,voice_call_quality
0,-1.0,-1.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,-1.0,-1.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.0,-1.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,-1.0,-1.0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,-1.0,-1.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Separate features and labels

In [213]:
X = df_1.drop(columns=['voice_call_quality'])

In [214]:
Y = df_1['voice_call_quality']

In [215]:
from sklearn.model_selection import train_test_split

In [227]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=355)

### Decision Tree classifier

In [231]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score

In [232]:
dt_cls = DecisionTreeClassifier(max_depth=3)
dt_cls.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=3)

#### Evaluating model performance

In [229]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [248]:
def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(x_train)))
    print("Train Confusion Matrix:")
    c_mat_train = confusion_matrix(y_train, dt_classifier.predict(x_train))
    print(c_mat_train)
    print('Train Recall')
    print(c_mat_train[1, 1] / (c_mat_train[1, 1] + c_mat_train[1, 0]))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(x_test)))
    print("Test Confusion Matrix:")
    c_mat_test = confusion_matrix(y_test, dt_classifier.predict(x_test))
    print(c_mat_test)
    print('Test Recall')
    print(c_mat_test[1, 1] / (c_mat_test[1, 1] + c_mat_test[1, 0]))
    

In [249]:
evaluate_model(dt_cls)

Train Accuracy : 0.6939313833522698
Train Confusion Matrix:
[[106526  12975]
 [ 40632  15014]]
Train Recall
0.26981274485138196
--------------------------------------------------
Test Accuracy : 0.6953205289241099
Test Confusion Matrix:
[[26754  3257]
 [10084  3692]]
Test Recall
0.2680023228803717


1. For our problem statement, Recall is really important as correctly identfying voice quality issues so that neccessary steps can be taken to improve those issues.
2. For the basic model, although accuracy is 69%  recall is only 26% we need to improve it.
3. As our model is very basic we can try to increase depth of the model so that it can learn more and improve recall.

In [264]:
dt_cls2 = DecisionTreeClassifier(max_depth=5)
dt_cls2.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=5)

In [265]:
evaluate_model(dt_cls2)

Train Accuracy : 0.8033309163160088
Train Confusion Matrix:
[[108299  11202]
 [ 23244  32402]]
Train Recall
0.5822880350788916
--------------------------------------------------
Test Accuracy : 0.8013108913604494
Test Confusion Matrix:
[[27103  2908]
 [ 5792  7984]]
Test Recall
0.5795586527293844


1. So recall of the model goes up to 58%. This is better than earlier model. Even our accuracy is also improves to 80%.
2. We will try to tune other parameters and check the metrics. 

In [276]:
dt_cls3 = DecisionTreeClassifier(max_depth=5,min_samples_leaf=200,max_features=5,min_samples_split=100)
dt_cls3.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=5, max_features=5, min_samples_leaf=200,
                       min_samples_split=100)

In [277]:
evaluate_model(dt_cls3)

Train Accuracy : 0.7942014422171091
Train Confusion Matrix:
[[108985  10516]
 [ 25529  30117]]
Train Recall
0.541224885885778
--------------------------------------------------
Test Accuracy : 0.792883732614703
Test Confusion Matrix:
[[27296  2715]
 [ 6354  7422]]
Test Recall
0.5387630662020906


1. With tuning min samples leaf, min samples split and max_features, accuracy and recall is going down with compare to previous model.
2. But our model is not overfitting so we will further try to increase its depth and allow it to learn in depth.

In [292]:
dt_cls4 = DecisionTreeClassifier(max_depth=7,max_features=5,min_samples_leaf=200,min_samples_split=100)
dt_cls4.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=7, max_features=5, min_samples_leaf=200,
                       min_samples_split=100)

In [293]:
evaluate_model(dt_cls4)

Train Accuracy : 0.7977641638166797
Train Confusion Matrix:
[[105978  13523]
 [ 21898  33748]]
Train Recall
0.6064766560040254
--------------------------------------------------
Test Accuracy : 0.7950304885011533
Test Confusion Matrix:
[[26498  3513]
 [ 5462  8314]]
Test Recall
0.603513356562137


So, with the depth increased, we have got recall touching 60%. We will calculate feature importance to further understand the model

In [304]:
feature_importance = pd.DataFrame(zip(X.columns, dt_cls4.feature_importances_))
feature_importance = feature_importance.rename(columns={0 : 'Features', 1 : 'Weight'})
feature_importance.sort_values(by='Weight',ascending=False,inplace=True)

In [305]:
feature_importance

Unnamed: 0,Features,Weight
17,month_December,0.041875
0,latitude,0.039239
1,longitude,0.031278
4,operator_Idea,0.021186
8,in_out_travelling_Indoor,0.00395
7,operator_Vodafone,0.002717
20,month_July,0.001172
9,in_out_travelling_Outdoor,0.000699
18,month_February,0.000622
2,operator_Airtel,0.00046


#### As we have enough data available and there is complex relations present between feature and target variable we will try Random Forest 

### Random Forest Classifier

In [306]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [382]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=355)

In [310]:
rf_cls = RandomForestClassifier(n_estimators=50,max_depth=5)
rf_cls.fit(x_train, y_train)

RandomForestClassifier(max_depth=5, n_estimators=50)

In [311]:
evaluate_model(rf_cls)

Train Accuracy : 0.7073658127173175
Train Confusion Matrix:
[[118580    921]
 [ 50333   5313]]
Train Recall
0.09547856090285016
--------------------------------------------------
Test Accuracy : 0.7092516043574577
Test Confusion Matrix:
[[29769   242]
 [12489  1287]]
Test Recall
0.09342334494773519


The basic random forest model is performing worst than decision tree as par as recall is concerned.
We will try to tweak the parameters to get recall increased.

In [None]:
rf_cls2 = RandomForestClassifier(n_estimators=25,max_depth=20, min_samples_leaf=5, min_samples_split=5,max_features=10)
rf_cls2.fit(x_train, y_train)

In [368]:
evaluate_model(rf_cls2)

Train Accuracy : 0.8503085979206039
Train Confusion Matrix:
[[112876   6625]
 [ 19593  36053]]
Train Recall
0.6478992200697264
--------------------------------------------------
Test Accuracy : 0.8375316874871537
Test Confusion Matrix:
[[28060  1951]
 [ 5163  8613]]
Test Recall
0.6252177700348432


The model uses default cut off of 0.5 But optimum cut off where desired sensitivity, specificity and other metrics meet, can be chosen as model cut off.

Custom function has been built to calculate metrics at various cut offs and it will return a table containing metrics for each cutoff and by which we can chose best cut off for our problem statement.

In [357]:
def get_accuracy_metrics(X_df, Y_df, cut_off, model):
    try:
        X_df['model_probability'] = model.predict_proba(X_df)[:,1]
        df = X_df.join(Y_df)

        df.reset_index(inplace=True)


        tn, fp, fn, tp, sn, sp, ac, mean_ypred = [], [], [], [], [], [], [], []

        for i in range(len(cut_off)):
            df['Model prediction'] = df['model_probability'].apply(lambda x: 1 if x > cut_off[i] else 0)
            results = confusion_matrix(df['voice_call_quality'], df['Model prediction'])
            mean_ypred.append(round(np.mean(df[df['voice_call_quality'] != 1].loc[df['Model prediction'] == 1, 'model_probability']) * 100, 2))
            ac.append(round(accuracy_score(df['voice_call_quality'], df['Model prediction']) * 100, 2))
            tn.append(results[0][0])
            try:
                fp.append(results[0][1])
            except:
                fp.append(0)
            fn.append(results[1][0])
            tp.append(results[1][1])
            sn.append(round(float(tp[i]) / float(tp[i] + fn[i]) * 100, 2))
            sp.append(round(float(tn[i]) / float(tn[i] + fp[i]) * 100, 2))
            fpr, tpr, thresholds = roc_curve(df['voice_call_quality'], X_df['model_probability'])
            roc_auc = auc(fpr, tpr)
            GINI = (2 * roc_auc) - 1

        metrics_df = pd.DataFrame(
            {'probability cut offs': cut_off, 'Accuracy': ac, 'Sensitivity': sn, 'Specificity': sp,
             'True positive predictions': tp, 'True negative predictions': tn, 'False positive predictions': fp,
             'False negative predictions': fn, 'mean predicted probability': mean_ypred, 'gini': GINI})
        metrics_df = metrics_df[['gini', 'probability cut offs', 'Accuracy', 'Sensitivity', 'Specificity',
                                 'True positive predictions', 'True negative predictions',
                                 'False positive predictions', 'False negative predictions',
                                 'mean predicted probability']]
        return df, metrics_df
    except Exception as e:
        raise Exception()


In [369]:
cut_off = np.linspace(0, 1, 10)
train_df, train_metrics_df = get_accuracy_metrics(x_train,y_train, cut_off,rf_cls2)
train_metrics_df

Unnamed: 0,gini,probability cut offs,Accuracy,Sensitivity,Specificity,True positive predictions,True negative predictions,False positive predictions,False negative predictions,mean predicted probability
0,0.846276,0.0,33.23,100.0,2.14,55646,2556,116945,0,17.1
1,0.846276,0.111111,66.26,98.7,51.16,54923,61131,58370,723,30.4
2,0.846276,0.222222,76.48,93.53,68.53,52044,81900,37601,3602,38.0
3,0.846276,0.333333,83.31,81.91,83.96,45578,100331,19170,10068,47.84
4,0.846276,0.444444,85.02,71.92,91.11,40023,108881,10620,15623,55.52
5,0.846276,0.555556,84.58,59.94,96.05,33354,114778,4723,22292,63.61
6,0.846276,0.666667,82.53,47.18,98.99,26255,118292,1209,29391,75.15
7,0.846276,0.777778,80.25,38.54,99.67,21445,119111,390,34201,83.14
8,0.846276,0.888889,77.13,28.09,99.96,15632,119451,50,40014,91.96
9,0.846276,1.0,68.23,0.0,100.0,0,119501,0,55646,


In [370]:
cut_off = np.linspace(0, 1, 10)
train_df, train_metrics_df = get_accuracy_metrics(x_test,y_test, cut_off,rf_cls2)
train_metrics_df

Unnamed: 0,gini,probability cut offs,Accuracy,Sensitivity,Specificity,True positive predictions,True negative predictions,False positive predictions,False negative predictions,mean predicted probability
0,0.806705,0.0,32.94,100.0,2.16,13776,648,29363,0,17.98
1,0.806705,0.111111,64.8,97.16,49.94,13385,14987,15024,391,31.46
2,0.806705,0.222222,74.48,91.54,66.65,12610,20003,10008,1166,38.88
3,0.806705,0.333333,81.55,79.75,82.38,10986,24723,5288,2790,48.8
4,0.806705,0.444444,83.45,69.24,89.97,9539,27000,3011,4237,56.71
5,0.806705,0.555556,83.5,57.88,95.26,7973,28587,1424,5803,65.19
6,0.806705,0.666667,81.56,44.65,98.5,6151,29561,450,7625,76.83
7,0.806705,0.777778,79.64,36.59,99.4,5040,29832,179,8736,84.96
8,0.806705,0.888889,76.87,26.79,99.86,3691,29969,42,10085,93.48
9,0.806705,1.0,68.54,0.0,100.0,0,30011,0,13776,


With this high depth we are beating the decision tree's metrics. Although max depth of 20 is high and model is slighly overfitting we will reduce depth and check gini for train and validation dataset.

Here we have used GINI coefficient to validate our models performace. This GINI coefficient is measure of impurity in the dataset.

In [None]:
rf_cls3 = RandomForestClassifier(n_estimators=100,max_depth=12, min_samples_leaf=5, min_samples_split=5,max_features=10)
rf_cls3.fit(x_train, y_train)

In [373]:
evaluate_model(rf_cls3)

Train Accuracy : 0.8118894414406185
Train Confusion Matrix:
[[114917   4584]
 [ 28363  27283]]
Train Recall
0.49029579844013943
--------------------------------------------------
Test Accuracy : 0.8085504830200745
Test Confusion Matrix:
[[28733  1278]
 [ 7105  6671]]
Test Recall
0.4842479674796748


In [383]:
cut_off = np.linspace(0, 1, 25)
train_df, train_metrics_df = get_accuracy_metrics(x_train,y_train, cut_off,rf_cls3)
train_metrics_df

Unnamed: 0,gini,probability cut offs,Accuracy,Sensitivity,Specificity,True positive predictions,True negative predictions,False positive predictions,False negative predictions,mean predicted probability
0,0.770587,0.0,31.77,100.0,0.0,55646,0,119501,0,21.45
1,0.770587,0.041667,40.14,99.97,12.28,55632,14673,104828,14,24.27
2,0.770587,0.083333,45.76,99.55,20.71,55397,24751,94750,249,26.21
3,0.770587,0.125,54.97,98.52,34.7,54823,41462,78039,823,29.55
4,0.770587,0.166667,60.8,97.02,43.94,53986,52508,66993,1660,32.05
5,0.770587,0.208333,65.74,95.12,52.06,52932,62213,57288,2714,34.3
6,0.770587,0.25,70.35,92.58,60.0,51516,71696,47805,4130,36.59
7,0.770587,0.291667,74.57,88.03,68.3,48983,81621,37880,6663,39.04
8,0.770587,0.333333,79.25,79.67,79.05,44334,94468,25033,11312,43.09
9,0.770587,0.375,81.4,71.76,85.89,39933,102643,16858,15713,46.81


In [384]:
cut_off = np.linspace(0, 1, 25)
test_df, test_metrics_df = get_accuracy_metrics(x_test,y_test, cut_off,rf_cls3)
test_metrics_df

Unnamed: 0,gini,probability cut offs,Accuracy,Sensitivity,Specificity,True positive predictions,True negative predictions,False positive predictions,False negative predictions,mean predicted probability
0,0.754266,0.0,31.46,100.0,0.0,13776,0,30011,0,21.72
1,0.754266,0.041667,39.85,99.92,12.28,13765,3684,26327,11,24.57
2,0.754266,0.083333,45.42,99.45,20.63,13700,6190,23821,76,26.52
3,0.754266,0.125,54.57,98.3,34.5,13542,10354,19657,234,29.9
4,0.754266,0.166667,60.42,96.87,43.68,13345,13109,16902,431,32.42
5,0.754266,0.208333,65.27,94.94,51.64,13079,15499,14512,697,34.67
6,0.754266,0.25,69.79,91.96,59.61,12668,17889,12122,1108,37.02
7,0.754266,0.291667,73.76,87.09,67.65,11998,20301,9710,1778,39.45
8,0.754266,0.333333,78.24,78.48,78.14,10811,23450,6561,2965,43.42
9,0.754266,0.375,80.59,70.83,85.07,9757,25531,4480,4019,47.16


 Although recall at default cut off is lesser than decision tree, we will go with this model for two below reasons.
1. Decision trees are not reliable model for complex relations and it can overfit on unseen data.
2. For the optimum cutoff which can be between 0.25 to 0.29, as sensitivity is quite high and specificity is also acceptable,we are good to go with.

#### We will select 0.29 as optimum cut off for which we have 87% sensitivity(recall) and 67% specificity.

In [386]:
feature_importance = pd.DataFrame(zip(X.columns, rf_cls3.feature_importances_))
feature_importance = feature_importance.rename(columns={0 : 'Features', 1 : 'Weight'})
feature_importance.sort_values(by='Weight',ascending=False,inplace=True)
feature_importance

Unnamed: 0,Features,Weight
0,latitude,0.318335
1,longitude,0.307467
4,operator_Idea,0.052397
24,month_November,0.036108
6,operator_RJio,0.027116
17,month_December,0.026323
2,operator_Airtel,0.025547
20,month_July,0.023046
7,operator_Vodafone,0.022943
9,in_out_travelling_Outdoor,0.017766


1. The features Lattitude and longitude have highest feature weight, which is self explanatory. As location really matters for Voice Call Quality.
For remote locations or even rural locations voice call quality may be worse than that of urban locations.
2. The operator IDEA, Airtel, Rjio and months November, december has got good feature weights. We have observed this while deeply analysing our raw data.

We will train our final selected model on entire dataset and save pickle of it for further use

In [387]:
rf_cls3.fit(X, Y)

RandomForestClassifier(max_depth=12, max_features=10, min_samples_leaf=5,
                       min_samples_split=5)

In [390]:
model_path = 'D:\My Works KG\Interview_preparation\Take_home_assessment\Model'

In [391]:
model_to_save = (rf_cls3, X.columns.tolist())

In [393]:
import pickle

In [395]:
pickle.dump(model_to_save, open(model_path + 'final_model.pkl', 'wb'))
