# Logistic Classification & Feature Selection

In [1]:
# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

In [3]:
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
# For processing and maths
import numpy as np
import pandas as pd
import requests
import pickle
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [5]:
# Use our feature engineered file
df = pd.read_pickle('file_clean.pkl')

In [6]:
df.head(2)

Unnamed: 0,Customer.ID,Years.customer,Months.customer,Minutes.in.2018,Minutes.onnet,Minutes.offnet,Number.of.SMS,KBs.used,Total.Unique.Calls,Previous.provider,...,Customer.Age,Customer.Municipality,Latitude,Longitude,Province,LoginsSite.Last.Month,LoginsSite.Last.6Months,EndSubscription,Total.Call.centre.complaint.calls,MonthlySubscriptionPrice
0,ADF1259,3805,126.83,4091.616,1436.324929,2655.291071,81,3624.375,117,KPN,...,44,Utrecht,52.090737,5.12142,Utrecht,6,11,5,2,45.640142
1,ADF1192,2905,96.83,3179.28,1949.967368,1229.312632,101,551842.8331,106,Telfort,...,31,Hoogeveen,52.728616,6.4901,Drenthe,8,12,4,2,21.871447


### Variable to be dropped

In [7]:
df.drop(['Customer.ID'], axis=1, inplace=True)

In [8]:
df.drop(['Customer.Municipality'], axis=1, inplace=True)

In [9]:
df.shape

(1200, 19)

In [10]:
province_dummy=pd.get_dummies(df['Province'],prefix='Province').iloc[:,1:]

In [11]:
df=pd.concat([df,province_dummy],axis=1)

In [12]:
df.shape

(1200, 30)

In [13]:
df.drop(['Province'], axis=1, inplace=True)

In [14]:
#len(df['Customer.Municipality'].value_counts())

In [15]:
#municipality_dummy=pd.get_dummies(df['Customer.Municipality'],prefix='Customer.Municipality').iloc[:,1:]

In [16]:
#df=pd.concat([df,municipality_dummy],axis=1)

In [17]:
#df.shape

In [18]:
#df.drop(['Customer.Municipality'], axis=1, inplace=True)

In [19]:
len(df['Previous.provider'].value_counts())

7

In [20]:
provider_dummy=pd.get_dummies(df['Previous.provider'],prefix='Previous.provider').iloc[:,1:]

In [21]:
df=pd.concat([df,provider_dummy],axis=1)

In [22]:
df.drop(['Previous.provider'], axis=1, inplace=True)

In [23]:
df.shape

(1200, 34)

**Missing value**

In [24]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(df)



Imputer(axis=0, copy=True, missing_values='NaN', strategy='most_frequent',
    verbose=0)

In [25]:
# Checking rows now
def summary_missing(dataset):
    n_miss = dataset.isnull().sum()
    n_obs = dataset.shape[0]
    n_miss_per = n_miss/n_obs*100
    n_miss_tbl = pd.concat([n_miss, n_miss_per], axis = 1).sort_values(1, ascending = False).round(1)
    n_miss_tbl = n_miss_tbl[n_miss_tbl[1] != 0]
    print('No. of fields: ', dataset.shape[0])
    print('No. of missing fields: ', n_miss_tbl.shape[0])
    n_miss_tbl = n_miss_tbl.rename(columns = {0:'No. of mising Value', 1:'%age of missing Value'})
    return n_miss_tbl

In [26]:
summary_missing(df)

No. of fields:  1200
No. of missing fields:  0


Unnamed: 0,No. of mising Value,%age of missing Value


**Separate features**

In [27]:
df_tunning=df.copy()

In [28]:
df_labels=df['Churn.Status']

In [29]:
df_model = df.drop(['Churn.Status'], axis=1)

**Transform**

In [30]:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(df_model) # Since we have separated our label i.e churn feature.So, all other values are in X

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [31]:
X.shape

(1200, 33)

In [32]:
type(X)

numpy.ndarray

**Train-test Split**

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, df_labels, test_size=0.1660, random_state=0)

In [34]:
print(X_train.shape)
print(X_test.shape)

(1000, 33)
(200, 33)


In [35]:
print(y_train.shape)
print(y_test.shape)

(1000,)
(200,)


In [36]:
X_train

array([[-0.25824096, -0.24435615,  0.09660258, ...,  1.78768123,
        -0.33178797, -0.41733475],
       [-1.05125281, -1.02015563, -0.13554917, ...,  1.78768123,
        -0.33178797, -0.41733475],
       [-0.22066488, -0.2075884 ,  1.7133967 , ..., -0.55938384,
        -0.33178797,  2.39615797],
       ...,
       [-1.01759091, -0.98729445,  0.06936202, ..., -0.55938384,
        -0.33178797, -0.41733475],
       [-0.08210308, -0.07200734,  0.15002365, ..., -0.55938384,
         3.01397306, -0.41733475],
       [-0.85084705, -0.82413757,  0.26214971, ..., -0.55938384,
        -0.33178797,  2.39615797]])

**Logistic model**

In [37]:
import statsmodels.api as sm

In [39]:
logit= sm.Logit(y_train,sm.add_constant(X_train)).fit()

         Current function value: 0.537481
         Iterations: 35




In [40]:
logit.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.225
Dependent Variable:,Churn.Status,AIC:,1142.962
Date:,2019-06-18 12:42,BIC:,1309.8257
No. Observations:,1000,Log-Likelihood:,-537.48
Df Model:,33,LL-Null:,-693.15
Df Residuals:,966,LLR p-value:,5.0594e-47
Converged:,0.0000,Scale:,1.0
No. Iterations:,35.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,0.0038,0.2598,0.0145,0.9885,-0.5054,0.5129
x1,4.4006,28.9173,0.1522,0.8790,-52.2763,61.0775
x2,-4.7993,29.5531,-0.1624,0.8710,-62.7222,53.1236
x3,0.1244,5875358.0182,0.0000,1.0000,-11515489.9875,11515490.2363
x4,-0.4973,2501724.1467,-0.0000,1.0000,-4903289.7241,4903288.7295
x5,0.5511,4244340.6211,0.0000,1.0000,-8318754.2045,8318755.3066
x6,0.2598,0.1245,2.0872,0.0369,0.0158,0.5037
x7,-0.5603,0.1374,-4.0778,0.0000,-0.8296,-0.2910
x8,-0.5065,0.1190,-4.2575,0.0000,-0.7396,-0.2733


In [41]:
df_model.columns

Index(['Years.customer', 'Months.customer', 'Minutes.in.2018', 'Minutes.onnet',
       'Minutes.offnet', 'Number.of.SMS', 'KBs.used', 'Total.Unique.Calls',
       'Customer.Age', 'Latitude', 'Longitude', 'LoginsSite.Last.Month',
       'LoginsSite.Last.6Months', 'EndSubscription',
       'Total.Call.centre.complaint.calls', 'MonthlySubscriptionPrice',
       'Province_Flevoland', 'Province_Friesland', 'Province_Gelderland',
       'Province_Groningen', 'Province_Limburg', 'Province_Noord-Brabant',
       'Province_Noord-Holland', 'Province_Overijssel', 'Province_Utrecht',
       'Province_Zeeland', 'Province_Zuid-Holland', 'Previous.provider_Ben',
       'Previous.provider_KPN', 'Previous.provider_T-Mobile',
       'Previous.provider_TELE2', 'Previous.provider_Telfort',
       'Previous.provider_Vodafone'],
      dtype='object')

### Note

- X6,7,8,12,16,31,33 are significant variables. 

- They are 'Number.of.SMS', 'KBs.used','Total.Unique.Calls','LoginsSite.Last.Month', 'MonthlySubscriptionPrice',Previous.provider_TELE2,Previous.provider_Vodafone.

### VIF -Variance Inflation Factor

impact of collinearity among the variables in a regression model. The Variance Inflation Factor (VIF) is 1/Tolerance, it is always greater than or equal to 1. There is no formal VIF value for determining presence of multicollinearity. Values of VIF that **exceed 10 are often regarded as indicating multicollinearity, but in weaker models values above 2.5 may be a cause for concern**

In [42]:
# For each X, calculate VIF and save in dataframe
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X_train, i) for i in range(X.shape[1])]

In [43]:
vif["features"] = df_model.columns

In [44]:
vif

Unnamed: 0,VIF Factor,features
0,22.736534,Years.customer
1,22.692625,Months.customer
2,inf,Minutes.in.2018
3,inf,Minutes.onnet
4,inf,Minutes.offnet
5,1.479287,Number.of.SMS
6,1.130744,KBs.used
7,1.885022,Total.Unique.Calls
8,1.041419,Customer.Age
9,10.864666,Latitude


**Features that are in danger VIF value are years.customers, month.customer,Latitude, longitutde , Province_Zuid-Holland,Province_Noord-Holland,Province_Noord-Brabant,Province_Limburg,Province_Gelderland.**