# Importing Libraries and dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from IPython.display import display

import warnings

warnings.filterwarnings('ignore')

sys.path.append('..')

In [2]:
from scripts import dataframe as dfr
from scripts import matrix as mx
from scripts import regression as rgr

# Downloading the data

In [3]:
# URL of the raw Telco customer churn CSV.
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [4]:
# One-off download step, left commented out: uncomment to save the CSV as
# churn_data.csv, the file name that the pd.read_csv cell below reads.
# !wget $data -O churn_data.csv

# Reading and refining the data

In [5]:
# Prefer the local copy. The wget cell above is commented out, so on a fresh
# checkout the file may not exist; in that case read straight from the URL in
# `data` so that Restart & Run All works without a manual download step.
try:
    df = pd.read_csv('churn_data.csv')
except FileNotFoundError:
    df = pd.read_csv(data)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Normalise the frame with the project helper. Judging by the preview below it
# lower-cases column names and string values and replaces spaces with
# underscores — NOTE(review): confirm against scripts/dataframe.py.
# The cell rebinds `df`, so the raw frame is no longer reachable afterwards.
df =  dfr.data_frame_refining(df)
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [7]:
# Transposed preview: one row per column, easier to scan for a wide frame
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


# Changing object type column to numeric

In [8]:
# object dtype here means the charges were parsed as strings, not numbers
df['totalcharges'].dtype

dtype('O')

In [9]:
# errors='coerce' turns entries that cannot be parsed into NaN instead of raising
tc = pd.to_numeric(df['totalcharges'], errors='coerce')

# Inspecting the rows that fail numeric conversion and filling them with 0

In [10]:
df[tc.isna()][['customerid','totalcharges']]

In [None]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

In [None]:
df.isna().sum()

# Converting the target variable to integer

In [None]:
df.churn.head()

In [None]:
df.churn = (df.churn == 'yes').astype(int)

In [None]:
df.churn.dtype

# Splitting the data into train, validation and test sets

In [None]:
df_full_train, df_test = train_test_split(df,test_size=0.2,random_state=1)

In [None]:
df_train, df_val = train_test_split(df_full_train,test_size=0.25,random_state=1)

In [None]:
len(df_train), len(df_val), len(df_test)

In [None]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [None]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [None]:
del df_train['churn']
del df_val['churn']
del df_test['churn']

# Calculating overall churn rate

In [None]:
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
df_full_train.isna().sum()

In [None]:
df_full_train.churn.value_counts(normalize = True)

In [None]:
churn_rate = round((df_full_train.churn.mean()),2)
churn_rate

# Listing the categorical columns

In [None]:
cat_columns = list(df.dtypes[df.dtypes == 'object'].index)
cat_columns.remove('customerid')
cat_columns

In [None]:
df_full_train[cat_columns].nunique()

# Calculating risk factor and churn rate for each column

In [None]:
churn_rate_female = round((df_full_train[df_full_train.gender == 'female'].churn.mean()),2)
churn_rate_female

In [None]:
churn_rate_male = round((df_full_train[df_full_train.gender == 'male'].churn.mean()),2)
churn_rate_male

In [None]:
global_churn = round((df_full_train.churn.mean()),2)
global_churn

In [None]:
df_full_train.partner.value_counts()

In [None]:
churn_rate_partner_yes = round((df_full_train[df_full_train.partner == 'yes'].churn.mean()),2)
churn_rate_partner_yes

In [None]:
churn_rate_partner_no = round((df_full_train[df_full_train.partner == 'no'].churn.mean()),2)
churn_rate_partner_no

In [None]:
churn_rate_female/global_churn

In [None]:
churn_rate_male/global_churn

In [None]:
churn_rate_partner_yes/global_churn

In [None]:
churn_rate_partner_no/global_churn

In [None]:
dfr.display_risk_factor(df_full_train,cat_columns,global_churn)

# Calculating feature importance using mutual info score for the categorical columns

In [None]:
mutual_info_score(df_full_train.contract,df_full_train.churn)

In [None]:
mi = dfr.calculate_mut_score(df_full_train,cat_columns,True)
print(mi)

# Calculating feature importance using correlation for the numerical columns

In [None]:
df.tenure.max()

In [None]:
df.dtypes

In [None]:
numerical_cols = ['tenure','monthlycharges','totalcharges']
numerical_cols

In [None]:
df_full_train[numerical_cols].corrwith(df_full_train.churn).abs()

In [None]:
df_full_train[df_full_train['tenure'] <= 2].churn.mean()

In [None]:
df_full_train[(df_full_train['tenure'] >= 2) & (df_full_train['tenure'] < 12)].churn.mean()

In [None]:
df_full_train[df_full_train['tenure'] > 12].churn.mean()

# Adding the 'seniorcitizen' column to the categorical columns

In [None]:
# seniorcitizen is stored as 0/1 integers, so the dtype-based selection of
# categorical columns did not pick it up. The membership check keeps the cell
# idempotent: a re-run must not append a second entry, which would select the
# column twice when the feature frames are built.
if 'seniorcitizen' not in cat_columns:
    cat_columns = cat_columns + ['seniorcitizen']

# Using DictVectorizer to encode the categorical columns as numeric features

In [None]:
train_dicts = df_train[cat_columns + numerical_cols].to_dict(orient = 'records')
val_dicts = df_val[cat_columns + numerical_cols].to_dict(orient = 'records')
test_dicts = df_test[cat_columns + numerical_cols].to_dict(orient = 'records')

In [None]:
dv = DictVectorizer(sparse = False)

In [None]:
X_train = dv.fit_transform(train_dicts)

In [None]:
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

In [None]:
X_train.shape , X_val.shape , X_test.shape

# Examining sigmoid function and its graphs

In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of ``z``.

    ``z`` may be a scalar or a NumPy array; arrays are handled element-wise
    and the result has the same shape as the input.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
z = np.linspace(-5,5,50)
z

In [None]:
sig_z = sigmoid(z)
sig_z

In [None]:
plt.plot(z,sigmoid(z))

# Model definition and training

In [None]:
# Logistic regression with scikit-learn's default hyperparameters
model = LogisticRegression()

In [None]:
# Fit on the training split only; the validation split is used for evaluation below
model.fit(X_train,y_train)

# Finding out the bias and weights

In [None]:
# First row of the learned weight matrix (one weight per encoded feature)
model.coef_[0]

In [None]:
# Shape of the full weight matrix
model.coef_.shape

In [None]:
model.intercept_[0] # This is the bias term

# Predicting using the model

In [None]:
model.predict(X_train[:5])

In [None]:
model.predict_proba(X_train[:5])

In [None]:
y_pred = model.predict_proba(X_val)[:,1]

In [None]:
churn_dec = (y_pred >=0.5)
churn_dec.astype(int)

In [None]:
(y_val == churn_dec).mean()

In [None]:
df_pred = pd.DataFrame()
df_pred['prob'] = y_pred
df_pred['pred'] = churn_dec.astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.pred == df_pred.actual
df_pred

In [None]:
df_pred.correct.mean()

In [None]:
# Map each encoded feature name to its learned weight.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. tolist() keeps
# the dictionary keys as plain Python strings.
dict(zip(dv.get_feature_names_out().tolist(), model.coef_[0].round(4)))

# Training a smaller model on a subset of the features and finding its bias and weights

In [None]:
df_train_small = df_train[['contract','tenure','monthlycharges']].to_dict(orient = 'records')
df_val_small = df_val[['contract','tenure','monthlycharges']].to_dict(orient = 'records')

In [None]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(df_train_small)
dv_small.get_feature_names()

In [None]:
X_train_small = dv_small.transform(df_train_small)
X_train_small.shape

In [None]:
model_small = LogisticRegression()

In [None]:
model_small.fit(X_train_small,y_train)

In [None]:
w0 = model_small.intercept_[0]
w0

In [None]:
w = model_small.coef_[0].round(4)
w

In [None]:
dict(zip(dv_small.get_feature_names(),w))

# Training the final model on the train + validation set and evaluating it on the test set

In [None]:
dict_df_full = df_full_train[cat_columns+numerical_cols].to_dict(orient='records')

In [None]:
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dict_df_full)
y_full_train = df_full_train.churn

In [None]:
model_full = LogisticRegression().fit(X_full_train,y_full_train)

In [None]:
dict_df_test = df_test[cat_columns+numerical_cols].to_dict(orient='records')
X_test = dv.transform(dict_df_test)

In [None]:
y_predict = model.predict_proba(X_test)[:,1]

In [None]:
churn_decision = y_predict >= 0.5

In [None]:
(churn_decision == y_test ).mean()

In [None]:
model.predict_proba(dv.transform(dict_df_test[100]))[0,1]

In [None]:
y_test[100]