# Churn Prediction Project

### Data Preparation:
- Download Dataset, read it with pandas
- Look at the data
- Make column names and values look uniform
- Check if the columns are read correctly
- Check if the churn variable needs any preparation

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# URL of the raw Telco customer-churn CSV in the mlbookcamp-code GitHub repository.
filename = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"

In [3]:
import os

import wget

# The recorded output of this cell shows the download being saved as
# "... (5).csv": every re-run fetched the file again under a new numbered
# name. Download only when the expected local copy is missing, so re-running
# the cell is cheap and the local file name stays predictable.
local_csv = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
if os.path.exists(local_csv):
    data = local_csv
else:
    data = wget.download(filename, out=local_csv)
print(data)

  0% [                                                                            ]      0 / 977501  0% [                                                                            ]   8192 / 977501  1% [.                                                                           ]  16384 / 977501  2% [.                                                                           ]  24576 / 977501  3% [..                                                                          ]  32768 / 977501  4% [...                                                                         ]  40960 / 977501  5% [...                                                                         ]  49152 / 977501  5% [....                                                                        ]  57344 / 977501  6% [.....                                                                       ]  65536 / 977501  7% [.....                                                                       ]  73728 / 977501

 94% [.......................................................................     ] 925696 / 977501 95% [........................................................................    ] 933888 / 977501 96% [.........................................................................   ] 942080 / 977501 97% [.........................................................................   ] 950272 / 977501 98% [..........................................................................  ] 958464 / 977501 98% [........................................................................... ] 966656 / 977501 99% [........................................................................... ] 974848 / 977501100% [............................................................................] 977501 / 977501WA_Fn-UseC_-Telco-Customer-Churn (5).csv


In [4]:
# Read the file the download cell actually wrote: its path is stored in `data`.
# The recorded download output shows a numbered name ("... (5).csv"), so a
# hard-coded file name could silently load an older copy instead.
df = pd.read_csv(data)
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [5]:
# Column headers: lower-case, spaces replaced by underscores.
normalized_names = df.columns.str.lower().str.replace(' ', '_')
df.columns = normalized_names

# Columns stored with the 'object' dtype are treated as the string columns.
is_object_column = df.dtypes == 'object'
categorical_columns = list(df.dtypes[is_object_column].index)

# Apply the same lower-case / underscore normalisation to their values.
for column_name in categorical_columns:
    lowered_values = df[column_name].str.lower()
    df[column_name] = lowered_values.str.replace(' ', '_')

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [7]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [8]:
# errors='coerce' turns entries that cannot be parsed as numbers into NaN instead of raising,
# so `tc` can be used in the following cells to count and locate those entries.
tc = pd.to_numeric(df.totalcharges, errors='coerce')

In [9]:
tc.isnull().sum()

11

In [10]:
# Store totalcharges as numbers; unparseable entries become NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [11]:
# Replace the missing totalcharges values with 0.
df['totalcharges'] = df['totalcharges'].fillna(0)

In [11]:
# Rows whose totalcharges could not be parsed (NaN in `tc`), shown with their customer id.
df.loc[tc.isnull(), ['customerid', 'totalcharges']]

KeyboardInterrupt: 

In [None]:
# Encode the target as 1 ('yes') / 0 (anything else). The guard keeps the cell
# idempotent: once the column is numeric, comparing it with 'yes' again would
# turn every label into 0.
if not pd.api.types.is_numeric_dtype(df.churn):
    df.churn = (df.churn == 'yes').astype(int)

In [None]:
df.churn

**Setting up Validation Framework**

- perform the train/validation/test split with Scikit-Learn

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the split is reproducible.
df_fulltrain, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [None]:
# 25% of the remaining 80% equals 20% of the full data, giving a 60/20/20 train/validation/test split.
df_train, df_val = train_test_split(df_fulltrain, test_size=0.25, random_state=1)

In [None]:
len(df_fulltrain), len(df_train), len(df_test), len(df_val)

In [None]:
 len(df_train) + len(df_test) +len(df_val) , len(df)

In [None]:
# Discard the row labels inherited from the split so each frame is re-indexed from 0.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

In [None]:
# Pull the churn column out of each split as an array of target values.
y_train, y_val, y_test = (
    frame['churn'].values for frame in (df_train, df_val, df_test)
)

In [None]:
# The targets now live in y_train / y_val / y_test, so remove the churn
# column from the three feature frames.
for frame in (df_train, df_val, df_test):
    del frame['churn']

### EDA
- check missing values
- Look at the target variable (churn)
- Look at the numerical and categorical variables

In [None]:
df_fulltrain = df_fulltrain.reset_index(drop=True)

In [None]:
df_fulltrain

In [None]:
df_fulltrain.isnull().sum()

In [None]:
df_fulltrain.churn.value_counts()

In [None]:
df_fulltrain.churn.value_counts(normalize=True) #Churn rate is 0.269968 or 27% is the rate at which users churn

In [None]:
# For a 0/1 target the mean is the share of 1s, i.e. the overall churn rate.
global_churn_rate = df_fulltrain['churn'].mean()
round(global_churn_rate, 2)

In [None]:
# The mean of a binary (0/1) target is the number of 1s divided by the total number of values, i.e. the share of positive cases.

In [None]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [None]:
# Categorical feature columns: every column except customerid, the churn target
# and the three numerical columns listed in `numerical`.
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [None]:
df_fulltrain[categorical].nunique()

# Feature Importance: Churn rate and risk ratio
#### Feature importance analysis (part of EDA) - identifying which features affect our target variable
- Churn rate
- Risk ratio
- Mutual information

### Churn rate
- We look at the churn rate within each group, for example among males and among females

In [None]:
churn_female = df_fulltrain[df_fulltrain.gender == 'female'].churn.mean()
churn_female

In [None]:
churn_male = df_fulltrain[df_fulltrain.gender == 'male'].churn.mean()
churn_male

In [None]:
global_churn_rate  = df_fulltrain.churn.mean()
global_churn_rate

In [None]:
churn_partner = df_fulltrain[df_fulltrain.partner == 'yes'].churn.mean()
churn_partner

In [None]:
global_churn_rate - churn_partner

In [None]:
churn_no_partner = df_fulltrain[df_fulltrain.partner == 'no'].churn.mean()
churn_no_partner # churn rate among customers without a partner

In [None]:
global_churn_rate - churn_no_partner # a negative difference means the group's churn rate is above the global rate, i.e. the group is more likely to churn

### Risk Ratio

In [None]:
churn_no_partner / global_churn_rate # The risk ratio is > 1 and more likely to churn

In [None]:
churn_partner / global_churn_rate # Risk ratio is < 1 and less likely to churn

In [None]:
#   SELECT
#        gender,
#        AVG(churn),
#        AVG(churn) - global_churn_rate AS diff,
#        AVG(churn) / global_churn_rate AS risk
#   FROM
#        data
#   GROUP BY
#        gender 

In [None]:
# Coding the above for all variable
df_fulltrain.groupby('gender').churn.mean()

In [None]:
from IPython.display import display

In [None]:
for feature in categorical:
    print(feature)
    # Churn rate and group size per category value, extended with the two
    # comparisons against the overall rate: absolute difference and risk ratio.
    group_rates = df_fulltrain.groupby(feature).churn.agg(['mean', 'count'])
    df_group = group_rates.assign(
        diff=group_rates['mean'] - global_churn_rate,
        risk=group_rates['mean'] / global_churn_rate,
    )
    display(df_group)
    print('\n')

# Feature Importance: Mutual Information   
Mutual information is a concept from information theory: it tells us how much we can learn
about one variable if we know the value of another.

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
mutual_info_score(df_fulltrain.churn, df_fulltrain.contract)

In [None]:
mutual_info_score(df_fulltrain.gender, df_fulltrain.churn)

In [None]:
mutual_info_score(df_fulltrain.churn, df_fulltrain.partner)

In [None]:
def mutual_info_churn_score(series):
    """Return the mutual information score between ``series`` and ``df_fulltrain.churn``."""
    churn_labels = df_fulltrain.churn
    return mutual_info_score(series, churn_labels)

In [None]:
mi = df_fulltrain[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

# Feature Importance: Correlation. 
#### How about numerical columns?   
    - Correlation Coefficient

In [None]:
df_fulltrain.tenure.max()

In [None]:
df_fulltrain[numerical]

In [None]:
df_fulltrain[numerical].corrwith(df_fulltrain.churn)

In [None]:
df_fulltrain[df_fulltrain.tenure <= 2].churn.mean()

In [None]:
# Each comparison needs its own parentheses: `&` binds tighter than `<=`, so
# without them the mask is parsed as `((tenure > 2) & tenure) <= 12` rather
# than the intended 2 < tenure <= 12 filter.
df_fulltrain[(df_fulltrain.tenure > 2) & (df_fulltrain.tenure <= 12)].churn.mean()

In [None]:
df_fulltrain[df_fulltrain.tenure > 12].churn.mean()

In [None]:
df_fulltrain[df_fulltrain.monthlycharges <= 20].churn.mean()

In [None]:
# Each comparison needs its own parentheses: `&` binds tighter than `<=`, so
# without them the mask is parsed as `((monthlycharges > 20) & monthlycharges) <= 50`
# rather than the intended 20 < monthlycharges <= 50 filter.
df_fulltrain[(df_fulltrain.monthlycharges > 20) & (df_fulltrain.monthlycharges <= 50)].churn.mean()

In [None]:
df_fulltrain[df_fulltrain.monthlycharges > 50].churn.mean()

In [None]:
# .abs() drops the sign, so the numerical features can be compared by the strength
# of their correlation with churn regardless of its direction.
df_fulltrain[numerical].corrwith(df_fulltrain.churn).abs()

### One-hot encoding  
  
- Use Scikit-Learn to encode categorical features

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
df_train[['gender', 'contract']].iloc[:100]

In [None]:
dicts = df_train[['gender', 'contract']].iloc[:100].to_dict(orient='records')

In [None]:
dv = DictVectorizer(sparse=False)

In [None]:
dv.fit(dicts)

In [None]:
dv.get_feature_names_out()

In [None]:
dv.transform(dicts)

In [None]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [None]:
train_dicts[0]

In [None]:
dv = DictVectorizer(sparse=False)

In [None]:
dv.fit(train_dicts)

In [None]:
dv.transform(train_dicts)

In [None]:
dv.get_feature_names_out()

In [None]:
X_train = dv.fit_transform(train_dicts)

In [None]:
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [None]:
X_val = dv.transform(val_dicts)

### Logistic Regression  
- Binary Classification  
- Linear vs Logistic Regression

In [None]:
def sigmoid(z):
    """Logistic function: return 1 / (1 + exp(-z)) for a scalar or NumPy array ``z``."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
z = np.linspace(-7, 7, 51)
z

In [None]:
sigmoid(z)

In [None]:
plt.plot(z, sigmoid(z))

In [None]:
def linear_regression(xi):
    """Return the linear score ``w0 + sum_j xi[j] * w[j]`` for one feature vector.

    ``w0`` (bias term) and ``w`` (weights) are read from the notebook's global
    namespace, so both must be defined before this function is called.

    Parameters
    ----------
    xi : indexable sequence of feature values, indexed from 0 to len(w) - 1.
    """
    result = w0

    # The loop variable has to be `j`, the name used for indexing below;
    # iterating with a differently named variable raised a NameError.
    for j in range(len(w)):
        result = result + xi[j] * w[j]

    return result

In [None]:
def logistic_regression(xi):
    """Return ``sigmoid(w0 + sum_j xi[j] * w[j])`` for one feature vector.

    ``w0`` (bias term), ``w`` (weights) and ``sigmoid`` are read from the
    notebook's global namespace, so they must be defined before this function
    is called.

    Parameters
    ----------
    xi : indexable sequence of feature values, indexed from 0 to len(w) - 1.
    """
    score = w0

    # The loop variable has to be `j`, the name used for indexing below;
    # iterating with a differently named variable raised a NameError.
    for j in range(len(w)):
        score = score + xi[j] * w[j]

    # Pass the linear score through the sigmoid to get the final result.
    result = sigmoid(score)
    return result

### Train Logistic Regression with Scikit-Learn  
- Train a model with Scikit-Learn
- Apply it to the validation dataset
- Calculate the accuracy

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
model.intercept_[0]

In [None]:
model.coef_[0].round(2)

In [None]:
model.predict_proba(X_train)

In [None]:
model.predict_proba(X_train)[:, 1]

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
churn_decision = (y_pred >= 0.5)

In [None]:
df_val[churn_decision].customerid # validation customers whose predicted churn probability is >= 0.5;
# these are the ones who should receive promotional emails

In [None]:
# Manual implementation of the accuracy score: start from the 0/1 form of the decisions
churn_decision.astype(int)

In [None]:
(y_val == churn_decision).mean()

In [None]:
# One row per validation customer: predicted probability, the thresholded
# 0/1 decision and the actual label.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': churn_decision.astype(int),
    'actual': y_val,
})

In [None]:
df_pred['correct'] = df_pred.prediction == df_pred.actual
df_pred

In [None]:
df_pred.correct.mean()

### Model Interpretation  
- Look at the coefficients
- Train a smaller model with fewer features

In [None]:
dv.get_feature_names_out()

In [None]:
model.coef_[0].round(3)

In [None]:
a = [1, 2, 3, 4]
b = 'abdc'
dict(zip(a, b))

In [None]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

In [None]:
small = ['contract', 'tenure', 'monthlycharges']

In [None]:
df_train[small].iloc[:10].to_dict(orient='records')

In [None]:
dict_train_small = df_train[small].to_dict(orient='records')
dict_val_small = df_val[small].to_dict(orient='records')

In [None]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dict_train_small)

In [None]:
dv_small.get_feature_names_out()

In [None]:
X_train_small = dv_small.transform(dict_train_small)

In [None]:
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

In [None]:
w0 = model_small.intercept_[0]
w0

In [None]:
w = model_small.coef_[0]
w.round(3)

In [None]:
dict(zip(dv_small.get_feature_names_out(), w.round(3)))

In [None]:
#imaginary customer with monthly charges of 50, tenure 5 and monthly contract
sigmoid(-2.47 + 0.97 + 50 * 0.027 + 5 * (-0.036))

In [None]:
-2.47 + 0.97 + 50 * 0.027 + 5 * (-0.036)

In [None]:
sigmoid(_) # `_` holds the most recent cell output, so this only gives the intended value when run directly after the score cell above

### Using the model

In [None]:
dict_full_train = df_fulltrain[categorical + numerical].to_dict(orient='records')

In [None]:
# NOTE: this rebinds `dv`, which was previously fitted on df_train, and, unlike the
# earlier DictVectorizer(sparse=False) instances, relies on the default output format.
dv = DictVectorizer()
X_full_train = dv.fit_transform(dict_full_train)

In [None]:
y_full_train = df_fulltrain.churn.values

In [None]:
model = LogisticRegression()
model.fit(X_full_train, y_full_train)

In [None]:
dict_test = df_test[categorical + numerical].to_dict(orient='records')

In [None]:
X_test = dv.transform(dict_test)

In [None]:
y_pred = model.predict_proba(X_test)[:, 1]

In [None]:
churn_decision = (y_pred >= 0.5)

In [None]:
(churn_decision == y_test).mean()

In [None]:
customer10 = dict_test[-1] # despite the name, this picks the LAST test customer; use dict_test[10] for the customer at index 10
customer10

In [None]:
X_small = dv.transform([customer10])

In [None]:
model.predict_proba(X_small)[0, 1]

In [None]:
y_test[-1]