In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the training split and discard the customerID column in one step.
train = pd.read_csv("./data/training_data.csv").drop(columns="customerID")

In [3]:
# Load the validation split and preview its first rows.
# The preview shows an "Unnamed: 0" column next to customerID; it is still
# present in `val` after this cell.
val = pd.read_csv("./data/validation_data.csv")
val.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0420-HLGXF,Female,1,No,No,39,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.75,4036.0,No
1,5193-QLVZB,Male,0,No,No,63,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),104.75,6536.5,No
2,5598-IKHQQ,Female,0,No,No,72,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),25.45,1866.45,No
3,8749-CLJXC,Male,0,No,No,1,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.05,20.05,No
4,2252-NKNSI,Male,0,No,Yes,52,Yes,Yes,DSL,Yes,...,No,Yes,Yes,Yes,Two year,Yes,Mailed check,85.15,4461.85,No


In [4]:
# Quick check on a single column: which labels does an encoder learn for 'gender'?
le_t = LabelEncoder().fit(train["gender"])
print(le_t.classes_)

['Female' 'Male']


In [5]:

# Columns (the target 'Churn' included) that are converted to integer codes
# before the model is fitted.
categorical_columns = [ 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'Churn']

# One fitted LabelEncoder per column, keyed by column name, so the same
# mapping can be re-applied to other data and pickled later in the notebook.
col_mapper = {}
for col in categorical_columns:
    le = LabelEncoder()
    # Plain column assignment replaces the column together with its dtype.
    # `train.loc[:, col] = ...` writes into the existing column in place on
    # pandas >= 2.0, which leaves the integer codes stored with dtype object.
    train[col] = le.fit_transform(train[col])
    col_mapper[col] = le
    

In [6]:
# Cells holding a single space become the string "0" (the next cell parses
# TotalCharges as numbers).
train = train.replace(to_replace=" ", value="0")

In [7]:
# Parse TotalCharges as numbers. Assigning through `train[...]` replaces the
# column and its dtype; `train.loc[:, ...] = ...` sets values into the
# existing object column in place on pandas >= 2.0, so the numeric dtype
# would be lost.
train["TotalCharges"] = pd.to_numeric(train["TotalCharges"])

In [8]:
# Show dtype and non-null count of TotalCharges after numeric parsing.
train["TotalCharges"].pipe(pd.to_numeric).info()

<class 'pandas.core.series.Series'>
RangeIndex: 5282 entries, 0 to 5281
Series name: TotalCharges
Non-Null Count  Dtype  
--------------  -----  
5282 non-null   float64
dtypes: float64(1)
memory usage: 41.4 KB


In [9]:
# Print the column dtypes and non-null counts of the encoded training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5282 non-null   int64  
 1   SeniorCitizen     5282 non-null   int64  
 2   Partner           5282 non-null   int64  
 3   Dependents        5282 non-null   int64  
 4   tenure            5282 non-null   int64  
 5   PhoneService      5282 non-null   int64  
 6   MultipleLines     5282 non-null   int64  
 7   InternetService   5282 non-null   int64  
 8   OnlineSecurity    5282 non-null   int64  
 9   OnlineBackup      5282 non-null   int64  
 10  DeviceProtection  5282 non-null   int64  
 11  TechSupport       5282 non-null   int64  
 12  StreamingTV       5282 non-null   int64  
 13  StreamingMovies   5282 non-null   int64  
 14  Contract          5282 non-null   int64  
 15  PaperlessBilling  5282 non-null   int64  
 16  PaymentMethod     5282 non-null   int64  


In [10]:
# Save the encoded training frame to disk.
# NOTE(review): with default arguments `to_csv` also writes the index as a
# leading unnamed column, which comes back as "Unnamed: 0" on `read_csv`.
# Consider `index=False` if no consumer relies on that column -- confirm.
train.to_csv("./data/train_numeric.csv")

In [11]:
def pre_prcess_data(df, label_encoder_dict):
    """Return an encoded copy of ``df`` without the ``customerID`` column.

    Every column of ``df`` that has an entry in ``label_encoder_dict`` is
    replaced by the output of that entry's ``transform``; all other columns
    are passed through unchanged and the column order is preserved.

    The input frame is not modified, so the function can be called repeatedly
    on the same raw frame (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain a ``customerID`` column.
    label_encoder_dict : dict
        Maps column name -> fitted encoder exposing ``transform(values)``.
        Entries whose column is absent from ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        New frame with ``customerID`` removed and the mapped columns encoded.

    Raises
    ------
    KeyError
        If ``df`` has no ``customerID`` column.

    Any exception raised by an encoder's ``transform`` is propagated.
    """
    # `drop` without inplace returns a new frame, leaving the caller's intact.
    encoded = df.drop(columns="customerID")
    for col, column_le in label_encoder_dict.items():
        if col in encoded.columns:
            # Column assignment swaps in the new values with their own dtype;
            # `.loc[:, col] = ...` would write into the old object column in
            # place on pandas >= 2.0 and keep dtype object.
            encoded[col] = column_le.transform(encoded[col])
    return encoded

In [12]:
# Separate the target column from the feature columns of the training frame.
y_train = train["Churn"]
x_train = train.drop(columns="Churn")


In [13]:
# Fit the classifier on the encoded training split.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Encode the validation split with the encoders fitted on the training data.
# The result is bound to a new name instead of overwriting `val`.
val_encoded = pre_prcess_data(val, col_mapper)

# Select exactly the features the model was fitted on, in the same order.
# Dropping only "Churn" would keep extra columns such as the "Unnamed: 0"
# column of the validation file, which is not among the x_train features.
# A training feature missing from the validation data raises KeyError here.
x_val = val_encoded.loc[:, list(x_train.columns)]
y_val = val_encoded.loc[:, "Churn"]

In [18]:
# Permutation importance of every feature, computed on the training data.
# `random_state` fixes the shuffles so the reported means are reproducible.
# The values follow the column order of x_train.
model_fi = permutation_importance(model, x_train, y_train, random_state=0)
model_fi['importances_mean']   

array([-4.16508898e-04,  7.95153351e-04, -7.95153351e-04, -2.65051117e-04,
        1.31692541e-01,  2.76410451e-03,  4.54373343e-04,  1.62817115e-03,
        7.64861795e-03,  1.93108671e-03,  9.84475577e-04,  3.90003786e-03,
       -6.81560015e-04, -3.78644453e-05,  2.18477849e-02,  6.89132904e-03,
        1.13593336e-04,  3.11624385e-02,  3.93790231e-02])

In [19]:
# Feature names of x_train, for reading the importance array above by position.
x_train.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [18]:
# Predict on the validation features. The evaluation cell further down
# repeats this same call before computing the metrics.
predictions = model.predict(x_val)

In [26]:

# Coerce the validation target to a numeric dtype before it is passed to the
# metric functions in the next cell.
y_val = pd.to_numeric(y_val)

In [27]:


# Score the model on the validation split: overall accuracy plus the
# precision / recall / F-score / support arrays shown in the following cells.
predictions = model.predict(x_val)
accuracy = accuracy_score(y_val, predictions)
(precision,
 recall,
 fscore,
 support) = precision_recall_fscore_support(y_val, predictions)
print(f"Validation accuracy is: {round(accuracy, 3)}")

Validation accuracy is: 0.834


In [28]:
# Precision returned by precision_recall_fscore_support (one value per class).
precision

array([0.89036545, 0.62264151])

In [29]:
# Recall returned by precision_recall_fscore_support (one value per class).
recall

array([0.89932886, 0.6       ])

In [30]:
# F-score returned by precision_recall_fscore_support (one value per class).
fscore

array([0.89482471, 0.61111111])

In [31]:
# Support returned by precision_recall_fscore_support: validation rows per class.
support

array([596, 165], dtype=int64)

In [32]:
# Persist the fitted classifier. The `with` block closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open("churn_prediction_model.pkl", "wb") as pickler:
    pickle.dump(model, pickler)

In [33]:
# Persist the per-column label encoders. The `with` block closes the file
# even if pickling raises.
with open("churn_prediction_label_encoder.pkl", "wb") as pickler:
    pickle.dump(col_mapper, pickler)