In [135]:
# Import library
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [125]:
# Import dataset 
dataset = './data/telecom-customer-churn.csv'
popDataset = './data/telecom-zipcode-population.csv'
dataframe = pd.read_csv(dataset)
popDataframe = pd.read_csv(popDataset)
dataset = dataframe.head(10)
popDataset = popDataframe.head(10)

In [None]:
# Check null column
naRowCount = dataframe.isnull().sum()
popNaRowCount = popDataframe.isnull().sum()
print(f"NA row count of dataset : {naRowCount}")
print(f"NA row count of population dataset : {popNaRowCount}")

In [127]:
# Clean churn category and churn reason field by filling NaN values
dataframe['Churn Category'].fillna('Not provided', inplace=True)
dataframe['Churn Reason'].fillna('Not provided', inplace=True)

# Clean offer field by filling NaN values
dataframe['Offer'].fillna('None', inplace=True)

# Clean fields that depend on phone service subscription with a conditional replace
dataframe['Avg Monthly Long Distance Charges'] = dataframe.apply(lambda row: 0 if row['Phone Service'] == 'No' else row['Avg Monthly Long Distance Charges'], axis=1)
dataframe['Multiple Lines'] = dataframe.apply(lambda row: 'No' if row['Phone Service'] == 'No' else row['Multiple Lines'],axis=1)

# Clean fields that depend on internet service subscription with a conditional replace
internetServiceCols = ['Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data']
for col in internetServiceCols:
    dataframe[col] = dataframe.apply(lambda row: 'No' if row['Internet Service'] == 'No' else row[col], axis=1)
dataframe['Avg Monthly GB Download'] = dataframe.apply(lambda row: 0 if row['Internet Service'] == 'No' else row['Avg Monthly GB Download'], axis=1)

In [None]:
# Left join datataset and population dataset 
fullDataframe = pd.merge(dataframe, popDataframe, on = 'Zip Code', how = 'left')
fullDataframe = fullDataframe.drop(columns = ['Customer ID']) # Typically not use as a feature in ml model

# Check missing value
fullDataframe.isna().sum()

In [None]:
'''
# Note
# Split unseen dataset
dataframeRow = fullDataframe.shape[0]
unseenProportion = 0.2
unseenSize = int(dataframeRow * unseenProportion)
unseenDataset = fullDataframe.sample(n = unseenSize, random_state = 42)
fullDataframe = fullDataframe.drop(unseenDataset.index)

# Split feature and target of unseen dataset
unseenFeature = unseenDataset.drop(columns = ['Customer Status'])
unseenTarget = unseenDataset['Customer Status']
'''

In [130]:
# Use train test split
# Defind feature and target
feature = fullDataframe.drop(columns = ['Customer Status'])
target = fullDataframe['Customer Status'] 
xFeature, yFeature, xTarget, yTarget = train_test_split(feature, target, test_size=0.3, random_state=42)

In [131]:
# Data preprocessing
# Encode categorical features
categoricalField = ['Gender', 'Married', 'City', 'Offer', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan',
'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing', 'Payment Method', 'Churn Category', 'Churn Reason']

encoder = LabelEncoder()
xFeature[categoricalField] = xFeature[categoricalField].apply(encoder.fit_transform)
yFeature[categoricalField] = yFeature[categoricalField].apply(encoder.fit_transform)

# Scale numerical features
scaler = StandardScaler()
numericalField = ['Age', 'Number of Dependents', 'Zip Code', 'Latitude', 'Longitude', 'Number of Referrals', 'Tenure in Months', 'Avg Monthly Long Distance Charges', 'Avg Monthly GB Download', 
'Monthly Charge', 'Total Charges', 'Total Refunds', 'Total Extra Data Charges', 'Total Long Distance Charges', 'Total Revenue', 'Population']
scaler.fit(xFeature[numericalField])
scaler.fit(yFeature[numericalField])
xFeature[numericalField] = scaler.transform(xFeature[numericalField])
yFeature[numericalField] = scaler.transform(yFeature[numericalField])

In [None]:
'''
# Note
# Train random forest model 
# K-fold cross validation
k = 5
kf = KFold(n_splits = k, shuffle = True, random_state = 42)
bestModel = None
bestScore = 0

for trainIndex, valIndex in kf.split(feature):
    featureTrain, featureVal = feature.iloc[trainIndex], feature.iloc[valIndex]
    targetTrain, targetVal = target.iloc[trainIndex], target.iloc[valIndex]
    
    rfModel = RandomForestClassifier()
    rfModel.fit(featureTrain, targetTrain)
    
    score = rfModel.score(featureVal, targetVal)
    if score > bestScore:
        bestScore = score
        bestModel = rfModel
print(f'Best score : {bestScore}')
'''

In [None]:
'''
# Note
# Prediction 
unseenDataEncoded = pd.get_dummies(unseenFeature)
unseenDataEncoded = unseenDataEncoded.reindex(columns=feature.columns, fill_value=0)
prediction = bestModel.predict(unseenDataEncoded)

# Scoring
accuracy = accuracy_score(unseenTarget, prediction)
precision = precision_score(unseenTarget, prediction, average='weighted')
recall = recall_score(unseenTarget, prediction, average='weighted')
f1 = f1_score(unseenTarget, prediction, average='weighted')
confMatrix = confusion_matrix(unseenTarget, prediction)
print(f'Accuracy : {accuracy}\n Precision : {precision} \n Recall : {recall} \n F1 score : {f1} \n Confusion matrix : {confMatrix}')
'''

In [137]:
# Train model with random forest
xFeatureEncoded = pd.get_dummies(xFeature)
xFeature = xFeatureEncoded.reindex(columns=feature.columns, fill_value=0)
rfModel = RandomForestClassifier()
rfModel = rfModel.fit(xFeature,xTarget) 

# Prediction 
rfPrediction = rfModel.predict(yFeature)
# Scoring
accuracy = accuracy_score(yTarget, rfPrediction)
precision = precision_score(yTarget, rfPrediction, average='weighted')
recall = recall_score(yTarget, rfPrediction, average='weighted')
f1 = f1_score(yTarget, rfPrediction, average='weighted')
confMatrix = confusion_matrix(yTarget, rfPrediction)
print(f'Accuracy : {accuracy}\n Precision : {precision} \n Recall : {recall} \n F1 score : {f1} \n Confusion matrix : {confMatrix}')

Accuracy : 1.0
 Precision : 1.0 
 Recall : 1.0 
 F1 score : 1.0 
 Confusion matrix : [[ 551    0    0]
 [   0  153    0]
 [   0    0 1409]]


In [139]:
# Train model with random forest
lrModel = LogisticRegression(solver='liblinear')
lrModel = lrModel.fit(xFeature,xTarget)

# Prediction 
lrPrediction = lrModel.predict(yFeature)
# Scoring
accuracy = accuracy_score(yTarget, lrPrediction)
precision = precision_score(yTarget, lrPrediction, average='weighted')
recall = recall_score(yTarget, lrPrediction, average='weighted')
f1 = f1_score(yTarget, lrPrediction, average='weighted')
confMatrix = confusion_matrix(yTarget, lrPrediction)
print(f'Accuracy : {accuracy}\n Precision : {precision} \n Recall : {recall} \n F1 score : {f1} \n Confusion matrix : {confMatrix}')

Accuracy : 0.9001419782300047
 Precision : 0.899063422653525 
 Recall : 0.9001419782300047 
 F1 score : 0.8961615133449948 
 Confusion matrix : [[ 430   21  100]
 [  15   90   48]
 [  15   12 1382]]
