In [1]:
# IMPORT LIBRARIES
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# SET MAX COLS AND ROWS
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

# IMPORT DATA
cust_churn_df = pd.read_csv('data/cust-churn-data-full.csv', low_memory=False)

print("\nColumn names:", cust_churn_df.columns)
print("\nShape of dataset:", cust_churn_df.shape)
print("\nMissing values:", cust_churn_df.isnull().sum())


Column names: Index(['account_id', 'customer_id', 'tenure_months', 'num_referrals',
       'has_internet_service', 'internet_type', 'has_unlimited_data',
       'has_phone_service', 'has_multiple_lines', 'has_premium_tech_support',
       'has_online_security', 'has_online_backup', 'has_device_protection',
       'contract_type', 'paperless_billing', 'payment_method', 'account_id.1',
       'avg_long_distance_fee_monthly', 'total_long_distance_fee',
       'avg_gb_download_monthly', 'stream_tv', 'stream_movie', 'stream_music',
       'total_monthly_fee', 'total_charges_quarter', 'total_refunds',
       'customer_id.1', 'status', 'churn_label', 'churn_category',
       'churn_reason', 'area_id', 'zip_code', 'city', 'latitude', 'longitude',
       'population', 'customer_id.2', 'gender', 'age', 'senior_citizen',
       'married', 'num_dependents', 'zip_code.1'],
      dtype='object')

Shape of dataset: (7043, 44)

Missing values: account_id                          0
customer_id        

In [2]:
# FOR EDA - REMOVE DUPLICATED COLS (indicated by suffix) FROM JOINING SQL TABLES
dup_cols = [col for col in cust_churn_df.columns if '.' in col]
print(f"Duplicated columns: {dup_cols}")

eda_cust_churn_df = cust_churn_df.drop(['account_id.1', 'customer_id.1', 'customer_id.2', 'zip_code.1'], axis=1)

print("\nColumn names:", eda_cust_churn_df.columns)
print("\nShape of dataset:", eda_cust_churn_df.shape) #Shape of dataset: (7043, 40)
print("\nMissing values:", eda_cust_churn_df.isnull().sum())

Duplicated columns: ['account_id.1', 'customer_id.1', 'customer_id.2', 'zip_code.1']

Column names: Index(['account_id', 'customer_id', 'tenure_months', 'num_referrals',
       'has_internet_service', 'internet_type', 'has_unlimited_data',
       'has_phone_service', 'has_multiple_lines', 'has_premium_tech_support',
       'has_online_security', 'has_online_backup', 'has_device_protection',
       'contract_type', 'paperless_billing', 'payment_method',
       'avg_long_distance_fee_monthly', 'total_long_distance_fee',
       'avg_gb_download_monthly', 'stream_tv', 'stream_movie', 'stream_music',
       'total_monthly_fee', 'total_charges_quarter', 'total_refunds', 'status',
       'churn_label', 'churn_category', 'churn_reason', 'area_id', 'zip_code',
       'city', 'latitude', 'longitude', 'population', 'gender', 'age',
       'senior_citizen', 'married', 'num_dependents'],
      dtype='object')

Shape of dataset: (7043, 40)

Missing values: account_id                          0
custo

In [3]:
eda_cust_churn_df.head()
# eda_cust_churn_df.describe()

Unnamed: 0,account_id,customer_id,tenure_months,num_referrals,has_internet_service,internet_type,has_unlimited_data,has_phone_service,has_multiple_lines,has_premium_tech_support,has_online_security,has_online_backup,has_device_protection,contract_type,paperless_billing,payment_method,avg_long_distance_fee_monthly,total_long_distance_fee,avg_gb_download_monthly,stream_tv,stream_movie,stream_music,total_monthly_fee,total_charges_quarter,total_refunds,status,churn_label,churn_category,churn_reason,area_id,zip_code,city,latitude,longitude,population,gender,age,senior_citizen,married,num_dependents
0,BFIN-DLMOA,0013-EXCHZ,3,3,Yes,Fiber Optic,Yes,Yes,No,Yes,No,No,No,Month-to-Month,Yes,Credit Card,7.38,22.14,11,Yes,No,No,83.9,267.4,0.0,Churned,Yes,Dissatisfaction,Network reliability,607,93010,Camarillo,34.227846,-119.079903,42853,Female,75,Yes,Yes,0
1,AFEO-XOOCP,0014-BMAQU,63,8,Yes,Fiber Optic,No,Yes,Yes,Yes,Yes,No,No,Two Year,Yes,Credit Card,12.96,816.48,7,No,No,No,84.65,5377.8,0.0,Stayed,No,,,963,94558,Napa,38.489789,-122.27011,63947,Male,52,No,Yes,0
2,DEMQ-MFXWC,0016-QLJIS,65,3,Yes,Cable,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Credit Card,28.46,1849.9,14,Yes,Yes,Yes,90.45,5957.9,0.0,Stayed,No,,,1390,95681,Sheridan,38.984756,-121.345074,1219,Female,43,No,Yes,1
3,AIPP-VTDXJ,0019-EFAEP,72,0,Yes,Fiber Optic,Yes,Yes,Yes,No,Yes,Yes,Yes,Two Year,Yes,Bank Withdrawal,2.25,162.0,16,Yes,No,No,101.3,7261.25,0.0,Stayed,No,,,303,91942,La Mesa,32.782501,-117.01611,24005,Female,32,No,No,0
4,CJHA-SRKIB,0019-GFNTW,56,0,Yes,DSL,Yes,No,No,Yes,Yes,Yes,Yes,Two Year,No,Bank Withdrawal,0.0,0.0,19,No,No,No,45.05,2560.1,0.0,Stayed,No,,,716,93441,Los Olivos,34.70434,-120.02609,1317,Female,39,No,No,0


In [4]:
eda_cust_churn_df['status'].value_counts()

status
Stayed     4720
Churned    1869
Joined      454
Name: count, dtype: int64

In [5]:
# Relabeling Churn Label Status to binary form
eda_cust_churn_df['churn_label'] = eda_cust_churn_df['churn_label'].map({'Yes': 1, 'No': 0})
eda_cust_churn_df['churn_label'].value_counts()

churn_label
0.0    5174
1.0    1817
Name: count, dtype: int64

In [42]:
# Loop through all columns and print value counts for each
for column in eda_cust_churn_df.columns:
    print(f"Value counts for column '{column}':")
    print(eda_cust_churn_df[column].value_counts())
    print("\n" + "-"*40 + "\n")

Value counts for column 'account_id':
account_id
BFIN-DLMOA    1
NEWT-UTTDQ    1
GVEE-PDLRR    1
ZFLT-FYMEG    1
CSIJ-CHRDZ    1
             ..
DHEB-RFPIB    1
QHMM-FEKOR    1
UYYN-GDJUL    1
LGZP-LIBGE    1
PCTD-RXANG    1
Name: count, Length: 7043, dtype: int64

----------------------------------------

Value counts for column 'customer_id':
customer_id
0013-EXCHZ    1
3587-PMCOY    1
5780-INQIK    1
5707-ORNDZ    1
5288-AHOUP    1
             ..
8198-RKSZG    1
8050-WYBND    1
8033-ATFAS    1
7996-BPXHY    1
9992-UJOEL    1
Name: count, Length: 7043, dtype: int64

----------------------------------------

Value counts for column 'tenure_months':
tenure_months
1     613
72    362
2     238
3     200
4     176
71    170
5     133
7     131
10    127
8     123
9     119
70    119
12    117
6     110
13    109
68    100
15     99
11     99
67     98
18     97
69     95
24     94
22     90
66     89
35     88
17     87
23     85
16     80
52     80
56     80
64     80
25     79
26     

In [13]:
# 0. DATA CLEANING ---------------------------------------------------------------------------------------------------------
# Split 'account_id' into two parts and drop the original 'account_id' column
clean_cust_churn_df[['acc_id_p1', 'acc_id_p2']] = eda_cust_churn_df['account_id'].str.split('-', expand=True)
clean_cust_churn_df.drop('account_id', axis=1, inplace=True)

# Split 'customer_id' into two parts and drop the original 'customer_id' column
clean_cust_churn_df[['cust_id_p1', 'cust_id_p2']] = clean_cust_churn_df['customer_id'].str.split('-', expand=True)
clean_cust_churn_df.drop('customer_id', axis=1, inplace=True)

# Convert 'cust_id_p1' to integer to determine if there is a correlation between time of subscription with telecomm and churn label - predict loyalty level
clean_cust_churn_df['cust_id_p1'] = clean_cust_churn_df['cust_id_p1'].astype(int)

# Check the result
clean_cust_churn_df.head(10)

Unnamed: 0,tenure_months,num_referrals,has_internet_service,internet_type,has_unlimited_data,has_phone_service,has_multiple_lines,has_premium_tech_support,has_online_security,has_online_backup,has_device_protection,contract_type,paperless_billing,payment_method,avg_long_distance_fee_monthly,total_long_distance_fee,avg_gb_download_monthly,stream_tv,stream_movie,stream_music,total_monthly_fee,total_charges_quarter,total_refunds,status,churn_label,churn_category,churn_reason,area_id,zip_code,city,latitude,longitude,population,gender,age,senior_citizen,married,num_dependents,acc_id_p1,acc_id_p2,cust_id_p1,cust_id_p2
0,3,3,Yes,Fiber Optic,Yes,Yes,No,Yes,No,No,No,Month-to-Month,Yes,Credit Card,7.38,22.14,11,Yes,No,No,83.9,267.4,0.0,Churned,1.0,Dissatisfaction,Network reliability,607,93010,Camarillo,34.227846,-119.079903,42853,Female,75,Yes,Yes,0,BFIN,DLMOA,13,EXCHZ
1,63,8,Yes,Fiber Optic,No,Yes,Yes,Yes,Yes,No,No,Two Year,Yes,Credit Card,12.96,816.48,7,No,No,No,84.65,5377.8,0.0,Stayed,0.0,,,963,94558,Napa,38.489789,-122.27011,63947,Male,52,No,Yes,0,AFEO,XOOCP,14,BMAQU
2,65,3,Yes,Cable,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Credit Card,28.46,1849.9,14,Yes,Yes,Yes,90.45,5957.9,0.0,Stayed,0.0,,,1390,95681,Sheridan,38.984756,-121.345074,1219,Female,43,No,Yes,1,DEMQ,MFXWC,16,QLJIS
3,72,0,Yes,Fiber Optic,Yes,Yes,Yes,No,Yes,Yes,Yes,Two Year,Yes,Bank Withdrawal,2.25,162.0,16,Yes,No,No,101.3,7261.25,0.0,Stayed,0.0,,,303,91942,La Mesa,32.782501,-117.01611,24005,Female,32,No,No,0,AIPP,VTDXJ,19,EFAEP
4,56,0,Yes,DSL,Yes,No,No,Yes,Yes,Yes,Yes,Two Year,No,Bank Withdrawal,0.0,0.0,19,No,No,No,45.05,2560.1,0.0,Stayed,0.0,,,716,93441,Los Olivos,34.70434,-120.02609,1317,Female,39,No,No,0,CJHA,SRKIB,19,GFNTW
5,71,9,Yes,Fiber Optic,Yes,Yes,Yes,No,No,Yes,Yes,Two Year,Yes,Credit Card,27.26,1935.46,12,No,Yes,Yes,95.75,6849.4,0.0,Stayed,0.0,,,685,93286,Woodlake,36.464635,-119.094348,8870,Female,58,No,Yes,2,CDSG,JLJOW,20,INWCK
6,50,0,No,,No,Yes,Yes,No,No,No,No,One Year,No,Bank Withdrawal,31.43,1571.5,0,No,No,No,25.2,1306.3,0.0,Stayed,0.0,,,767,93601,Ahwahnee,37.375816,-119.739935,1968,Female,79,Yes,Yes,0,JEHS,UYRVX,23,UYUPN
7,23,1,Yes,Fiber Optic,Yes,Yes,Yes,No,No,No,No,Month-to-Month,Yes,Bank Withdrawal,34.91,802.93,30,Yes,No,No,83.75,1849.95,0.0,Stayed,0.0,,,632,93201,Alpaugh,35.869626,-119.498771,1054,Female,30,No,Yes,2,HBIR,OSCLJ,27,KWYKW
8,55,10,Yes,Fiber Optic,No,Yes,No,Yes,No,Yes,Yes,One Year,Yes,Bank Withdrawal,35.04,1927.2,57,Yes,Yes,Yes,103.7,5656.75,0.0,Stayed,0.0,,,803,93648,Parlier,36.622237,-119.521126,12587,Female,37,No,Yes,3,EDRJ,TSFHQ,36,IHMOT
9,37,1,Yes,Fiber Optic,Yes,Yes,No,No,No,No,No,One Year,No,Credit Card,43.01,1591.37,51,Yes,Yes,Yes,91.2,3247.55,0.0,Stayed,0.0,,,873,94038,Moss Beach,37.515556,-122.502311,3064,Male,23,No,Yes,3,ILYA,YMAZQ,48,LUMLS


In [49]:
# Convert both 'zip_code' and 'area_id' columns to string (more meaningful)
clean_cust_churn_df[['zip_code', 'area_id']] = clean_cust_churn_df[['zip_code', 'area_id']].astype(str)

In [None]:
# BASE MODEL ANALYSIS WITHOUT CLEANING OR FEATURE ENGINEERING