In [63]:
!pip install pandas
!pip install numpy
!pip install scikit-learn



In [64]:
import pandas as pd
import numpy as np
import sklearn as skl

In [65]:
df = pd.read_csv("telco_churn.csv")
df['Churn'].replace({'Yes': True, 'No': False}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Churn'].replace({'Yes': True, 'No': False}, inplace=True)
  df['Churn'].replace({'Yes': True, 'No': False}, inplace=True)


In [66]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,False
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,False
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,True
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,False
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,True


In [67]:
df['SeniorCitizen'] = df['SeniorCitizen'].astype(str)
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype(float)
df.columns = ['customer_id', 'gender', 'senior_citizen', 'partner', 'dependents', 'tenure', 'phone_service', 'multiple_lines', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'contract', 'paperless_billing', 'payment_method', 'monthly_charges', 'total_charges', 'churn']
df_enc = pd.get_dummies(df, columns = ['gender', 'senior_citizen', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'contract', 'paperless_billing', 'payment_method'])

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [68]:
scaler = skl.preprocessing.StandardScaler()
df_scaled = scaler.fit_transform(df_enc[['tenure', 'monthly_charges', 'total_charges']])
df_scaled = pd.DataFrame(df_scaled, columns=['tenure', 'monthly_charges', 'total_charges'])
df_scaled = pd.concat([df_enc.drop(['tenure', 'monthly_charges', 'total_charges'], axis=1), df_scaled], axis=1)
df_scaled.head()

Unnamed: 0,customer_id,churn,gender_Female,gender_Male,senior_citizen_0,senior_citizen_1,partner_No,partner_Yes,dependents_No,dependents_Yes,...,contract_Two year,paperless_billing_No,paperless_billing_Yes,payment_method_Bank transfer (automatic),payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,tenure,monthly_charges,total_charges
0,7590-VHVEG,False,True,False,True,False,False,True,True,False,...,False,False,True,False,False,True,False,-1.280248,-1.161694,-0.994194
1,5575-GNVDE,False,False,True,True,False,True,False,True,False,...,False,True,False,False,False,False,True,0.064303,-0.260878,-0.17374
2,3668-QPYBK,True,False,True,True,False,True,False,True,False,...,False,False,True,False,False,False,True,-1.239504,-0.363923,-0.959649
3,7795-CFOCW,False,False,True,True,False,True,False,True,False,...,False,True,False,True,False,False,False,0.512486,-0.74785,-0.195248
4,9237-HQITU,True,True,False,True,False,True,False,True,False,...,False,False,True,False,False,True,False,-1.239504,0.196178,-0.940457


In [78]:
df_scaled.dropna(inplace=True)

In [99]:
logistic_regression = skl.linear_model.LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
X = df_scaled.drop(columns=['customer_id','churn'], axis=1)
y = df_scaled['churn'].astype(int) # Convert boolean 'churn' to integer
grid_search = skl.model_selection.GridSearchCV(logistic_regression, {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}, cv=None)
grid_search.fit(X, y)
print(grid_search.best_params_)

{'C': 1}


In [101]:
skl.model_selection.cross_val_score(grid_search, X, y, cv=None)

array([0.78220641, 0.79985755, 0.77920228, 0.77849003, 0.78632479])

In [102]:
random_forest = skl.ensemble.RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1)
random_forest.fit(X, y)
skl.model_selection.cross_val_score(random_forest, X, y, cv=None)

array([0.77935943, 0.79273504, 0.77777778, 0.77635328, 0.78561254])

In [104]:
importances = random_forest.feature_importances_
feature_names = X.columns
feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False)
print(feature_imp_df)

                                     Feature  Gini Importance
34                   contract_Month-to-month         0.210968
16                        online_security_No         0.125054
25                           tech_support_No         0.096505
14              internet_service_Fiber optic         0.095088
41           payment_method_Electronic check         0.075573
36                         contract_Two year         0.075519
19                          online_backup_No         0.034607
35                         contract_One year         0.027216
18                       online_security_Yes         0.023368
29          streaming_tv_No internet service         0.021892
13                      internet_service_DSL         0.021689
27                          tech_support_Yes         0.021286
22                      device_protection_No         0.018048
26          tech_support_No internet service         0.014290
17       online_security_No internet service         0.012619
38      

In [105]:
import joblib
filename = 'logistic_reg.joblib'
joblib.dump(grid_search, filename)
filename = 'random_forest.joblib'
joblib.dump(random_forest, filename)
df_scaled.to_csv('telco_churn_enc.csv', index=False)