In [56]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib as plt

In [57]:
df = pd.read_csv("CustomerChurn_dataset.csv")

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [59]:
limit = 30
percentage = (df.isnull().sum()/len(df))*100
drop = percentage[percentage > limit].index
df.drop(columns = drop, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [60]:
numbers = df.select_dtypes(include=["number"])
objects = df.select_dtypes(exclude=["number"])

In [61]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
label = LabelEncoder()

for i in objects.columns:
  objects[i] = label.fit_transform(objects[i])

In [62]:
objects

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn
0,5375,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,2505,0
1,3962,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,1466,0
2,2564,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,157,1
3,5535,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,1400,0
4,6511,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,925,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,4853,1,1,1,1,2,0,2,0,2,2,2,2,1,1,3,1597,0
7039,1525,0,1,1,1,2,1,0,2,2,0,2,2,1,1,1,5698,0
7040,3367,0,1,1,0,1,0,2,0,0,0,0,0,0,1,2,2994,0
7041,5934,1,1,0,1,2,1,0,0,0,0,0,0,0,1,3,2660,1


In [63]:
df_combined = pd.concat([numbers, objects], axis=1)
df_combined.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn
0,0,1,29.85,5375,0,1,0,0,1,0,...,2,0,0,0,0,0,1,2,2505,0
1,0,34,56.95,3962,1,0,0,1,0,0,...,0,2,0,0,0,1,0,3,1466,0
2,0,2,53.85,2564,1,0,0,1,0,0,...,2,0,0,0,0,0,1,3,157,1
3,0,45,42.3,5535,1,0,0,0,1,0,...,0,2,2,0,0,1,0,0,1400,0
4,0,2,70.7,6511,0,0,0,1,0,1,...,0,0,0,0,0,0,1,2,925,1


In [64]:
corr_matrix = df_combined.corr()
corr_matrix["Churn"].sort_values()

Contract           -0.396713
tenure             -0.352229
OnlineSecurity     -0.289309
TechSupport        -0.282492
OnlineBackup       -0.195525
DeviceProtection   -0.178134
Dependents         -0.164221
Partner            -0.150448
InternetService    -0.047291
StreamingMovies    -0.038492
StreamingTV        -0.036581
customerID         -0.017447
gender             -0.008612
PhoneService        0.011942
TotalCharges        0.014479
MultipleLines       0.038037
PaymentMethod       0.107062
SeniorCitizen       0.150889
PaperlessBilling    0.191825
MonthlyCharges      0.193356
Churn               1.000000
Name: Churn, dtype: float64

In [65]:
#Split into features and target
#features = X_train.iloc[:,:-1]
y = df_combined["Churn"]
X = df_combined.drop(["Churn"], axis=1)
X

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges
0,0,1,29.85,5375,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,2505
1,0,34,56.95,3962,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,1466
2,0,2,53.85,2564,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,157
3,0,45,42.30,5535,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,1400
4,0,2,70.70,6511,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,4853,1,1,1,1,2,0,2,0,2,2,2,2,1,1,3,1597
7039,0,72,103.20,1525,0,1,1,1,2,1,0,2,2,0,2,2,1,1,1,5698
7040,0,11,29.60,3367,0,1,1,0,1,0,2,0,0,0,0,0,0,1,2,2994
7041,1,4,74.40,5934,1,1,0,1,2,1,0,0,0,0,0,0,0,1,3,2660


In [66]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the numeric columns
scaled = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=X.columns)

In [67]:
rfr = RandomForestClassifier()
rfr.fit(X, y)

features = X.columns
feature_importance = rfr.feature_importances_
importances = pd.DataFrame({'Feature': features, 'Importance': feature_importance})

importances = importances.sort_values(by='Importance', ascending=False)
importances

Unnamed: 0,Feature,Importance
1,tenure,0.154642
2,MonthlyCharges,0.141489
19,TotalCharges,0.13515
3,customerID,0.13066
16,Contract,0.074585
10,OnlineSecurity,0.04894
18,PaymentMethod,0.045536
13,TechSupport,0.043808
11,OnlineBackup,0.025322
9,InternetService,0.024757


# EDA

In [70]:
ax = plt.subplots(figsize=(30, 24))
ax = sns.barplot(x=feature_scores, y=feature_scores.index, data=importances)
ax.set_title("Visualize feature scores of the features")
ax.set_yticklabels(feature_scores.index)
ax.set_xlabel("Feature importance score")
ax.set_ylabel("Features")
plt.show()

AttributeError: module 'matplotlib' has no attribute 'subplots'

In [68]:
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)