In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

filepath=r"C:\Users\Owner\Downloads\archive (1)\Telco-Customer-Churn.csv" #FILE LOACTION ON MY COMPUTER

In [114]:
data = pd.read_csv(filepath)
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [115]:
#Check data information  
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [116]:
#Check for missing values
print(data.isnull().sum())

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [117]:
#checking the number of uniques classes our categorical feayures have
categorical_column = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',  'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
for column in categorical_column:
    print(f'{column}:{data[column].nunique()} unique classes')

gender:2 unique classes
Partner:2 unique classes
Dependents:2 unique classes
PhoneService:2 unique classes
MultipleLines:3 unique classes
InternetService:3 unique classes
OnlineSecurity:3 unique classes
OnlineBackup:3 unique classes
DeviceProtection:3 unique classes
TechSupport:3 unique classes
StreamingTV:3 unique classes
StreamingMovies:3 unique classes
Contract:3 unique classes
PaperlessBilling:2 unique classes
PaymentMethod:4 unique classes
Churn:2 unique classes


In [118]:
#we need to check ehy Totalcharges is stored as object. first we check the uniques classes
#going throug the data manually I noticed that it was a mix of int, float and blank space. 
#so I had to convert everything to numeric value, check and fill the missing values. 

print(data['TotalCharges'].unique())

['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']


In [119]:
#This code converts all data entry under totlcharge to a numeric type, which will automatically turn invalid entries into NaN
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
print(data['TotalCharges'].isnull().sum())

11


In [120]:
median_totalcharges = data['TotalCharges'].median()
data['TotalCharges'] = data['TotalCharges'].fillna(median_totalcharges)
print(data['TotalCharges'].isnull().sum())


0


In [121]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [122]:
# Select only numerical features

encoder = LabelEncoder()
data['Churn'] = encoder.fit_transform(data['Churn'])




In [123]:
numerical_data = data.select_dtypes(include=['float64', 'int64'])

# Compute correlation
correlation_matrix = numerical_data.corr()

# Display correlations with the target variable
print(correlation_matrix['Churn'].sort_values(ascending=False))

Churn             1.000000
MonthlyCharges    0.193356
SeniorCitizen     0.150889
TotalCharges     -0.199037
tenure           -0.352229
Name: Churn, dtype: float64


In [124]:
#Encoding Caregorical values 
binary_columns = ['gender', 'Partner', 'Dependents','PhoneService', 'PaperlessBilling']
le = LabelEncoder()
for column in binary_columns:
    data[column] = le.fit_transform(data[column])

In [125]:
#onehot encoding non binary classes 
data = pd.get_dummies(data, columns=['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'])


In [126]:
print(data.head())

   customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0  7590-VHVEG       0              0        1           0       1   
1  5575-GNVDE       1              0        0           0      34   
2  3668-QPYBK       1              0        0           0       2   
3  7795-CFOCW       1              0        0           0      45   
4  9237-HQITU       0              0        0           0       2   

   PhoneService  PaperlessBilling  MonthlyCharges  TotalCharges  ...  \
0             0                 1           29.85         29.85  ...   
1             1                 0           56.95       1889.50  ...   
2             1                 1           53.85        108.15  ...   
3             0                 0           42.30       1840.75  ...   
4             1                 1           70.70        151.65  ...   

   StreamingMovies_No  StreamingMovies_No internet service  \
0                True                                False   
1                True       

In [127]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 42 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               7043 non-null   object 
 1   gender                                   7043 non-null   int64  
 2   SeniorCitizen                            7043 non-null   int64  
 3   Partner                                  7043 non-null   int64  
 4   Dependents                               7043 non-null   int64  
 5   tenure                                   7043 non-null   int64  
 6   PhoneService                             7043 non-null   int64  
 7   PaperlessBilling                         7043 non-null   int64  
 8   MonthlyCharges                           7043 non-null   float64
 9   TotalCharges                             7043 non-null   float64
 10  Churn                                    7043 no

In [128]:
data = data.drop('customerID', axis=1)

In [129]:
X = data.drop('Churn', axis=1)
y = data['Churn']

In [133]:
#we are going to build a LOgistic  Regression model and a Random forest model with all the features.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [134]:
lr = RandomForestClassifier()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [135]:
accuracy = accuracy_score(y_test, y_pred)

In [136]:
print (f'The accuracy is {accuracy*100.:2f}%')

The accuracy is 79.555135%


In [137]:
#I will now use RFE to slect features and see what the accuracy will be 
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Initialize RFE with Random Forest
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=15)
rfe.fit(X, y)

# Selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features)

# Create a reduced dataset
X_reduced = X[selected_features]


Selected Features: Index(['gender', 'SeniorCitizen', 'Partner', 'tenure', 'PaperlessBilling',
       'MonthlyCharges', 'TotalCharges', 'MultipleLines_No',
       'InternetService_Fiber optic', 'OnlineSecurity_No', 'OnlineBackup_No',
       'DeviceProtection_No', 'TechSupport_No', 'Contract_Month-to-month',
       'PaymentMethod_Electronic check'],
      dtype='object')


In [140]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42)


In [155]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 


In [156]:
rad = RandomForestClassifier()
rad.fit(X_train, y_train)
y_pred = rad.predict(X_test)


In [157]:
rad_accuracy = accuracy_score(y_test, y_pred)

In [158]:
print(f'The RFE accuracy is: {rad_accuracy*100.:2f}%')

The RFE accuracy is: 79.460483%


Conclusion:Random Forest effectively models data with many features, achieving an accuracy of 79.55% when all features were used. After applying Recursive Feature Elimination (RFE) to select 15 key features, the accuracy was 74.46%, indicating that feature selection can influence model performance positively or negatively. This underscores the importance of carefully evaluating feature selection techniques. While RFE was the sole method explored here, other techniques may further refine the model. Future improvements could include hyperparameter tuning and cross-validation to enhance accuracy.