<a href="https://colab.research.google.com/github/IdajiliJohnOjochegbe/Customer-Churn-Prediction/blob/main/Customer_churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
# Location of the Telco churn CSV. The /content/drive prefix presumably means
# Google Drive must be mounted in Colab before this cell runs — TODO confirm.
DATA_PATH = '/content/drive/MyDrive/Telco customer churn dataset.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows (head's default) as a quick sanity check of the load.
print(df.head(n=5))


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

# Explore the Dataset

In [4]:
# Basic information about the dataset.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Statistical summary of the dataset
print(df.describe())

# Count of missing values per column
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# Data Preprocessing

In [5]:
# Report how many distinct values each column holds.
for col_name in df.columns:
    try:
        n_distinct = df[col_name].nunique()
    except Exception as err:
        # Best-effort: report the failure for this column and keep going.
        print(f"Column: {col_name}, Error: {err}")
    else:
        print(f"Column: {col_name}, Unique Values: {n_distinct}")


Column: customerID, Unique Values: 7043
Column: gender, Unique Values: 2
Column: SeniorCitizen, Unique Values: 2
Column: Partner, Unique Values: 2
Column: Dependents, Unique Values: 2
Column: tenure, Unique Values: 73
Column: PhoneService, Unique Values: 2
Column: MultipleLines, Unique Values: 3
Column: InternetService, Unique Values: 3
Column: OnlineSecurity, Unique Values: 3
Column: OnlineBackup, Unique Values: 3
Column: DeviceProtection, Unique Values: 3
Column: TechSupport, Unique Values: 3
Column: StreamingTV, Unique Values: 3
Column: StreamingMovies, Unique Values: 3
Column: Contract, Unique Values: 3
Column: PaperlessBilling, Unique Values: 2
Column: PaymentMethod, Unique Values: 4
Column: MonthlyCharges, Unique Values: 1585
Column: TotalCharges, Unique Values: 6531
Column: Churn, Unique Values: 2


In [6]:
# Drop the identifier column. Rebinding with errors='ignore' (instead of an
# in-place drop) keeps this cell re-runnable: a second run no longer raises
# KeyError because 'customerID' has already been removed.
df = df.drop(columns=['customerID'], errors='ignore')

# Convert 'TotalCharges' to numeric; entries that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check for missing values (printed before any filling so the gaps are visible)
print(df.isnull().sum())

# Fill missing values for 'TotalCharges' with the median. The result is
# assigned back instead of calling fillna(inplace=True) on the selected
# column: the in-place form acts on an intermediate object, is deprecated,
# and has no effect on df when pandas copy-on-write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Separate numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols = df.select_dtypes(exclude=['float64', 'int64']).columns

# Fill missing values for numeric columns (if any) with each column's median
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Fill missing values for non-numeric columns (if any) with each column's
# most frequent value (first row of mode())
df[non_numeric_cols] = df[non_numeric_cols].fillna(df[non_numeric_cols].mode().iloc[0])

# Encode categorical variables; drop_first=True drops the first level of each
# categorical so the dummy columns are not perfectly collinear
df = pd.get_dummies(df, drop_first=True)

print(df.head())
print(df.shape)

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64
   SeniorCitizen  tenure  MonthlyCharges  TotalCharges  gender_Male  \
0              0       1           29.85         29.85        False   
1              0      34           56.95       1889.50         True   
2              0       2           53.85        108.15         True   
3              0      45           42.30       1840.75         True   
4              0       2           70.70        151.65        False   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0         True           False             False

# Model Building

In [10]:
from sklearn.model_selection import train_test_split
# Target is the encoded churn flag; every other column is a feature.
y = df['Churn_Yes']
x = df.drop(columns='Churn_Yes')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Training a logistic regression model

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit a logistic regression classifier on the training split
# (max_iter=1000 sets the solver's iteration cap).
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Store the result under `f1` so the imported f1_score function is not
# overwritten by a float, which would break any later f1_score(...) call.
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"f1_score: {f1}")

Accuracy: 0.8204400283889283
Precision: 0.6807228915662651
Recall: 0.6058981233243967
f1_score: 0.6411347517730496
