Importing the dependencies

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
!pip install scikit-learn


In [None]:
!pip install seaborn

In [None]:
!python -m pip install --upgrade pip


Data Loading and Understanding

In [None]:
# Load the CSV data into a pandas DataFrame.
# The path defaults to the original author's location; set the TELCO_CHURN_CSV
# environment variable to run the notebook on another machine without editing it.
DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    "C:/Users/DELL/Desktop/ML Project1/WA_Fn-UseC_-Telco-Customer-Churn.csv",
)
df = pd.read_csv(DATA_PATH)


In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the loaded data.
df.head()


In [None]:
# Lift the column display limit so wide frames are shown without truncation.
pd.options.display.max_columns = None

In [None]:
# Preview the first rows again, now that the column display limit is removed.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Drop the customerID column, as it is not required for modeling.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns=["customerID"], errors="ignore")


In [None]:
# List the column names that remain after dropping customerID.
print(df.columns)

In [None]:
# Distinct values present in the gender column.
print(df["gender"].unique())

In [None]:
# Distinct values present in the SeniorCitizen column.
print(df["SeniorCitizen"].unique())

In [None]:
# Print the distinct values of every column except the numerical features,
# each followed by a separator line.
numerical_features_list = ["tenure", "MonthlyCharges", "TotalCharges"]

for col in df.columns:
    if col in numerical_features_list:
        continue  # numerical features are skipped here
    print(col, df[col].unique())
    print("-" * 50)

In [None]:
# Count the NaN entries in each column.
print(df.isna().sum())

In [None]:
# Rows whose TotalCharges is a single blank space instead of a number.
df.loc[df["TotalCharges"].eq(" ")]

In [None]:
# Number of rows whose TotalCharges is a single blank space.
int(df["TotalCharges"].eq(" ").sum())

In [None]:
# Swap the blank-space entries for the string "0.0" so the column can be cast to float.
df["TotalCharges"] = df["TotalCharges"].replace(" ", "0.0")

In [None]:
# Convert TotalCharges from strings to floating-point numbers.
df["TotalCharges"] = df["TotalCharges"].astype("float64")

In [None]:
# Re-inspect the dtypes to confirm that TotalCharges is now float.
df.info()

In [None]:
# Check the class distribution of the target column (Churn) to spot any imbalance.
print(df["Churn"].value_counts())

**Insights:**
1. Customer ID removed as it is not required for modeling
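2. `isnull()` reports no missing (NaN) values in the dataset
3. The TotalCharges column nevertheless contained blank (`" "`) entries, which were replaced with 0 before converting the column to float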
4. Class imbalance identified in the target


**3. Exploratory Data Analysis (EDA)**

In [None]:
# Dimensions (rows, columns) of the frame after the cleaning steps above.
df.shape

In [None]:
# Column names available for the analysis below.
df.columns

In [None]:
# Peek at the first two rows.
df.head(2)

In [None]:
# Summary statistics for the numeric columns.
df.describe()

**Numerical Features - Analysis**

Understand the distribution of numerical features

In [None]:
def plot_histogram(df, column_name):
    """Draw a histogram with a KDE overlay for one column of ``df``.

    A dashed red line marks the column's mean and a solid green line marks its
    median. The figure is shown immediately and nothing is returned.

    Args:
        df: DataFrame holding the data.
        column_name: Name of the numeric column to plot.
    """
    plt.figure(figsize=(5, 3))
    values = df[column_name]
    sns.histplot(values, kde=True)
    plt.title(f"Distribution of {column_name}")

    # (position, colour, line style, legend label) for each reference line.
    reference_lines = (
        (values.mean(), "red", "--", "Mean"),
        (values.median(), "green", "-", "Median"),
    )
    for position, colour, style, label in reference_lines:
        plt.axvline(position, color=colour, linestyle=style, label=label)

    plt.legend()
    plt.show()

In [None]:
# Histogram of the tenure column with mean and median markers.
plot_histogram(df, "tenure")

In [None]:
# Histogram of the MonthlyCharges column with mean and median markers.
plot_histogram(df, "MonthlyCharges")

In [None]:
# Histogram of the TotalCharges column with mean and median markers.
plot_histogram(df, "TotalCharges")

**Box Plot for numerical features**

In [None]:
def plot_boxplot(df, column_name):
    """Draw a horizontal box plot for one column of ``df``.

    The figure is shown immediately and nothing is returned.

    Args:
        df: DataFrame holding the data.
        column_name: Name of the numeric column to plot.
    """
    plt.figure(figsize=(5, 3))
    # The values are passed as ``x``, so the box is drawn horizontally and the
    # column's values run along the x-axis.
    sns.boxplot(x=df[column_name])
    plt.title(f"Distribution of {column_name}")
    # Label the axis that carries the values; labelling the y-axis (as before)
    # put the column name on an axis that holds no data in this orientation.
    plt.xlabel(column_name)
    plt.show()

In [None]:
# Box plot of the tenure column.
plot_boxplot(df, "tenure")

In [None]:
# Box plot of the MonthlyCharges column.
plot_boxplot(df, "MonthlyCharges")

In [None]:
# Box plot of the TotalCharges column.
plot_boxplot(df, "TotalCharges")