In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
!pip install seaborn

In [None]:
# Load the customer-churn CSV from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Telco-Customer-Churn.csv')
df.head(n=5)

In [None]:
# Dimensions of the raw dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column dtype and non-null count; verbose=True forces the full column listing.
df.info(verbose = True)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Observations:
- average tenure is 32 months
- 75% of customers have a tenure of less than 55 months
- average monthly charges are USD 64
- 25% of customers pay more than USD 89 per month

In [None]:
# List every column name available in the raw dataset.
df.columns

In [None]:
# Horizontal bar chart of how many customers fall into each Churn class.
df.Churn.value_counts().plot.barh()
plt.xlabel(xlabel="Count")
plt.ylabel(ylabel="Target Variable")

In [None]:
# Share of each Churn class, expressed as a percentage of all rows.
df.Churn.value_counts().mul(100).div(len(df.Churn))

### Observations:
- the ratio of customers who don't churn to those who churn is 73:26
- the data is highly imbalanced
- we need to analyse further with other features for better insights

# Data cleaning

In [None]:
# Work on an independent deep copy so the cleaning steps below leave `df` untouched.
df1 = df.copy(deep=True)

In [None]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN
# (errors='coerce'). Then report the number of missing values per column.
df1['TotalCharges'] = pd.to_numeric(df1['TotalCharges'], errors='coerce')
df1.isna().sum()

In [None]:
# Number of missing (NaN) TotalCharges values after the numeric conversion.
df1.TotalCharges.isna().sum()

In [None]:
# Percentage of rows with a missing TotalCharges, computed from the data rather
# than from hand-copied counts so it stays correct if the input file changes.
# The mean of a boolean mask is the fraction of True values.
100 * df1['TotalCharges'].isnull().mean()

Since missing values make up only about 0.16% of the rows, we can safely drop them.

In [None]:
# Discard, in place, every row that still contains at least one missing value.
df1.dropna(axis=0, how='any', inplace=True)

Segmenting customers into groups based on their tenure

In [None]:
# Longest tenure present in the cleaned data.
max_tenure = df1.tenure.max()
max_tenure

In [None]:
# Derive the 12-month bucket edges from the data instead of hard-coding the
# 72/80 limits, so every customer still lands in a bucket if the maximum
# tenure changes. The last edge is always strictly greater than the maximum.
bin_width = 12
tenure_edges = list(range(1, int(df1['tenure'].max()) + bin_width + 1, bin_width))

# Buckets are half-open [lo, lo + 12) because right=False; the labels show the
# inclusive range instead, e.g. "1 - 12". Building them from the same edges
# guarantees the label count always matches the bucket count.
labels = ["{0} - {1}".format(lo, lo + bin_width - 1) for lo in tenure_edges[:-1]]

df1['tenure_bins'] = pd.cut(df1.tenure, tenure_edges, right=False, labels=labels)

In [None]:
# Number of customers in each tenure bucket.
df1.tenure_bins.value_counts()

In [None]:
# Re-runnable, non-mutating form: errors='ignore' stops a second execution of
# this cell from raising KeyError once the columns are already gone. The
# redundant axis=1 is dropped because `columns=` already selects the axis.
# NOTE(review): this trims the raw `df` only; `df1` keeps both columns, and the
# univariate loop below relies on them still being present there.
df = df.drop(columns=['customerID', 'tenure'], errors='ignore')

# Data Exploration

## Univariate Analysis

In [None]:
# One count plot per remaining column, split by Churn. The identifier, the raw
# tenure, the target itself and the two charge columns are left out.
excluded = ['customerID', 'tenure', 'Churn', 'TotalCharges', 'MonthlyCharges']
for i, predictor in enumerate(df1.drop(columns=excluded).columns):
    plt.figure(i)  # separate figure per predictor
    sns.countplot(x=predictor, hue='Churn', data=df1)
    

In [None]:
# Encode churn as a binary target (1 = 'Yes', 0 = anything else).
# Accepting an existing 1 as well as 'Yes' makes the cell safe to re-run:
# comparing only against 'Yes' would reset every label to 0 the second time.
df1['Churn'] = np.where(df1['Churn'].isin(['Yes', 1]), 1, 0)

In [None]:
# Preview the frame after Churn has been encoded as 0/1.
df1.head()

In [None]:
# Convert all categorical variables to dummy variables.
# customerID is removed first: it is still present in df1 (the earlier drop was
# applied to `df`), and get_dummies would otherwise expand this presumably
# per-customer identifier into one indicator column per distinct value.
# errors='ignore' keeps the cell re-runnable after the column is gone.
# NOTE(review): `tenure` is kept alongside `tenure_bins`; drop it here too if
# the binned version is meant to replace it.
df1 = pd.get_dummies(df1.drop(columns=['customerID'], errors='ignore'))
df1.head()

In [None]:
# Scatter plot with a fitted regression line: total charges against monthly charges.
sns.lmplot(x='MonthlyCharges', y='TotalCharges', data=df1)

### Observation:
- total charges increase with increase in monthly charges

### Relation between churn, monthly charges and total charges

In [None]:
# Overlay the monthly-charge densities of non-churned (Churn == 0) and churned
# (Churn == 1) customers on one Axes. Despite its name, `fig1` holds the Axes
# returned by kdeplot, which is why it can be passed back in as `ax`.
fig1 = sns.kdeplot(df1.loc[df1["Churn"] == 0, "MonthlyCharges"], fill=True)
fig1 = sns.kdeplot(df1.loc[df1["Churn"] == 1, "MonthlyCharges"], fill=True, ax=fig1)
fig1.legend(["No Churn", "Churn"], loc='upper right')
fig1.set_xlabel('Monthly Charges')
fig1.set_ylabel('Density')
fig1.set_title('Monthly charges by churn')


In [None]:
# Overlay the total-charge densities of non-churned (Churn == 0) and churned
# (Churn == 1) customers on one Axes. Despite its name, `fig2` holds the Axes
# returned by kdeplot, which is why it can be passed back in as `ax`.
fig2 = sns.kdeplot(df1.loc[df1["Churn"] == 0, "TotalCharges"], fill=True)
fig2 = sns.kdeplot(df1.loc[df1["Churn"] == 1, "TotalCharges"], fill=True, ax=fig2)
fig2.legend(["No Churn", "Churn"], loc='upper right')
fig2.set_xlabel('Total Charges')
fig2.set_ylabel('Density')
fig2.set_title('Total charges by churn')


### Observations:
- churn increases as monthly charges increase
- churn decreases as total charges increase
- low tenure, high monthly charges and low total charges lead to high churn

In [None]:
# figsize = (20,8)
# df1.corr()['Churn'].sort_values(ascending = False).plot(kind='bar')

## Bivariate analysis

In [None]:
def uniplot(df, col, title, hue=None):
    """Draw a log-scaled count plot of ``col`` with bars ordered by frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    col : str
        Column whose values are counted along the x-axis.
    title : str
        Title shown above the plot.
    hue : str, optional
        Column used to split each bar into coloured groups. ``None``
        (the default) draws a single bar per category.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots()
    # Log scale keeps rare categories visible next to dominant ones.
    ax.set_yscale('log')
    ax.set_title(title)
    # `hue` used to be accepted but never forwarded, so it had no effect.
    sns.countplot(
        data=df,
        x=col,
        order=df[col].value_counts().index,  # most frequent category first
        hue=hue,
        ax=ax,
    )
    plt.show()