## Importing Necessary Libraries

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

## Loading the data

In [None]:
# List the entries of the working directory to locate the data set.
# Plain Python replaces the bare `ls`, which only works while IPython's
# automagic is enabled and no variable named `ls` exists in the namespace.
sorted(entry.name for entry in Path.cwd().iterdir())

In [None]:
# Read the raw CSV into a DataFrame.
# NOTE(review): "data_set_name" looks like a placeholder path — replace it with the real file.
original_df = pd.read_csv("data_set_name")

In [None]:
# Work on a copy so that original_df keeps the data exactly as loaded
df = original_df.copy()

# EDA

### Preprocessing

In [None]:
# Preview the first five rows
df.head(5)

In [None]:
# Overall size of the data set
n_rows, n_cols = df.shape
print("Number of Rows in data: ", n_rows)
print("Number of columns in data: ", n_cols)

In [None]:
# Basic info about the data set: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Null count per column, keeping only the columns that actually have missing values
df.isnull().sum().loc[lambda null_counts: null_counts > 0]

In [None]:
# Summary statistics, transposed so that each described column becomes one row
df.describe().transpose()

In [None]:
# Bar chart of how often each class of the dependent variable occurs
class_counts = df["Variable"].value_counts()
ax = class_counts.plot.bar()
ax.set_title("Count Of each dependent variable")
plt.show()

In [None]:
# Share of rows falling in each class (counts divided by the total number of rows)
df["Column"].value_counts().div(len(df))

In [None]:
# Subset holding only the churned customers (rows whose flag equals "Yes")
# NOTE(review): "column" looks like a placeholder for the target column — confirm its name and case
is_churned_row = df["column"].eq("Yes")
churned_customer_df = df.loc[is_churned_row]

**Separating numerical and categorical variables**

In [None]:
# Names of the categorical columns (object or category dtype)
cat_variables = list(
    df.select_dtypes(include=["object", "category"]).columns)
cat_variables

In [None]:
# Every column that is not categorical is treated as numerical (original column order kept)
numerical_variables = df.columns[~df.columns.isin(cat_variables)].tolist()
numerical_variables

## Univariate Analysis

In [None]:
# Finding variables whose number of unique values is <= the given threshold
def find_unique(df: pd.DataFrame, count: int = 1, exclude=("Churn",)) -> list:
    """Return the columns of ``df`` that have at most ``count`` distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    count : int, default 1
        Inclusive upper bound on the number of distinct values a column may
        have to be selected. Missing values are not counted as a value.
    exclude : str or iterable of str, default ("Churn",)
        Column name(s) that are never returned, typically the dependent
        variable. The default keeps the previous hard-coded behaviour.

    Returns
    -------
    list
        Selected column names, in the order they appear in ``df``.
    """
    # A bare string is a single column name, not an iterable of characters.
    excluded = {exclude} if isinstance(exclude, str) else set(exclude)
    return [
        col
        for col, n_unique in df.nunique().items()
        if n_unique <= count and col not in excluded
    ]

In [None]:
# Columns of the full data set with at most 7 distinct values (treated as categorical below)
unique_var_list = find_unique(df, 7)

In [None]:
# How many low-cardinality columns were found in the full data set
len(unique_var_list)

In [None]:
# Same selection (at most 7 distinct values) applied to the churned-customer subset
unique_var_list_churned = find_unique(churned_customer_df, 7)

In [None]:
# How many low-cardinality columns were found in the churned subset
len(unique_var_list_churned)

In [None]:
# Print every column of unique_var_list that is absent from unique_var_list_churned
# (no output means the churned list covers all of them)
churned_names = set(unique_var_list_churned)
for name in unique_var_list:
    if name in churned_names:
        continue
    print(name)

Every column in `unique_var_list` is also present in `unique_var_list_churned`, and both lists have the same length, so no variables are missing from the churned subset.

In [None]:
# Columns left out of unique_var_list, i.e. those with more than 7 distinct values
# (the threshold used to build the list): the continuous / high-cardinality variables

for col_name in df.columns:
    if col_name in unique_var_list or col_name == "Churn":
        continue
    print(
        f"variable Name {col_name} ....... unique value count: {df[col_name].nunique()}")

### Proportion of each variable in data

In [None]:
# Categorical variables: one pie chart per column showing the share of each class
total_rows = len(df)
for col in unique_var_list:
    class_share = df[col].value_counts() / total_rows
    plt.title("Percentage of class for " + col + " in data set")
    class_share.plot(kind='pie', autopct='%.2f')
    plt.show()

In [None]:
# Continuous variables: summary statistics and distribution of MonthlyCharges
print(df["MonthlyCharges"].describe())
# Pass the series explicitly as `x`: a positional argument is read as `x` by
# seaborn 0.11 (with a FutureWarning) but as `data` by seaborn >= 0.12, which
# changes the orientation of the violin between versions.
ax = sns.violinplot(x=df["MonthlyCharges"], inner="box",
                    palette="Set3", cut=2, linewidth=3)
ax.set_title("Monthly Charges distribution in complete dataset")
plt.show()

### Univariate analysis based on dependent variable

In [None]:
# Display the low-cardinality columns that the plots below iterate over
unique_var_list

In [None]:
# One count plot per low-cardinality column, with bars split by the dependent variable
for col in unique_var_list:
    ax = sns.countplot(data=df, x=col, hue="dependent variable name")
    ax.set_title(f"{col} vs dep variable name")
    plt.xticks(rotation=45)
    plt.show()

### Univariate Analysis of dependent variable

In [None]:
# For every low-cardinality column, plot which share of the churned customers
# falls into each of its classes.
target_col = "dependent variable"
n_churned = len(churned_customer_df)
for col in unique_var_list:
    # Skip the dependent variable itself. The guard uses the same name as the
    # groupby below (it used to be the misspelt "depenent_variable", which
    # could never match).
    if col == target_col:
        continue
    churn_share = churned_customer_df.groupby(col)[target_col].count() / n_churned
    # Plot percentages (0-100) so that the '%' suffix on the bar labels is
    # correct; the fractions themselves are still what gets printed.
    ax = (churn_share * 100).plot(kind="bar")
    ax.set_title(col + " Vs dependent variable ")
    ax.set_ylabel("% of churned customers")
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f%%', label_type='edge')
    print(churn_share.reset_index())
    plt.show()

## Bivariate and Multivariate Analysis

In [None]:
# Encode the target as an integer flag: 1 where Churn is "Yes", 0 otherwise.
# The result is stored in a new column; the original Churn column is left as is.
is_churned = df["Churn"] == "Yes"
df["dependent variable"] = np.where(is_churned, 1, 0)

In [None]:
# One-hot encode the categorical columns, keeping one indicator column per category level
df_dummies_no_dropped = pd.get_dummies(data=df)

In [None]:
# (rows, columns) of the full one-hot encoded frame
df_dummies_no_dropped.shape

In [None]:
# One-hot encode again, dropping the first level of every encoded column (drop_first=True)
df_dummies_dropped = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Save the reduced one-hot encoded frame to the working directory
# (the row index is written too, since `index` is left at its default)
df_dummies_dropped.to_csv("df_dummies_dropped.csv")

In [None]:
# (rows, columns) of the reduced one-hot encoded frame
df_dummies_dropped.shape

In [None]:
# Number of low-cardinality columns that will be paired up in the heatmaps below
len(unique_var_list)

#### Categorical vs Categorical

In [None]:
def heatmap_for_two_category(unique_var_list, data, title=" Title"):
    """Draw a cross-tabulation heatmap for every unordered pair of variables.

    For each variable a header line is printed, followed by one annotated
    heatmap per later variable in ``unique_var_list``. Each heatmap shows the
    ``pd.crosstab`` counts of the two columns in ``data``.

    Parameters
    ----------
    unique_var_list : list
        Column names of ``data`` to pair up.
    data : pandas.DataFrame
        Frame providing the columns.
    title : str, default " Title"
        Prefix placed in front of "<var_a> vs <var_b>" in every plot title.
    """
    for position, row_var in enumerate(unique_var_list):
        print(
            f">>>>>>>>>>>>>>> {row_var} vs other categorical Variables")
        # Only pair with the variables that come later, so each pair is drawn once.
        for col_var in unique_var_list[position + 1:]:
            plt.title(title + f" {row_var} vs {col_var}")
            contingency = pd.crosstab(data[row_var], data[col_var])
            sns.heatmap(contingency, annot=True, fmt='.0f')
            plt.show()

In [None]:
# Pairwise cross-tab heatmaps of the low-cardinality columns, restricted to churned customers
heatmap_for_two_category(unique_var_list=unique_var_list,
                         data=churned_customer_df, title="Churned data for")

#### Continuous variables

In [None]:
# Display the columns classified as numerical earlier in the notebook
numerical_variables

In [None]:
# Monthly vs total charges, one point per row, coloured by Churn
ax = sns.scatterplot(data=df, x="MonthlyCharges", y="TotalCharges", hue="Churn")
ax.set_title("Monthly Charges vs Total Charges")
plt.show()

### Correlation plots

In [None]:
# Correlation of every numeric column with the encoded target.
# numeric_only=True is required because df still holds string columns, which
# make DataFrame.corr raise on pandas >= 2.0. "Churn" itself holds "Yes"/"No"
# strings, so it can never appear in the correlation matrix; its 0/1 encoding
# lives in the 'dependent variable' column.
target_corr = (
    df.corr(numeric_only=True)["dependent variable"]
    .sort_values(ascending=False)
)
ax = target_corr.plot(kind='bar')
ax.set_title("Correlation of numerical variables with the dependent variable")
plt.show()

This plot only gives information about the numerical variables, so we can one-hot encode the data set and test the correlation again.

In [None]:
# Correlation of every one-hot encoded column with the encoded target.
# get_dummies expanded the string column "Churn" into indicator columns, so the
# key "Churn" no longer exists in this frame; the 0/1 target is the numeric
# 'dependent variable' column, which get_dummies leaves untouched.
fig, ax = plt.subplots(figsize=(20, 8))
(
    df_dummies_no_dropped.corr()["dependent variable"]
    .sort_values(ascending=False)
    .plot(kind="bar", ax=ax)
)
ax.set_title("Correlation With Churn")
plt.show()

In [None]:
# Heatmap of the full correlation matrix of the one-hot encoded frame
fig, ax = plt.subplots(figsize=(30, 12))
ax.set_title("Correlation Map")
corr_matrix = df_dummies_no_dropped.corr()
sns.heatmap(corr_matrix, ax=ax)

In [None]:
# Save the full one-hot encoded frame (no indicator columns dropped) to the working directory;
# the row index is written as well, since `index` is left at its default
df_dummies_no_dropped.to_csv("df_dummies_no_dropped.csv")