In [None]:
# Importing and loading relevant libraries and packages
import warnings
from itertools import *

import plotly.express as px
from plotly.subplots import *
import plotly.graph_objects as go
import seaborn as sns

import numpy as np
import pandas as pd

from sklearn import *
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

import xgboost as xgb
from xgboost import XGBRegressor

import matplotlib.pyplot as plt
%matplotlib inline


# Install a blanket "ignore" filter so no warning is shown for the rest of
# the session, then report that the setup cell has finished.
warnings.simplefilter("ignore")

print("Loading complete.", "Warnings hidden.")

In [None]:
# Lift pandas' cap on the number of columns rendered when a frame is displayed
# (None means "no limit").
pd.options.display.max_columns = None

In [None]:
# Read the churn CSV (path is relative to the notebook's working directory)
# into `dataset` and display it.
csv_path = "Telco-Customer-Churn.csv"
dataset = pd.read_csv(csv_path)
dataset

In [None]:
# Print a structural summary of the freshly loaded frame (columns, non-null
# counts, dtypes) before any cleaning is applied.
dataset.info()

In [None]:
# Show the rows whose tenure is exactly 0.
# NOTE(review): presumably these are the rows with blank TotalCharges that the
# next cell removes -- confirm against the displayed output.
dataset.loc[dataset["tenure"].eq(0)]

In [None]:
# Convert both charge columns to float.
# pd.to_numeric(errors="coerce") maps every entry that cannot be parsed as a
# number to NaN -- the blank " " placeholders as well as empty or
# multi-space strings -- instead of depending on an exact single-space match.
# The trailing astype(float) guarantees a float column even if every value
# happens to be integer-like.
dataset["MonthlyCharges"] = dataset["MonthlyCharges"].astype(float)
dataset["TotalCharges"] = pd.to_numeric(dataset["TotalCharges"], errors="coerce").astype(float)

# Remove every row that has a missing value in any column (rebinding rather
# than mutating in place), then re-check the dtypes and non-null counts.
dataset = dataset.dropna()
dataset.info()

In [None]:
# Remove the customerID column from the working frame.
# errors="ignore" keeps this cell re-runnable: once the column is gone a
# second execution is a no-op instead of raising KeyError.
dataset = dataset.drop(columns=["customerID"], errors="ignore")

In [None]:
# Count the distinct values in each remaining column.
dataset.nunique()

In [None]:
#for column in dataset.columns:
#    if dataset[column].dtype == "O":
#        data = pd.DataFrame(dataset[column].value_counts()).reset_index().rename(columns = {"index":"Category"})
#        data
#        fig = px.bar(data, x = data.columns[0], y = data.columns[1], text_auto = True,
#                     title = f"{column} Distribution")
#        fig.show()

In [None]:
# Collect the object-dtype (categorical) columns, then draw two histograms
# for each: the raw value counts, and the percentage split by Churn.
categoricals = [name for name, dtype in dataset.dtypes.items() if dtype == "O"]

for column in categoricals:
    # First figure: how often each category occurs.
    fig = px.histogram(
        dataset,
        x=dataset[column],
        title=f"Distribution of values in the {column} column",
    )
    fig.show()

    # Second figure: bars normalised to 100 % and coloured by Churn.
    fig = px.histogram(
        dataset,
        x=dataset[column],
        color="Churn",
        barnorm="percent",
        text_auto=".2f",
        title=f"Churn proportions of users in {column}",
    )
    fig.show()

In [None]:
# Visualizing the distribution of the variables and their churn levels
#for column in dataset.columns:
#    fig = px.histogram(dataset, x= dataset[column], title = f"Distribution of values in the {column} column")
#    fig.show()

In [None]:
# Histogram of every column, split into one facet per Churn value so the two
# groups can be compared side by side.
for column, values in dataset.items():
    fig = px.histogram(
        dataset,
        x=values,
        facet_col="Churn",
        title=f"Distribution of values in the {column} column",
    )
    fig.show()

In [None]:
# Histogram of every column with bars normalised to 100 % and coloured by
# Churn, so each bar shows the churn share for that value.
churn_share_options = {"color": "Churn", "barnorm": "percent", "text_auto": ".2f"}

for column in dataset.columns:
    chart_title = f"Distribution of values in the {column} column"
    fig = px.histogram(dataset, x=dataset[column], title=chart_title, **churn_share_options)
    fig.show()

In [None]:
# Descriptive statistics for the numeric columns: those that are not
# object-dtype and hold more than two distinct values.
numerics = [
    column
    for column in dataset.columns
    if dataset[column].dtype != "O" and len(dataset[column].unique()) > 2
]
dataset[numerics].describe()

In [None]:
# One box plot per numeric column; columns with at most two distinct values
# are skipped.
numeric_frame = dataset[numerics]
for column in numerics:
    if len(dataset[column].unique()) <= 2:
        continue
    fig = px.box(numeric_frame, y=column, title=f"Boxplot of values in the {column} column")
    fig.show()

In [None]:
# Encode the columns that hold only two unique values as integer labels.
# A single LabelEncoder is refit for each column in turn, so after the loop
# it only retains the mapping of the last column processed.
label_encoder = preprocessing.LabelEncoder()
binary_columns = [
    "Churn",
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
]
for binary_column in binary_columns:
    dataset[binary_column] = label_encoder.fit_transform(dataset[binary_column])
dataset

In [None]:
# Correlation heatmap of the numeric variables in the dataset.
# DataFrame.corr() no longer drops non-numeric columns silently: from
# pandas 2.0 on, numeric_only defaults to False and any column still holding
# strings raises ValueError. Selecting the numeric/bool columns first gives
# the same result on old and new pandas versions.
numeric_columns = dataset.select_dtypes(include=["number", "bool"])
correlation = numeric_columns.corr()

fig = px.imshow(correlation, text_auto=".3f", aspect="auto", labels={"color": "Correlation Coefficient"})
# Put the column labels above the matrix.
fig.update_xaxes(side="top")
fig.show()