In [115]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# from helper import get_mi_score
%matplotlib inline

In [None]:
# Turn the axes grid on by default for every figure drawn after this cell.
plt.rc("axes", grid=True)

In [None]:
# Location of the Telco customer-churn CSV.
# Set the TELCO_CHURN_CSV environment variable to point at your own copy of the file;
# when it is not set, the original author's local path is used as the fallback.
TELCO_CSV_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    r"C:\Users\kidus\synergyTelecom\data\WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

# Rows are indexed by the "customerID" column.
telecom_data = pd.read_csv(TELCO_CSV_PATH, index_col="customerID")

*"Fix" the TotalCharges column*

In [None]:
# Customers with no tenure have no TotalCharges: their cells hold an empty string instead of
# a number. Fill exactly those cells with 0 so the column can be converted to float.
no_tenure_mask = telecom_data["tenure"] == 0
no_tenure = telecom_data[no_tenure_mask]   # snapshot of the affected rows, handy for inspection

# Write only the cells that need fixing; every other column and row is left untouched.
telecom_data.loc[no_tenure_mask, "TotalCharges"] = 0

# astype(float) raises if any non-numeric value is still present, so unexpected blanks
# (i.e. blanks on rows with tenure > 0) fail loudly here instead of slipping through.
telecom_data["TotalCharges"] = telecom_data["TotalCharges"].astype(float)

*KDE to show the relationship between tenure and churn*

In [None]:
# Label the current axes first, then draw one unfilled tenure density curve per Churn value.
plt.gca().set(
    title="Kernel Density Estimate of tenure for Churned vs Unchurned customers",
    xlabel="Tenure (months)",
)
sns.kdeplot(x="tenure", hue="Churn", fill=False, data=telecom_data)

*Illustrate the relationship between contract type and churn*

In [None]:
# Bar counts for each contract type, split by Churn value.
plt.gca().set_title("Contract type of churned and unchurned customers")
sns.countplot(x="Contract", hue="Churn", data=telecom_data)

# Cell output: number of customers per contract type.
telecom_data["Contract"].value_counts()

*Monthly Charges for churned and unchurned customers*

In [None]:
# Author's takeaway from this plot: MonthlyCharges isn't, on its own, discriminative enough.
plt.gca().set_xlabel("Monthly charges")
sns.kdeplot(x="MonthlyCharges", hue="Churn", data=telecom_data)

In [None]:
# Customer counts per SeniorCitizen value, split by Churn value.
sns.countplot(x="SeniorCitizen", hue="Churn", data=telecom_data)

In [None]:
# Target: 1 for churned customers ("Yes"), 0 otherwise ("No").
y = telecom_data["Churn"].map({"Yes": 1, "No": 0})

# Columns that only hold Yes/No values, i.e. plain binary flags.
closed_columns = ["Partner", "Dependents", "PhoneService"]
yes_no_codes = {"No": 0, "Yes": 1}     # 0 means No and 1 means Yes.
gender_codes = {"Male": 0, "Female": 1}   # 0 refers to Male and 1 refers to Female.

# Features: everything except the target, with gender and the Yes/No columns
# replaced by their integer codes (column order is unchanged).
X = telecom_data.drop(columns="Churn")
X = X.assign(
    gender=X["gender"].map(gender_codes),
    **{name: X[name].map(yes_no_codes) for name in closed_columns},
)

In [None]:
# only_nums_X = X.select_dtypes(include= "number")
# num_features = only_nums_X.dtypes == float | int
# get_mi_score(only_nums_X, y, discrete_features= num_features)

In [None]:
# Bare annotations: type hints for editors only, they assign nothing.
X_train: pd.DataFrame
X_test: pd.DataFrame
y_train: pd.Series
y_test: pd.Series

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

*Select columns for encoding and encode appropriately*

In [112]:
# Continuous features: standardised by StandardScaler.
numerical_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
# Text-like features (whatever is still object/category/string typed): one-hot encoded.
categorical_cols = X_train.select_dtypes(include=["object", "category", "string"]).columns.to_list()

std_scaler = StandardScaler()
# handle_unknown="ignore": a category not seen during fit is encoded as all zeros
# instead of raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", std_scaler, numerical_cols),
        ("categorical", categorical_transformer, categorical_cols),
    ],
    # ColumnTransformer drops every column not listed above by default. The features already
    # encoded as 0/1 earlier in the notebook (gender, Partner, Dependents, PhoneService) and any
    # other remaining numeric column are in neither list, so pass them through unchanged
    # instead of silently discarding them.
    remainder="passthrough",
)


*Train models with encoded data to compare results.*