In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly
import altair as alt
from sklearn.model_selection import train_test_split

df = pd.read_csv('./data.csv')
df_train, tv_set = train_test_split(df, test_size=.3, random_state=42)
df_test, df_val = train_test_split(tv_set, test_size = .5, random_state=42)
print(df.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [2]:
df_train.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1695,4906-ZHGPK,Male,0,Yes,Yes,54,Yes,Yes,DSL,No,...,No,Yes,Yes,No,One year,Yes,Electronic check,70.7,3770.0,No
1095,7439-DKZTW,Male,0,No,No,1,Yes,No,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Electronic check,80.55,80.55,No
3889,2592-HODOV,Male,0,No,No,13,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,No,Credit card (automatic),19.3,259.65,No
3667,7826-VVKWT,Female,1,Yes,Yes,24,Yes,No,Fiber optic,No,...,Yes,No,Yes,Yes,Two year,Yes,Electronic check,96.55,2263.45,No
2902,1658-TJVOA,Female,1,No,No,6,Yes,No,Fiber optic,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,74.1,450.9,No


In [3]:
def ConvertToBool(df, cols):
    for col in cols:
        if df[col].dtype != 'string':
            continue
        else:
            df[col] = (df[col] != 'No').astype(bool) 
    return df

def CalcBenefits (df, cols):
    for col in cols:
        df[col] = (df[col] == 'Yes').astype(int)
    return df[cols].sum(axis=1) 

# Lil bit of initial feature engineering
df_train = ConvertToBool(df_train.copy(), ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn'])
df_train['AutoPayment'] = df_train['PaymentMethod'].str.contains('(automatic)', case = True)
df_train['Benefits'] = CalcBenefits(df_train.copy(), ['OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'])
df_train['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)



  df_train['AutoPayment'] = df_train['PaymentMethod'].str.contains('(automatic)', case = True)


In [4]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px



def DistributionPlotter (df, skip_columns, ncolumns):
    df = df.drop(columns=skip_columns)
    plot_row_count = math.ceil((df.shape[1]) / ncolumns)
    fig = make_subplots(rows=plot_row_count, cols=ncolumns, subplot_titles=df.columns)

    for i, col in enumerate(df.columns):
        ncol = (i % ncolumns) + 1 
        nrow = (i // ncolumns) + 1 
        # This one only works since the dataset is synthetic, otherwise a lot of the categorical columns would likely contain too many categories to display on a histplot
        if not np.issubdtype(df[col].dtype, np.number):
            fig.add_trace(
                go.Histogram(x=df[col], name=f'{col}'), 
                row=nrow, col=ncol
            )
        else: 
            fig.add_trace(
                go.Histogram(x=df[col], name=f'{col}', nbinsx=20),
                row=nrow, col=ncol
            )
    fig.update_layout(height=400 * plot_row_count, title_text='Distribution of Columns', showlegend=True)
    fig.show()
grid = DistributionPlotter(df_train, ['customerID'], 4)