In [5]:
import os
from math import sqrt
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import env

In [6]:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    """Build the ``mysql+pymysql`` connection URL for database ``db``.

    Parameters
    ----------
    db : str
        Name of the database to connect to.
    user, host, password : str
        Credentials and server address. They default to the values found in the
        local ``env`` module at the time this definition is executed.

    Returns
    -------
    str
        URL of the form ``mysql+pymysql://user:password@host/db``.
    """
    # Percent-encode the credentials: a raw '@', ':', '/' or '%' in the user
    # name or password would otherwise be read as a URL delimiter/escape.
    safe_user = quote(str(user), safe="")
    safe_password = quote(str(password), safe="")
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'

In [7]:
def get_telco_data():
    """Return the joined telco churn customer data as a DataFrame.

    The data is served from the local cache file ``telco_churn.csv``. When that
    file does not exist yet, the customers table (joined to its contract,
    internet-service and payment lookup tables) is queried from the
    ``telco_churn`` database and written to the cache file first.

    Returns
    -------
    pandas.DataFrame
        The contents of ``telco_churn.csv`` as parsed by ``pd.read_csv``.
    """
    filename = "telco_churn.csv"

    if not os.path.isfile(filename):
        query = '''
SELECT *
FROM customers
JOIN contract_types ON contract_types.contract_type_id = customers.contract_type_id
JOIN internet_service_types ON internet_service_types.internet_service_type_id = customers.internet_service_type_id
JOIN payment_types ON payment_types.payment_type_id = customers.payment_type_id;
'''
        # Cache the query result on disk so later runs skip the database.
        pd.read_sql(query, get_connection("telco_churn")).to_csv(filename)

    # Always hand back the frame as read from the cache, so the first run and
    # every later run see the same columns: the CSV round trip adds the
    # "Unnamed: 0" index column and renames the repeated join keys to
    # "<name>.1", which is the layout clean_telco's drop list is written for.
    return pd.read_csv(filename)

In [14]:
# Load the telco churn data (local CSV cache if present, otherwise the database).
df = get_telco_data()

In [15]:
def clean_telco(df):
    """Return a cleaned copy of the raw telco churn frame.

    Steps:

    * drop identifier, lookup-table and id columns, ``multiple_lines``,
      ``total_charges`` and the artifacts of the CSV cache (``Unnamed: 0`` and
      the ``*.1`` duplicate join keys);
    * encode the Yes/No columns as 1/0 -- for the service columns
      "No internet service" is encoded as 0 as well; any other value is kept
      as it is;
    * add ``total_charges_clean`` as ``monthly_charges * tenure``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as returned by ``get_telco_data``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the changes above applied.
    """
    yes_no = {"Yes": 1, "No": 0}
    service_flag = {"Yes": 1, "No": 0, "No internet service": 0}
    encodings = {
        "partner": yes_no,
        "dependents": yes_no,
        "churn": yes_no,
        "paperless_billing": service_flag,
        "tech_support": service_flag,
        "device_protection": service_flag,
        "online_backup": service_flag,
        "online_security": service_flag,
        "streaming_tv": service_flag,
        "streaming_movies": service_flag,
        "phone_service": service_flag,
    }
    unused_columns = [
        "gender",
        "customer_id",
        "multiple_lines",
        "contract_type",
        "internet_service_type",
        "payment_type",
        "total_charges",
        "internet_service_type_id",
        "contract_type_id",
        "payment_type_id",
        "contract_type_id.1",
        "internet_service_type_id.1",
        "payment_type_id.1",
        "Unnamed: 0",
    ]

    # drop() returns a new frame, so the caller's data stays intact and the
    # function can be called again on the same input. errors="ignore" because
    # "Unnamed: 0" and the "*.1" names only exist after a CSV round trip.
    cleaned = df.drop(columns=unused_columns, errors="ignore")

    for column, mapping in encodings.items():
        # Fall back to the original value so anything outside the mapping is
        # left untouched instead of turning into NaN.
        cleaned[column] = cleaned[column].map(
            lambda value, mapping=mapping: mapping.get(value, value)
        )

    cleaned["total_charges_clean"] = cleaned.monthly_charges * cleaned.tenure
    return cleaned
    

In [16]:
# Encode the Yes/No columns as 1/0, drop the id/lookup columns and add total_charges_clean.
df = clean_telco(df)

In [18]:
# Number of rows in the cleaned frame.
len(df)

7043

In [19]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,senior_citizen,partner,dependents,tenure,phone_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,churn,total_charges_clean
0,0,1,1,65,1,1,1,1,1,1,1,1,90.45,0,5879.25
1,0,0,0,54,0,1,0,0,1,1,0,0,45.2,0,2440.8
2,0,0,0,56,0,1,1,1,1,0,0,0,45.05,0,2522.8
3,0,1,1,20,0,1,0,1,1,0,0,1,39.4,0,788.0
4,0,1,0,72,1,0,1,1,1,1,1,1,85.15,0,6130.8


In [20]:
# add_ons = how many of the seven 1/0 service flags are set for each customer.
df["add_ons"] = (
    df.phone_service
    + df.online_security
    + df.online_backup
    + df.device_protection
    + df.tech_support
    + df.streaming_tv
    + df.streaming_movies
)

In [21]:
# Preview again to check the new add_ons column.
df.head()

Unnamed: 0,senior_citizen,partner,dependents,tenure,phone_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,churn,total_charges_clean,add_ons
0,0,1,1,65,1,1,1,1,1,1,1,1,90.45,0,5879.25,7
1,0,0,0,54,0,1,0,0,1,1,0,0,45.2,0,2440.8,3
2,0,0,0,56,0,1,1,1,1,0,0,0,45.05,0,2522.8,4
3,0,1,1,20,0,1,0,1,1,0,0,1,39.4,0,788.0,3
4,0,1,0,72,1,0,1,1,1,1,1,1,85.15,0,6130.8,6
