In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import prep
import preprocess
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from scipy import stats
from datetime import datetime
from env import get_connection
from sklearn.model_selection import train_test_split
from acquire import get_telco_data

## Acquire

In [2]:
df = get_telco_data()

df.head()

found data


Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,paperless_billing,payment_type_id,monthly_charges,total_charges,churn,signup_date,churn_month,payment_type,contract_type,internet_service_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,Yes,2,65.6,593.3,No,2021-04-21 18:07:34,,Mailed check,One year,DSL
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,2,59.9,542.4,No,2021-04-21 18:07:34,,Mailed check,Month-to-month,DSL
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,Yes,1,73.9,280.85,Yes,2021-09-21 18:07:34,2022-01-31,Electronic check,Month-to-month,Fiber optic
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,Yes,1,98.0,1237.85,Yes,2020-12-21 18:07:34,2022-01-31,Electronic check,Month-to-month,Fiber optic
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,Yes,2,83.9,267.4,Yes,2021-10-21 18:07:34,2022-01-31,Mailed check,Month-to-month,Fiber optic


In [3]:
df.shape

(7043, 26)

## Prep

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               7043 non-null   object 
 1   gender                    7043 non-null   object 
 2   senior_citizen            7043 non-null   int64  
 3   partner                   7043 non-null   object 
 4   dependents                7043 non-null   object 
 5   tenure                    7043 non-null   int64  
 6   phone_service             7043 non-null   object 
 7   multiple_lines            7043 non-null   object 
 8   internet_service_type_id  7043 non-null   int64  
 9   online_security           7043 non-null   object 
 10  online_backup             7043 non-null   object 
 11  device_protection         7043 non-null   object 
 12  tech_support              7043 non-null   object 
 13  streaming_tv              7043 non-null   object 
 14  streamin

In [5]:
df.churn.value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

In [6]:
# Find columns with missing values and the total of missing values.

missing = df.isnull().sum()
missing[missing > 0]


churn_month    5174
dtype: int64

In [7]:
# Validate that missing values 

df.isna().sum()


customer_id                    0
gender                         0
senior_citizen                 0
partner                        0
dependents                     0
tenure                         0
phone_service                  0
multiple_lines                 0
internet_service_type_id       0
online_security                0
online_backup                  0
device_protection              0
tech_support                   0
streaming_tv                   0
streaming_movies               0
contract_type_id               0
paperless_billing              0
payment_type_id                0
monthly_charges                0
total_charges                  0
churn                          0
signup_date                    0
churn_month                 5174
payment_type                   0
contract_type                  0
internet_service_type          0
dtype: int64

In [8]:
df = prep.drop_cols(df)
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.6,593.3,No,Mailed check,One year,DSL
1,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4,No,Mailed check,Month-to-month,DSL
2,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.9,280.85,Yes,Electronic check,Month-to-month,Fiber optic
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.0,1237.85,Yes,Electronic check,Month-to-month,Fiber optic
4,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.9,267.4,Yes,Mailed check,Month-to-month,Fiber optic


In [9]:
prep.change_dtype(df)

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.60,593.30,No,Mailed check,One year,DSL
1,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.90,542.40,No,Mailed check,Month-to-month,DSL
2,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.90,280.85,Yes,Electronic check,Month-to-month,Fiber optic
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.00,1237.85,Yes,Electronic check,Month-to-month,Fiber optic
4,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.90,267.40,Yes,Mailed check,Month-to-month,Fiber optic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,0,No,No,13,Yes,No,Yes,No,No,Yes,No,No,No,55.15,742.90,No,Mailed check,One year,DSL
7039,Male,0,Yes,No,22,Yes,Yes,No,No,No,No,No,Yes,Yes,85.10,1873.70,Yes,Electronic check,Month-to-month,Fiber optic
7040,Male,0,No,No,2,Yes,No,No,Yes,No,No,No,No,Yes,50.30,92.75,No,Mailed check,Month-to-month,DSL
7041,Male,0,Yes,Yes,67,Yes,No,Yes,No,Yes,Yes,No,Yes,No,67.85,4627.65,No,Mailed check,Two year,DSL


In [10]:
df.total_charges = df.total_charges.replace(' ', 0)

In [11]:
df.total_charges = df.total_charges.astype(float)
df.total_charges.head()

0     593.30
1     542.40
2     280.85
3    1237.85
4     267.40
Name: total_charges, dtype: float64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 7043 non-null   object 
 1   senior_citizen         7043 non-null   int64  
 2   partner                7043 non-null   object 
 3   dependents             7043 non-null   object 
 4   tenure                 7043 non-null   int64  
 5   phone_service          7043 non-null   object 
 6   multiple_lines         7043 non-null   object 
 7   online_security        7043 non-null   object 
 8   online_backup          7043 non-null   object 
 9   device_protection      7043 non-null   object 
 10  tech_support           7043 non-null   object 
 11  streaming_tv           7043 non-null   object 
 12  streaming_movies       7043 non-null   object 
 13  paperless_billing      7043 non-null   object 
 14  monthly_charges        7043 non-null   float64
 15  tota

In [13]:
df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})

In [14]:
df.signup_date = (pd.to_datetime(df.signup_date)).dt.date

AttributeError: 'DataFrame' object has no attribute 'signup_date'

In [None]:
df.signup_date

# Split train, val, test

In [15]:
train, val, test = prep.train_val_test(df, 'churn')


In [16]:
train.churn.value_counts()

0    3622
1    1308
Name: churn, dtype: int64

In [17]:
train.shape, val.shape, test.shape

((4930, 20), (1056, 20), (1057, 20))

## Exploration

In [None]:
sns.pairplot(df, corner=True)
plt.suptitle("sns.pairplot visualizes continuous variable relationships")
plt.show()


## Correlation

In [None]:
x = train.total_charges
y = train.tenure

In [None]:
def corr(x, y):

    corr, p = stats.pearsonr(x, y)
    
    return corr, p

In [None]:
x.info(), y.info()

In [None]:
corr(x, y)

In [None]:
x = train.monthly_charges
y = train.tenure
corr(x, y)

## Visualize

In [None]:
train.info()

In [None]:
observed = pd.crosstab(train.churn, train.phone_service)
observed


In [None]:
sns.catplot(data=train, x="churn", y="tenure", kind = 'bar')

In [None]:
train.churn.value_counts()

In [None]:
sns.countplot(data= train, x="gender", hue="churn")

In [None]:
train.info()

In [None]:
train[:]

In [None]:
#can I create a functiom the calls all my columns and applies it to a countplot

def c_plot(df, target):
    
    df = df[:]
    
    for col in df:
    
        return sns.countplot(data= df, x= col , hue= target)

In [None]:
c_plot(train, 'churn' )

In [None]:
cross_tab = pd.crosstab(train['churn'], train['gender'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
train = train.drop(columns = 'gender')

In [None]:
val = val.drop(columns = 'gender')
test = test.drop(columns = 'gender')

In [None]:
cross_tab = pd.crosstab(train['churn'], train['dependents'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="dependents", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['senior_citizen'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="senior_citizen", hue="churn")

In [None]:
train.streaming_movies.value_counts()

In [None]:
s = train[train.streaming_movies != 'No internet service']


In [None]:
cross_tab = pd.crosstab(train['churn'], train['payment_type'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="payment_type", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['phone_service'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="phone_service", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['partner'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="partner", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['tech_support'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="tech_support", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['internet_service_type'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="internet_service_type", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['streaming_tv'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="streaming_tv", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['multiple_lines'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="multiple_lines", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['contract_type'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
sns.countplot(data= train, x="contract_type", hue="churn")

In [None]:
cross_tab = pd.crosstab(train['churn'], train['contract_type'])

chi2_stat, p_val, dof, expected = stats.chi2_contingency(cross_tab)

print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_val < 0.05:
    print("There is a significant association between the variables.")
else:
    print("There is no significant association between the variables.")


In [None]:
g = sns.catplot(
    data= train,
    x="tenure", y="churn", row="dependents",
    kind="box", orient="h",
    sharex=False, margin_titles=True,
    height=1.5, aspect=4,
)
g.set(xlabel="tenure", ylabel="churn")
g.set_titles(row_template="{row_name} dependents")
for ax in g.axes.flat:
    ax.xaxis.set_major_formatter('{x:.0f} mo')

## trying to build a function

In [None]:
# an array of all observed values of the subgroup
def ones_onet(df, col, order_by):
    
    sample = df[col == 'Yes'].order_by

# the population mean
    overall_mean = order_by.mean()

    t, p = stats.ttest_1samp(sample, overall_mean)

    print(t, p/2)

In [None]:
col = train.churn
order_by = monthly_charges
ones_onet(train, col, order_by)

## one sample, one tailed, ttest (significantly higher)

In [None]:
train['DateCategory'] = train['signup_date'].dt.day

In [None]:
a = .05

churn_sample = train[train.churn == 'Yes'].signup_date

overall_mean = train.signup_date.mean()

t, p = stats.ttest_1samp(churn_sample, overall_mean)

print(t, p/2)

a =.05

if p/2 > a:
    print("We fail to reject the null hypothesis.")

elif t < 0:
    print("We fail to reject null hypothesis.")

else:
    print("We reject the null hypothesis.")


In [None]:
sns.catplot(data=train, x="churn", y="signup_date", kind = 'bar')

In [None]:
a = .05

churn_sample = train[train.churn == 'Yes'].monthly_charges

overall_mean = train.monthly_charges.mean()

t, p = stats.ttest_1samp(churn_sample, overall_mean)

print(t, p/2)

a =.05

if p/2 > a:
    print("We fail to reject the null hypothesis.")

elif t < 0:
    print("We fail to reject null hypothesis.")

else:
    print("We reject the null hypothesis.")


# one sample, 2 tailed ttest (significantly diff)

In [None]:
churn_sample = train[train.churn == 'Yes'].monthly_charges

overall_mean = train.monthly_charges.mean()

t, p = stats.ttest_1samp(churn_sample, overall_mean)

print(t, p)

a = .05

if p < a:
    
    print(f'the result is significant we reject the null hypothesis with a p_value of {round(p, 2)}.')
    
else:
        
    print(f'we fail to reject the null hypothesis with a p_value of {round(p, 2)}.')



## 2 sample, 1 tailed ttest (significantly higher)

In [None]:
churn_sample = train[train.churn == 'Yes'].monthly_charges
no_churn_sample = train[train.churn == 'No'].monthly_charges

print(churn_sample.var())
print(no_churn_sample.var())

In [None]:
t, p = stats.ttest_ind(churn_sample, no_churn_sample, equal_var=False)
print('t =',t)
print('p =',p)

a = .05

print("is p/2 < alpha? ", p / 2 < alpha)
print("is t > 0? ", t > 0)

if p / 2 > a:
    print("We fail to reject the null hypothesis")
elif t < 0:
    print("We fail to reject the null hypothesis")
else:
    print("We reject the null hypothesis")


## 2 sample, 2 tailed ttest (significantly diff)

In [None]:
t, p = stats.ttest_ind(churn_sample, no_churn_sample, equal_var=False)
print('t =',t)
print('p =',p)

a = .05

if p < a:
    
    print(f'the result is significant we reject the null hypothesis with a p_value of {round(p, 2)}.')
    
else:
        
    print(f'we fail to reject the null hypothesis with a p_value of {round(p, 2)}.')


In [None]:
churn_sample = train[df.churn == 'Yes'].monthly_charges
no_churn_sample = train[df.churn == 'No'].monthly_charges

print(churn_sample.var())
print(no_churn_sample.var())

In [None]:
train.isna().sum()

In [None]:
val.isna().sum()

In [None]:
test.isna().sum()

In [None]:
train.info()

In [18]:
train = preprocess.drop_insign(train)
val = preprocess.drop_insign(val)

## Modeling

In [19]:
X_train = train.drop(columns = 'churn')
y_train = train.churn

X_train = pd.get_dummies(X_train)
X_train = X_train.drop(columns = ['online_security_No internet service', 'online_backup_No internet service', 'device_protection_No internet service', 'tech_support_No internet service', 'streaming_tv_No internet service', 'streaming_movies_No internet service'])
X_train.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,partner_No,partner_Yes,dependents_No,dependents_Yes,online_security_No,online_security_Yes,...,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,internet_service_type_None
5609,0,14,76.45,1117.55,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0
2209,0,5,70.0,347.4,1,0,1,0,1,0,...,0,0,0,1,0,1,0,1,0,0
6919,0,35,75.2,2576.2,0,1,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0
2284,0,58,86.1,4890.5,0,1,1,0,0,1,...,0,0,1,0,0,0,1,1,0,0
845,0,2,49.6,114.7,1,0,1,0,1,0,...,0,0,0,1,1,0,0,1,0,0


In [20]:
X_val = val.drop(columns = 'churn')
y_val = val.churn

X_val = pd.get_dummies(X_val)
X_val = X_val.drop(columns = ['online_security_No internet service', 'online_backup_No internet service', 'device_protection_No internet service', 'tech_support_No internet service', 'streaming_tv_No internet service', 'streaming_movies_No internet service'])
X_val.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,partner_No,partner_Yes,dependents_No,dependents_Yes,online_security_No,online_security_Yes,...,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,internet_service_type_None
6910,0,46,20.2,845.6,0,1,0,1,0,0,...,1,0,0,0,0,1,0,0,0,1
6044,0,40,106.0,4178.65,0,1,1,0,1,0,...,1,0,0,0,0,1,0,0,1,0
2153,0,53,25.55,1336.1,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,1
2089,0,70,40.05,2799.75,1,0,1,0,1,0,...,0,0,0,1,0,0,1,1,0,0
6393,0,3,96.6,291.9,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0


## Baseline

In [21]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 4930 entries, 5609 to 5679
Series name: churn
Non-Null Count  Dtype
--------------  -----
4930 non-null   int64
dtypes: int64(1)
memory usage: 77.0 KB


In [22]:
y_train.mode()

0    0
Name: churn, dtype: int64

In [23]:
(y_train == 0).mean()

0.734685598377282

In [24]:
rf = RandomForestClassifier(
                            max_depth=5, 
                            random_state=42)
rf.fit(X_train, y_train)

In [25]:
print(rf.feature_importances_)


[0.00471728 0.12678011 0.04339304 0.09027248 0.00505294 0.00345947
 0.0012649  0.00210539 0.08264158 0.01529257 0.02280331 0.00399013
 0.01820032 0.0023806  0.08750556 0.01598795 0.00381154 0.00507927
 0.00233399 0.00269052 0.00916707 0.01155356 0.00266808 0.00282312
 0.05433946 0.00284539 0.18447668 0.02237906 0.0627614  0.01721654
 0.07889616 0.01111054]


In [28]:
fi = pd.DataFrame({'feature': X_train.columns,
             'importance': rf.feature_importances_})

fi.sort_values(by = 'importance', ascending = False)

Unnamed: 0,feature,importance
26,contract_type_Month-to-month,0.184477
1,tenure,0.12678
3,total_charges,0.090272
14,tech_support_No,0.087506
8,online_security_No,0.082642
30,internet_service_type_Fiber optic,0.078896
28,contract_type_Two year,0.062761
24,payment_type_Electronic check,0.054339
2,monthly_charges,0.043393
10,online_backup_No,0.022803


In [29]:
rf.score(X_train, y_train)

0.8079107505070994

In [30]:
rf.score(X_val, y_val)

0.7916666666666666