# Exploring Telco Data

# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
import wrangle as w


In [2]:
# Fetch the telco data with w.get_telco() and pass it straight through
# w.wrangle_telco_encoded(); the result is the working frame `df`.
df = w.wrangle_telco_encoded(w.get_telco())

In [11]:
# Split df into train / validate / test frames using the project helper.
# 'churn' is passed as the second argument (presumably the target used for
# stratification — confirm in wrangle.py).
train, validate, test = w.train_validate_test(df, 'churn')
# Display the training split.
train

Unnamed: 0,senior_citizen,partner,dependents,tenure,phone_service,paperless_billing,monthly_charges,total_charges,churn,gender_Male,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
5911,0,0,1,26,0,1,39.95,1023.75,0,1,...,1,0,0,1,0,0,0,0,0,0
2479,0,1,0,47,1,0,26.90,1250.85,0,0,...,0,1,0,1,0,0,1,0,1,0
5889,0,0,0,1,1,1,50.45,50.45,1,0,...,0,0,0,0,0,0,0,0,0,1
6087,0,0,0,69,0,1,60.05,4176.70,0,1,...,1,0,1,0,1,0,0,0,0,0
785,0,0,0,51,1,0,87.55,4475.90,0,0,...,1,0,1,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6102,0,1,0,2,1,0,79.70,165.00,1,0,...,0,0,0,0,0,1,0,0,0,0
356,1,0,0,52,0,1,50.50,2566.30,0,0,...,1,0,0,0,0,0,0,0,1,0
1756,0,0,0,4,1,1,70.20,237.95,1,1,...,0,0,0,0,0,1,0,0,1,0
2127,0,0,0,35,1,0,55.25,1924.10,0,1,...,0,0,0,0,1,0,0,0,0,1


# Is fiber optic a driver of churn?
$H_0$: Customers with fiber optic do not have a higher churn rate than those with DSL.
$H_a$: Customers with fiber optic have a higher churn rate than those with DSL.

KeyError: 'internet_service_type_DSL'

In [9]:
# NOTE(review): `fiber_vs_dsl` is not assigned in any cell visible above this
# point, so this cell appears to rely on kernel state from elsewhere —
# confirm it survives Restart Kernel -> Run All.
fiber_vs_dsl

0       0
1       0
2       1
3       1
4       1
       ..
7038    0
7039    1
7040    0
7041    0
7042    0
Name: internet_service_type_Fiber optic, Length: 7043, dtype: uint8

In [None]:
#create function for wrangle.py
def plot_churn_rate_by_internet_service_type(df):
    """Draw a bar chart of churn by internet service type.

    Rows whose ``internet_service_type`` equals the string 'None' are left
    out before plotting. The chart is rendered with ``plt.show()`` and
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``internet_service_type`` and ``churn`` columns.
    """
    # Boolean mask selecting every row that is not labelled 'None'
    has_service = df['internet_service_type'] != 'None'
    ax = sns.barplot(x='internet_service_type', y='churn', data=df.loc[has_service])
    # Label the plot
    ax.set_title('Churn Rate by Internet Service Type')
    ax.set_xlabel('Internet Service Type')
    ax.set_ylabel('Churn Rate')
    plt.show()

In [None]:
# statistical test
# Build the contingency table for the chi2 test:
# churn on the rows, internet service type on the columns.
from scipy import stats
observed = pd.crosstab(
    index=fiber_vs_dsl['churn'],
    columns=fiber_vs_dsl['internet_service_type'],
)
observed

In [None]:
# Chi-square test of independence on the contingency table `observed`;
# unpack the statistic, p-value, degrees of freedom and expected frequencies,
# then display all four.
chi2, p, degf, expected = stats.chi2_contingency(observed)
chi2, p, degf, expected

In [None]:
# Hand the p-value to the project helper (presumably reports the
# reject / fail-to-reject decision — confirm in wrangle.py).
w.eval_results(p)

In [None]:
# Create a function for the chi2 test for churn and internet service type
def chi2_test_for_churn_and_internet_service_type(df, alpha=0.05):
    """Run and print a chi-square test of independence: churn vs. internet service type.

    Rows whose ``internet_service_type`` equals the string 'None' are dropped
    first, matching the filter used by
    ``plot_churn_rate_by_internet_service_type`` so that the test and the
    plot describe the same set of rows.

    Note that a chi-square test of independence only detects an association
    between the two variables; it does not by itself establish which group
    has the higher churn rate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``churn`` and ``internet_service_type`` columns.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    None
        The observed table, expected table, statistic, p-value and the
        decision are printed.
    """
    # Exclude rows labelled 'None' so they do not add a third column
    # to the contingency table.
    with_service = df[df.internet_service_type != 'None']
    observed = pd.crosstab(with_service.churn, with_service.internet_service_type)
    chi2, p, _degf, expected = stats.chi2_contingency(observed)
    print('Observed\n')
    print(observed.values)
    print('---\nExpected\n')
    print(expected)
    print('---\n')
    print(f'chi^2 = {chi2:.4f}')
    print(f'p     = {p:.4f}')
    if p < alpha:
        print("We reject the null hypothesis")
    else:
        print("We fail to reject the null hypothesis")

### Takeaway: Fiber optic customers have a higher churn rate than DSL customers.

# Is there a price threshold for fiber optic where customers are more likely to churn?

$H_0$: Customers with fiber optic who pay more than the average monthly charge do not have a higher churn rate than those who pay less than the average monthly charge.
$H_a$: Customers with fiber optic who pay more than the average monthly charge have a higher churn rate than those who pay less than the average monthly charge.

In [None]:
# Flag each customer whose monthly charge is strictly above the mean monthly charge.
# NOTE(review): the mean and the flag are computed over every row of `df`,
# not only fiber optic customers — confirm this matches the hypothesis being tested.
avg_monthly_charge = df['monthly_charges'].mean()
df['above_avg_monthly_charge'] = df['monthly_charges'] > avg_monthly_charge
avg_monthly_charge

In [None]:
# Mean of `churn` within each value of the above-average-charge flag
df.groupby('above_avg_monthly_charge')['churn'].mean()

In [None]:
# plot the churn rate for customers at/below (False) vs. above (True) the average monthly charge
sns.barplot(x='above_avg_monthly_charge', y='churn', data=df)
# Label the plot; the average is computed from the data rather than hard-coded,
# so the label stays correct if the underlying frame changes.
plt.title('Churn Rate by Above Average Monthly Charge')
plt.xlabel(f'Above Average Monthly Charge of ${df.monthly_charges.mean():.2f}')
plt.ylabel('Churn Rate')
plt.show()

In [None]:
# statistical test
# Build the contingency table for the chi2 test:
# churn on the rows, above-average-charge flag on the columns.
observed = pd.crosstab(
    index=df['churn'],
    columns=df['above_avg_monthly_charge'],
)
observed

In [None]:
# Chi-square test of independence on the contingency table `observed`;
# unpack the statistic, p-value, degrees of freedom and expected frequencies,
# then display all four.
chi2, p, degf, expected = stats.chi2_contingency(observed)
chi2, p, degf, expected

### Takeaway: Customers with fiber optic who pay more than the average monthly charge have a higher churn rate than those who pay less than the average monthly charge.

In [None]:
# Is there a price threshold for fiber optic where customers are more likely to churn?
# when we look at the distribution of monthly charges, we see that there is a spike at 20 and 25 dollars
