In [1]:
#Import Libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from pydataset import data
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

import explore as e
import wrangle as w
from env import host, user, password

warnings.filterwarnings('ignore')


# Acquire
* Data acquired from Codeup database
* The data set has 5 columns and 52,441 rows before cleaning
* Each row is a single property
* Each column contains information about the properties

In [2]:
# Load the Zillow property records through the project's wrangle helper.
df = w.get_zillow_data()

In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,fips
0,4.0,3.5,3100.0,1023282.0,6059.0
1,2.0,1.0,1465.0,464000.0,6111.0
2,3.0,2.0,1243.0,564778.0,6059.0
3,4.0,3.0,2376.0,145143.0,6037.0
4,4.0,3.0,2962.0,773303.0,6037.0


In [4]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52441 entries, 0 to 52440
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bedroomcnt                    52441 non-null  float64
 1   bathroomcnt                   52441 non-null  float64
 2   calculatedfinishedsquarefeet  52359 non-null  float64
 3   taxvaluedollarcnt             52440 non-null  float64
 4   fips                          52441 non-null  float64
dtypes: float64(5)
memory usage: 2.0 MB


In [5]:
# Summary statistics, transposed so each column becomes one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bedroomcnt,52441.0,3.300681,0.949094,0.0,3.0,3.0,4.0,14.0
bathroomcnt,52441.0,2.299403,1.022772,0.0,2.0,2.0,3.0,18.0
calculatedfinishedsquarefeet,52359.0,1922.890754,1004.365838,128.0,1268.0,1659.0,2306.0,21929.0
taxvaluedollarcnt,52440.0,529688.163921,751894.553363,1000.0,193747.5,373612.0,619301.25,49061236.0
fips,52441.0,6049.132149,21.029067,6037.0,6037.0,6037.0,6059.0,6111.0


In [6]:
# (rows, columns) of the raw frame before any cleaning
df.shape


(52441, 5)

In [7]:
# Frequency of each bedroom count; useful for spotting rare extreme values.
df.bedroomcnt.value_counts()


3.0     23359
4.0     15240
2.0      8340
5.0      3973
6.0       635
1.0       612
0.0       137
7.0       106
8.0        24
9.0         8
12.0        3
10.0        2
14.0        1
11.0        1
Name: bedroomcnt, dtype: int64

In [8]:
# Frequency of each bathroom count; useful for spotting rare extreme values.
df.bathroomcnt.value_counts()


2.0     21893
3.0     10673
1.0      9568
2.5      3934
4.0      2227
3.5       918
1.5       841
5.0       803
4.5       687
6.0       322
5.5       224
0.0       121
7.0        88
8.0        53
6.5        47
7.5        16
9.0        13
10.0        5
11.0        3
8.5         3
18.0        1
13.0        1
Name: bathroomcnt, dtype: int64

In [9]:
# Percent of rows that would be lost by dropping every row with at least one null.
# Computed from the frame itself rather than hard-coding 83 / 52441, so the
# figure stays correct if the source data changes or one row holds several nulls.
df.isnull().any(axis=1).mean() * 100

0.1582731069201579

In [10]:
# Distinct values in the fips column (three appear in the recorded output)
df.fips.unique()

array([6059., 6111., 6037.])

In [11]:
# Number of nulls in each column of the raw frame
df.isnull().sum()

bedroomcnt                       0
bathroomcnt                      0
calculatedfinishedsquarefeet    82
taxvaluedollarcnt                1
fips                             0
dtype: int64

# PREPARE

### Tidy Data: 
* Drop unnecessary axis
* Rename
* Find nulls
* Drop nulls 
* Chek preperation 

In [12]:
# Clean and prepare the data
# NOTE(review): `df` is rebound to the prepared frame, so re-running this cell
# feeds already-prepared data back into prep_zillow_data — re-run from the
# acquire cell instead.
# NOTE(review): the recorded run went from 52,441 rows to 46,765, far more than
# the 83 null values found above — presumably prep_zillow_data also filters
# rows (e.g. outliers); confirm in wrangle.py and document it.
df = w.prep_zillow_data(df) 

In [13]:
# Preview the prepared frame to confirm the renamed columns.
df.head(5)

Unnamed: 0,bedroom,bathroom,sqft,home_value,county
0,4.0,3.5,3100,1023282.0,Orange
1,2.0,1.0,1465,464000.0,Ventura
2,3.0,2.0,1243,564778.0,Orange
3,4.0,3.0,2376,145143.0,Los Angeles
4,4.0,3.0,2962,773303.0,Los Angeles


In [14]:
# Dtypes and non-null counts after preparation.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46765 entries, 0 to 52440
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bedroom     46765 non-null  float64
 1   bathroom    46765 non-null  float64
 2   sqft        46765 non-null  int64  
 3   home_value  46765 non-null  float64
 4   county      46765 non-null  object 
dtypes: float64(3), int64(1), object(1)
memory usage: 2.1+ MB


In [15]:
# Despite its name, `drop_nulls` drops nothing: it holds the per-column null
# counts of the prepared frame, used to confirm none remain.
drop_nulls = df.isnull().sum()
drop_nulls


bedroom       0
bathroom      0
sqft          0
home_value    0
county        0
dtype: int64

In [16]:
# (rows, columns) after preparation.
df.shape


(46765, 5)

In [17]:
# Check every column for remaining nulls (True means the column still has one)
df.isnull().any()

bedroom       False
bathroom      False
sqft          False
home_value    False
county        False
dtype: bool

In [18]:
# Names of the columns that still contain nulls (an empty Index means none).
df.columns[df.isnull().any()]


Index([], dtype='object')

In [19]:
# Final look at the prepared frame before splitting.
df.head()


Unnamed: 0,bedroom,bathroom,sqft,home_value,county
0,4.0,3.5,3100,1023282.0,Orange
1,2.0,1.0,1465,464000.0,Ventura
2,3.0,2.0,1243,564778.0,Orange
3,4.0,3.0,2376,145143.0,Los Angeles
4,4.0,3.0,2962,773303.0,Los Angeles


In [20]:
# Split the data into train, validate, and test sets
# NOTE(review): the recorded run of this cell raised
# "NameError: name 'train_test_split' is not defined". This notebook does import
# train_test_split, so the missing import is presumably inside wrangle.py
# (split_data) — confirm and fix it there; every later cell depends on `train`.
train, validate, test = w.split_data(df)


NameError: name 'train_test_split' is not defined

# EXPLORE

In [None]:
# Explore senior citizen status vs churn.
# NOTE(review): this cell reads `senior_citizen` and `churn`, but the frame
# prepared earlier in this notebook has bedroom, bathroom, sqft, home_value and
# county — confirm which dataset `train` is meant to hold at this point.
senior = sns.countplot(data=train, x='senior_citizen', hue='churn')
senior.set(
    xlabel='Senior Citizen',  # was misspelled 'Senior Citzen ' with a trailing space
    ylabel='Customer Count',
    title='Does being a senior citizen have a significant impact on customer churn?',
)
plt.show()


# Exploring and Visualizations Findings

* The visualization shows that there are more senior citizens who churned than non-senior citizens who churned.
* The proportion of senior citizens who churned is higher than the proportion of non-senior citizens who churned.
* This suggests that senior citizen status is a significant factor that can influence customer churn.

In [None]:
# Visualize the continuous variables in train with the project's explore helper.
e.plot_variable_pairs(train)

In [None]:
Exploring and Visualizations Tenure Findings

In [None]:
The scatterplot shows that there is a general trend of churn increasing with tenure.
However, there are some customers who churned early in their tenure, and some customers who did not churn even after a long tenure.
This suggests that tenure is not the only factor that can influence customer churn.

In [None]:
# Count of customers per internet service type, split by churn.
st = sns.countplot(x='internet_service_type', hue='churn', data=train)
st.set(xlabel='internet_service_type', ylabel='Customer Count')
st.set_title('Does internet service type have a significant impact on customer churn?')
plt.show()

In [None]:
Exploring and Visualizations Internet Service Type Findings

In [None]:
The visualization shows that there are more customers with DSL who churned than customers with fiber optic or no internet service who churned.
The proportion of customers with DSL who churned is higher than the proportion of customers with fiber optic or no internet service who churned.
This suggests that internet service type is a significant factor that can influence customer churn.
Statistical Testing

In [None]:
Senior Citizen Hypothesis:
• alpha = .05 
• H0 = Senior Citizen status is independent of customer churn 
• Ha = Senior Citizen status is dependent on customer churn

In [None]:
# Significance level and the observed senior_citizen x churn contingency table.
alpha = 0.05
senior_observed = pd.crosstab(index=train['senior_citizen'], columns=train['churn'])
senior_observed

In [None]:
# Example contingency table.
# These counts are typed in by hand; they are not derived from `senior_observed`.
_senior_rows = [
    (2514, 787),
    (383, 259),
]
senior_observed1 = np.array(_senior_rows)

In [None]:
# Chi-square test of independence on the senior_citizen x churn table.
chi2, p, dof, expected = chi2_contingency(senior_observed)

In [None]:
# Report every value returned by the chi-square test, one per line.
for label, value in (
    ("Chi-square statistic:", chi2),
    ("p-value:", p),
    ("Degrees of freedom:", dof),
    ("Expected frequencies:", expected),
):
    print(label, value)

In [None]:
# Heat map of the hand-entered senior-citizen example table.
fig, ax = plt.subplots()
heat = ax.matshow(senior_observed1, cmap='RdYlBu')
fig.colorbar(heat)

# Annotate each cell with its count (x is the column index, y is the row index).
for (row, col), count in np.ndenumerate(senior_observed1):
    ax.text(col, row, count, va='center', ha='center')

ax.set_xlabel('Column')
ax.set_ylabel('Row')
ax.set_title('Observed Frequencies')

plt.show()



                        

In [None]:
# Compare the p-value with alpha and state the outcome of the test.
is_significant = p < alpha
print(f'The p-value is less than the alpha: {is_significant}')

outcome = 'Outcome: We reject the null' if is_significant else "Outcome: We fail to reject the null"
print(outcome)
    

In [None]:
Tenure Hypothesis:

* alpha = .05
* H0 = Tenure is independent of customer churn
* Ha = Tenure is dependent on customer churn

In [None]:
# Significance level and the observed tenure x churn contingency table.
alpha = 0.05
tenure_observed = pd.crosstab(index=train['tenure'], columns=train['churn'])
tenure_observed

In [None]:
# Example contingency table.
# These counts are typed in by hand; they are not derived from `tenure_observed`.
_tenure_rows = [(8, 0), (121, 212), (60, 72)]
tenure_observed1 = np.array(_tenure_rows)

In [None]:
# Chi-square test of independence on the tenure x churn table.
# This must use `tenure_observed`: passing `senior_observed` here would just
# repeat the senior-citizen test and report its result as the tenure verdict.
chi2, p, dof, expected = chi2_contingency(tenure_observed)

In [None]:
# Report every value returned by the chi-square test, one per line.
chi2_report = {
    "Chi-square statistic:": chi2,
    "p-value:": p,
    "Degrees of freedom:": dof,
    "Expected frequencies:": expected,
}
for label, value in chi2_report.items():
    print(label, value)

In [None]:
# Heat map of the hand-entered tenure example table.
fig, ax = plt.subplots()
heat = ax.matshow(tenure_observed1, cmap='RdYlBu')
fig.colorbar(heat)

# Annotate each cell with its count (x is the column index, y is the row index).
for (row, col), count in np.ndenumerate(tenure_observed1):
    ax.text(col, row, count, va='center', ha='center')

ax.set_xlabel('Column')
ax.set_ylabel('Row')
ax.set_title('Observed Frequencies')

plt.show()

In [None]:
# Compare the p-value with alpha and state the outcome of the test.
reject_null = p < alpha
print(f'The p-value is less than the alpha: {reject_null}')

print('Outcome: We reject the null' if reject_null else "Outcome: We fail to reject the null")
    

In [None]:
Internet Service Type Hypothesis:
alpha = .05
H0 = Internet Service Type is independent of customer churn
Ha = Internet Service Type is dependent on customer churn

In [None]:
# Significance level and the observed internet_service_type x churn table.
alpha = 0.05
internet_observed = pd.crosstab(index=train['internet_service_type'], columns=train['churn'])
internet_observed


In [None]:
# Example contingency table.
# These counts are typed in by hand; they are not derived from `internet_observed`.
_internet_rows = [(1095, 254), (1019, 726), (783, 66)]
internet_observed1 = np.array(_internet_rows)

In [None]:
# Perform the chi-square test of independence on the
# internet_service_type x churn table.
chi2, p, dof, expected = chi2_contingency(internet_observed)

In [None]:
# Report every value returned by the chi-square test, one per line.
labels = ("Chi-square statistic:", "p-value:", "Degrees of freedom:", "Expected frequencies:")
for label, value in zip(labels, (chi2, p, dof, expected)):
    print(label, value)

In [None]:
# Heat map of the hand-entered internet-service example table.
fig, ax = plt.subplots()
heat = ax.matshow(internet_observed1, cmap='RdYlBu')
fig.colorbar(heat)

# Annotate each cell with its count (x is the column index, y is the row index).
for (row, col), count in np.ndenumerate(internet_observed1):
    ax.text(col, row, count, va='center', ha='center')

ax.set_xlabel('Column')
ax.set_ylabel('Row')
ax.set_title('Observed Frequencies')

plt.show()

In [None]:
# Compare the p-value with alpha and state the outcome of the test.
below_alpha = p < alpha
print(f'The p-value is less than the alpha: {below_alpha}')

print('Outcome: We reject the null' if below_alpha else "Outcome: we fail to reject the null")

In [None]:
Statistical Findings
Senior Citizen Hypothesis - We reject the Null Hypothesis, senior citizen status is dependent on customer churn.
Tenure Hypothesis - We reject the Null Hypothesis, tenure is dependent on customer churn.
Internet Service Hypothesis - We reject the Null Hypothesis, internet service type is dependent on customer churn.

In [None]:
MODEL

In [None]:
# Build the feature matrix (X) and churn target (y) for each of the three splits.
def _features_and_target(frame):
    """Return (non-object columns without churn, churn column) for one split."""
    numeric = frame.select_dtypes(exclude=['object'])
    return numeric.drop(columns=['churn']), numeric.churn


X_train, y_train = _features_and_target(train)
X_validate, y_validate = _features_and_target(validate)
X_test, y_test = _features_and_target(test)

In [None]:
Evaluate on Baseline

In [None]:
# Class balance of churn in train. The baseline cell that follows treats 0 as
# the majority class — confirm that against the counts shown here.
train.churn.value_counts()

In [None]:
# Baseline accuracy: the share of training rows whose churn value is 0.
baseline = y_train.eq(0).mean()
print('The baseline accuracy is: {:.2%}'.format(baseline))

In [None]:
RANDOM FOREST

In [None]:
# Make the Random Forest Classifier model.
# NOTE(review): the name `random` would shadow the stdlib module of the same
# name if it is ever imported; it is kept because later cells refer to it.
rf_settings = {
    'bootstrap': True,
    'class_weight': None,
    'criterion': 'gini',
    'min_samples_leaf': 1,
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 40,
}
random = RandomForestClassifier(**rf_settings)

In [None]:
# Fit the Random Forest Classifier on the training split only.
random.fit(X_train, y_train)

In [None]:
# Score the Random Forest on train and validate, and compare with the baseline.
# The closing statement is derived from the actual scores instead of being
# printed unconditionally, so it cannot contradict the numbers above it.
rf_train_acc = random.score(X_train, y_train)
rf_validate_acc = random.score(X_validate, y_validate)
print(f'training score: {rf_train_acc:.2%}')
print(f'validate score: {rf_validate_acc:.2%}')
if min(rf_train_acc, rf_validate_acc) > baseline:
    print('Random forest scores are both higher than baseline accuracy')
else:
    print('Random forest does not beat baseline accuracy on both train and validate')

In [None]:
LOGISTIC REGRESSION

In [None]:
# Make and fit the Logistic Regression model, then predict on the training rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
logreg = LogisticRegression(C=1, random_state=40).fit(X_train, y_train)
y_pred = logreg.predict(X_train)

In [None]:
# Score logistic regression on train and validate, and compare with the baseline.
# The closing statement is derived from the actual scores instead of being
# printed unconditionally, so it cannot contradict the numbers above it.
logreg_train_acc = logreg.score(X_train, y_train)
logreg_validate_acc = logreg.score(X_validate, y_validate)
print(f'logistic regression training score: {logreg_train_acc:.2%}')
print(f'logistic regression validate score: {logreg_validate_acc:.2%}')
if min(logreg_train_acc, logreg_validate_acc) > baseline:
    print('Logistic regression scores are both higher than baseline accuracy')
else:
    print('Logistic regression does not beat baseline accuracy on both train and validate')

In [None]:
DECISION TREE

In [None]:
# Make the Decision Tree (depth capped at 3) and fit it on the training split.
# fit() returns the same estimator, so `decision` ends up bound to the fitted model.
decision = DecisionTreeClassifier(random_state=40, max_depth=3)
decision.fit(X_train, y_train)

In [None]:
# Score the decision tree on train and validate, and compare with the baseline.
# The closing statement is derived from the actual scores instead of being
# printed unconditionally, so it cannot contradict the numbers above it.
tree_train_acc = decision.score(X_train, y_train)
tree_validate_acc = decision.score(X_validate, y_validate)
print(f'Decision tree training score: {tree_train_acc:.2%}')
print(f'Decision tree validate score: {tree_validate_acc:.2%}')
if min(tree_train_acc, tree_validate_acc) > baseline:
    print('Decision tree scores are both higher than baseline accuracy')
else:
    print('Decision tree does not beat baseline accuracy on both train and validate')

In [None]:
TOP Model Selection
Although the Random Forest model scored higher than the baseline accuracy of about 73%, and its scores were consistent across the training and validation sets, the Decision Tree and Logistic Regression models had better accuracy and consistency than the Random Forest, with both achieving an accuracy of around 79%. The Decision Tree model was chosen.

TOP MODEL Test

In [None]:
# Predict on the held-out test split with the tree trained on the training split.
# The model is deliberately NOT re-fit here: calling decision.fit(X_test, y_test)
# would train on the test labels, so the "test" score would really be a training
# score and would no longer estimate performance on unseen data.
y_prediction = decision.predict(X_test)

In [None]:
# Report the test accuracy next to train/validate accuracy and the baseline.
# The summary is built from computed scores rather than hard-coded percentages,
# so it stays correct whenever the data or the model changes.
test_acc = decision.score(X_test, y_test)
print(f'Decision tree test score: {test_acc:.2%}')
print(
    'Decision tree accuracy - '
    f'train: {decision.score(X_train, y_train):.2%}, '
    f'validate: {decision.score(X_validate, y_validate):.2%}, '
    f'test: {test_acc:.2%} '
    f'(baseline: {baseline:.2%})'
)

In [None]:
# Per-customer test predictions: churn probability (second predict_proba column)
# and the predicted label, keyed by customer_id.
churn_probability = decision.predict_proba(X_test)[:, 1]
predicted_label = decision.predict(X_test)

y_test_pred = pd.DataFrame({
    'customer_id': test['customer_id'],
    'Probability of churn': churn_probability,
    'Test Prediction': predicted_label,
})
y_test_pred.head()

In [None]:
# Classification report comparing the true test labels with the test predictions.
print(classification_report(y_test, y_prediction))


In [None]:

Conclusion Summary
Three features selected based on their visual significance and chi-square statistical testing for training the Classification Model to determine their significant relationship to churn.
Senior Citizen Hypothesis - We reject the Null Hypothesis, senior citizen status is dependent on customer churn.
Tenure Hypothesis - We reject the Null Hypothesis, tenure is dependent on customer churn.
Internet Service Hypothesis - We reject the Null Hypothesis, internet service type is dependent on customer churn.
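Decision Tree, Logistic Regression, and Random Forest models were implemented with a fixed random seed of 40 so that results are reproducible; tree depth was capped (max_depth of 10 for the Random Forest and 3 for the Decision Tree) to limit overfitting.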
The Decision Tree and Logistic Regression models consistently outperformed the baseline accuracy of 73%. The Decision Tree model was selected as the top model and achieved an accuracy of approximately 80% on the train, validate, and test sets.
Recommendations
Telco should focus on retaining senior citizens by providing them with special discounts or services.
Telco should focus on retaining customers with long tenure by providing them with loyalty programs or other incentives.
Telco should focus on improving internet service speeds and coverage for customers with DSL, and on bundle deals for customers with no internet service.
Takeaways
The factors of senior citizen status, tenure, and internet service type can influence customer churn at Telco.

Telco should focus on retaining customers who are more likely to churn by providing them with special discounts, service upgrades, or incentives

Improving these factors can reduce churn rates.