C4. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np


# Load the raw churn data; the first CSV column is used as the DataFrame index
churn = pd.read_csv(filepath_or_buffer='churn_raw_data.csv', index_col=0)

# Print the column names, non-null counts and dtypes
churn.info()

: 

In [None]:
# Preview the first five rows of the data
churn.head(n=5)

: 

In [None]:
# Generate summary statistics for the churn DataFrame; as the last expression
# of the cell, the resulting table is displayed by the notebook
churn.describe()

: 

In [None]:
# Rows whose Customer_id repeats an earlier row (Nkmk, n.d.).
customer_dup_mask = churn.duplicated(subset=['Customer_id'])
duplicate_customer = churn.loc[customer_dup_mask]
print(duplicate_customer)

# Rows whose Interaction repeats an earlier row
interaction_dup_mask = churn.duplicated(subset=['Interaction'])
duplicate_interaction = churn.loc[interaction_dup_mask]

# An empty DataFrame in either printout means no duplicates were found
print(duplicate_interaction)


: 

In [None]:
# Count the missing (null / NA) entries in every column
null_values = churn.isna().sum()
print(null_values)

: 

: 

In [None]:
# Columns to leave out of the outlier scan
exclude_columns = [
    'item1', 'item2', 'item3', 'item4', 'item5', 'item6', 'item7', 'item8',
    'Lat', 'Lng', 'CaseOrder', 'Zip',
]

# Exploratory copy: excluded columns removed, then every row that still has a
# missing value dropped so the quantiles below use complete records only
churn_filtered = churn.drop(columns=exclude_columns).dropna()

# Restrict the scan to quantitative (float64 / int64) columns
quantitative_columns = churn_filtered.select_dtypes(include=['float64', 'int64'])

# Detect outliers with the 1.5 * IQR rule (Chaudhary, 2020)
for column in quantitative_columns.columns:
    q1 = churn_filtered[column].quantile(0.25)
    q3 = churn_filtered[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = churn_filtered[(churn_filtered[column] < lower_bound) | (churn_filtered[column] > upper_bound)]
    # Only report the columns that actually contain out-of-bound values
    if not outliers.empty:
        print(f"Statistics for {column} with outliers:")
        print(outliers[column].describe())
        print(f"Number of outliers: {len(outliers)}")
        print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
        print(f"Range: {outliers[column].max() - outliers[column].min()}")
        print("----")

# Visualize outliers: one boxplot figure per quantitative column.
# A new figure is opened inside the loop for each column, so no figure is
# created up front (an extra one would only render as an empty plot).
for column in quantitative_columns.columns:
    plt.figure()
    churn_filtered.boxplot(column=[column])
    plt.title(f'Boxplot of {column}')
plt.show()

: 

In [None]:
# Flag implausible records: more children reported than the area's population
children_exceed_population = churn_filtered['Children'] > churn_filtered['Population']
pop_error = churn_filtered.loc[children_exceed_population]
print(pop_error)

: 

D5

In [None]:
# Impute null values in Children with 0.
# The filled column is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x and leaves
# `churn` unchanged once Copy-on-Write is enabled.
churn['Children'] = churn['Children'].fillna(0)

# Verify null count is 0
null_children = churn['Children'].isnull().sum()
print(null_children)

: 

In [None]:
# Report the shape of churn before dropping rows
print("Dimensions of churn before dropping rows:", churn.shape)

# Drop the rows where the children count exceeds the population count
invalid_rows = churn.index[churn['Children'] > churn['Population']]
churn = churn.drop(index=invalid_rows)

# Report the shape again to confirm the rows were removed
print("Dimensions of churn after dropping rows:", churn.shape)


: 

In [None]:
# Replace null values in Age, Income, Tenure and Bandwidth_GB_Year with the
# respective column's average.
# Each filled column is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x and leaves
# `churn` unchanged once Copy-on-Write is enabled.
for mean_column in ['Age', 'Income', 'Tenure', 'Bandwidth_GB_Year']:
    churn[mean_column] = churn[mean_column].fillna(churn[mean_column].mean())

# Verify null values are replaced
null_age = churn['Age'].isnull().sum()
null_income = churn['Income'].isnull().sum()
null_tenure = churn['Tenure'].isnull().sum()
null_bandwidth = churn['Bandwidth_GB_Year'].isnull().sum()

print(f"Null values in age after replacement: {null_age}")
print(f"Null values in income after replacement: {null_income}")
print(f"Null values in tenure after replacement: {null_tenure}")
print(f"Null values in Bandwidth_GB_Year after replacement: {null_bandwidth}")


: 

In [None]:
# Impute null values in the 'Techie', 'Phone' and 'TechSupport' columns with
# each column's mode (its most frequent value; mode()[0] takes the first one
# when several values tie).
# Each filled column is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x and leaves
# `churn` unchanged once Copy-on-Write is enabled.
for mode_column in ['Techie', 'Phone', 'TechSupport']:
    churn[mode_column] = churn[mode_column].fillna(churn[mode_column].mode()[0])

# Verification
null_techie = churn['Techie'].isnull().sum()
null_phone = churn['Phone'].isnull().sum()
null_techsupport = churn['TechSupport'].isnull().sum()

print(f"Null values in 'Techie' after replacement: {null_techie}")
print(f"Null values in 'Phone' after replacement: {null_phone}")
print(f"Null values in 'TechSupport' after replacement: {null_techsupport}")


: 

In [None]:
# Calculating the bounds using the IQR Method (Chaudhary, 2020).
# The bounds are computed on, and applied to, `churn`: that is the DataFrame
# written to the cleaned CSV and analysed afterwards, so the outlier treatment
# has to happen there rather than only on the exploratory `churn_filtered` copy.
column = 'Outage_sec_perweek'
q1 = churn[column].quantile(0.25)
q3 = churn[column].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keeping a copy of the original churn df before dropping outliers
churn_before_outliers = churn.copy()

# Dropping outliers outside the lower and upper bounds (bounds are inclusive);
# .copy() makes the result an independent DataFrame rather than a slice
within_bounds = churn[column].between(lower_bound, upper_bound)
churn = churn[within_bounds].copy()

# Apply the same bounds to the exploratory frame so both stay consistent
churn_filtered = churn_filtered[churn_filtered[column].between(lower_bound, upper_bound)]

# Verify that outliers were removed
print(f"Number of outliers in {column} before: {len(churn_before_outliers) - len(churn)}")
print(f"Number of rows after removing outliers: {len(churn)}")


: 

In [None]:
# Output the cleaned data to CSV, leaving the DataFrame index out of the file
churn.to_csv(path_or_buf='cleaned_churn_data.csv', index=False)

: 

In [None]:
# Perform PCA (Western Governors University, n.d.)

# Select quantitative variables
quantitative_variables = churn[[
    'Population', 'Children', 'Age', 'Income', 'Outage_sec_perweek', 'Email',
    'Contacts', 'Yearly_equip_failure', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year',
]]

# Normalize the data: centre each variable on its mean and scale by its
# sample standard deviation
churn_normalized = (quantitative_variables - quantitative_variables.mean()) / quantitative_variables.std()

# Keep as many components as there are input variables
pca = PCA(n_components=churn_normalized.shape[1])

# Fit the PCA on the normalized data
pca.fit(churn_normalized)

# Label the components PC1..PCk from the fitted model, so the labels always
# match the number of variables selected above
pc_labels = [f'PC{i}' for i in range(1, pca.n_components_ + 1)]

# Create a new dataset of principal-component scores
churn_pca = pd.DataFrame(pca.transform(churn_normalized), columns=pc_labels)

print(churn_pca)


: 

In [None]:
# Line plot of the share of variance explained by each principal component
variance_ratio = pca.explained_variance_ratio_
plt.plot(variance_ratio)
plt.ylabel('explained variance')
plt.xlabel('number of components')
plt.show()

: 

In [None]:
# Compute the covariance matrix (Western Governors University, n.d.)
# The divisor is n - 1 (sample covariance), consistent with the sample
# standard deviation used to normalize the data and with the convention
# scikit-learn's PCA uses for explained_variance_.
normalized_values = churn_normalized.to_numpy()
cov_matrix = normalized_values.T @ normalized_values / (normalized_values.shape[0] - 1)

# Calculate eigenvalues using PCA components: each eigenvalue is the variance
# of the data along the corresponding component direction (v^T C v)
eigenvalues = [eigenvector @ cov_matrix @ eigenvector for eigenvector in pca.components_]

# Plotting the eigenvalues
plt.plot(eigenvalues)
plt.xlabel('number of components')
plt.ylabel('eigenvalue')
plt.show()


: 

In [None]:
# Output the loadings for the components (Western Governors University, n.d.)
# Rows are the original variables, columns the principal components; the
# column labels are derived from the fitted model so they always match the
# number of components.
loading_labels = [f'PC{i}' for i in range(1, pca.components_.shape[0] + 1)]
loadings = pd.DataFrame(pca.components_.T, columns=loading_labels, index=quantitative_variables.columns)
loadings


: 

: 

: 