In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


#import clean data
df = pd.read_csv("C:/Users/jhall/Desktop/D209/churn_clean.csv")

#inspect data
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CaseOrder             10000 non-null  int64  
 1   Customer_id           10000 non-null  object 
 2   Interaction           10000 non-null  object 
 3   UID                   10000 non-null  object 
 4   City                  10000 non-null  object 
 5   State                 10000 non-null  object 
 6   County                10000 non-null  object 
 7   Zip                   10000 non-null  int64  
 8   Lat                   10000 non-null  float64
 9   Lng                   10000 non-null  float64
 10  Population            10000 non-null  int64  
 11  Area                  10000 non-null  object 
 12  TimeZone              10000 non-null  object 
 13  Job                   10000 non-null  object 
 14  Children              10000 non-null  int64  
 15  Age                 

In [2]:
# Convert 'Yes' to 1 and 'No' to 0
columns_to_convert = [
    'Churn','Techie', 'Contract', 'Port_modem', 'Tablet', 
    'InternetService', 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup', 
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
    'PaperlessBilling',
]

for column in columns_to_convert:
    df[column] = df[column].apply(lambda x: 1 if x == 'Yes' else 0)

In [3]:
# Selecting a subset of columns that are relevant for churn 
selected_columns = [
    'Age', 'Income', 'Children', 'Outage_sec_perweek', 'Email', 'Contacts', 
    'Yearly_equip_failure', 'Techie', 'Contract', 'Port_modem', 'Tablet', 
    'InternetService', 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup', 
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
    'PaperlessBilling', 'PaymentMethod', 'MonthlyCharge', 'Bandwidth_GB_Year','Churn'
]

# Filtering the dataset to keep only the selected columns
df_filtered = df[selected_columns]


In [4]:
# Encoding categorical columns using pd.get_dummies
categorical_columns = df_filtered.select_dtypes(include=['object']).columns
df_filtered_encoded = pd.get_dummies(df_filtered, columns=categorical_columns)


In [5]:
# Separating the features and the target variable
X = df_filtered_encoded.drop('Churn', axis=1)
y = df_filtered_encoded['Churn']

# Applying standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [6]:
# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [7]:
#Exporting dfs
df_filtered_encoded.to_csv('C:/Users/jhall/Desktop/D209/D209 Part 2 DT.csv', index=False)

# Convert the scaled features back into a DataFrame
X_train_df = pd.DataFrame(X_train, columns=[f'feature_{i}' for i in range(X_train.shape[1])])
X_test_df = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])

# Add the labels to the features DataFrame
train_df = pd.concat([X_train_df, y_train.reset_index(drop=True)], axis=1)
test_df = pd.concat([X_test_df, y_test.reset_index(drop=True)], axis=1)

# Export to CSV
train_df.to_csv('C:/Users/jhall/Desktop/D209/D209 Part 2 Train.csv', index=False)
test_df.to_csv('C:/Users/jhall/Desktop/D209/D209 Part 2 Test.csv', index=False)


In [8]:
# Creating the decision tree 
model = DecisionTreeClassifier(random_state=42)

# Training the model
model.fit(X_train, y_train)


In [9]:
#Making predictions
y_pred = model.predict(X_test)

#Evaluating
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy",accuracy)
print(report)

Accuracy 0.793
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      1456
           1       0.62      0.60      0.61       544

    accuracy                           0.79      2000
   macro avg       0.74      0.73      0.74      2000
weighted avg       0.79      0.79      0.79      2000



In [10]:
# Calculating the MSE
mse = mean_squared_error(y_test, y_pred)

mse

0.207