In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error

# Read the churn dataset from disk
df = pd.read_csv('churn_clean.csv')

# Relevant columns
selected_columns = [
    'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure',
    'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Population', 'Age',
    'Income', 'Marital', 'Gender', 'Churn', 'Port_modem', 'Tablet',
    'InternetService', 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'PaperlessBilling', 'PaymentMethod', 'Techie',
]

# Summary statistics for the selected columns, printed as plain text
selected_describe = df.loc[:, selected_columns].describe()
print(selected_describe)


       Outage_sec_perweek         Email      Contacts  Yearly_equip_failure  \
count        10000.000000  10000.000000  10000.000000          10000.000000   
mean            10.001848     12.016000      0.994200              0.398000   
std              2.976019      3.025898      0.988466              0.635953   
min              0.099747      1.000000      0.000000              0.000000   
25%              8.018214     10.000000      0.000000              0.000000   
50%             10.018560     12.000000      1.000000              0.000000   
75%             11.969485     14.000000      2.000000              1.000000   
max             21.207230     23.000000      7.000000              6.000000   

             Tenure  MonthlyCharge  Bandwidth_GB_Year     Population  \
count  10000.000000   10000.000000       10000.000000   10000.000000   
mean      34.526188     172.624816        3392.341550    9756.562400   
std       26.443063      42.943094        2185.294852   14432.698671   


In [2]:
#C1

# Numeric columns (standardized below)
numeric_cols = [
    'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure',
    'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Population', 'Age',
    'Income',
]

# Categorical columns (integer-encoded below; includes the 'Churn' target)
categorical_cols = [
    'Marital', 'Gender', 'Churn', 'Port_modem', 'Tablet', 'InternetService',
    'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
    'PaymentMethod', 'Techie',
]


# Encoding categorical variables as integer codes.
# One LabelEncoder is fitted per column and stored in `label_encoders`, so the
# code -> label mapping of every column (its `classes_`) remains available for
# decoding. Re-fitting a single shared encoder would keep only the mapping of
# the last column processed.
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder


# Scaling numeric values
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed later in the notebook, so the held-out rows
# contribute to the mean/std used for scaling. Consider fitting on the
# training rows only -- TODO confirm whether this matters for the analysis.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

#C4

# Save the encoded and scaled dataset
df.to_csv('Transformed_Data.csv', index=False)


In [5]:
# D2


# Columns used for modeling: the predictors plus the 'Churn' target
relevant_columns = [
    'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure',
    'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Population', 'Age',
    'Income', 'Marital', 'Gender', 'Churn', 'Port_modem', 'Tablet',
    'InternetService', 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'PaperlessBilling', 'PaymentMethod', 'Techie',
]

# Feature matrix (every relevant column except the target) and target vector
X = df[relevant_columns].drop(columns='Churn')
y = df['Churn']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Save the feature matrices of both splits (the target column is not written)
X_train.to_csv('train_data.csv', index=False)
X_test.to_csv('test_data.csv', index=False)


In [13]:
# D3


# Random forest classifier (Scikit, n.d.), seeded for repeatable results and
# trained on the training split (fit returns the fitted estimator itself)
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out test split
y_pred = rf_classifier.predict(X_test)

# Model evaluation: overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Counts of true vs. predicted classes
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# Predicted probability of the second class (column index 1), rounded to a
# hard 0/1 prediction
y_pred_prob = rf_classifier.predict_proba(X_test)[:, 1]
y_pred_regression = np.round(y_pred_prob)

# Mean Squared Error between the true labels and the rounded predictions.
# Because both are 0/1 values, each squared error is 1 for a misclassified row
# and 0 otherwise, so this figure is the misclassification rate.
mse = mean_squared_error(y_test, y_pred_regression)
print(f"Mean Squared Error (MSE): {mse:.4f}")



Accuracy: 0.8463

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      2156
           1       0.80      0.60      0.69       844

    accuracy                           0.85      3000
   macro avg       0.83      0.77      0.79      3000
weighted avg       0.84      0.85      0.84      3000


Confusion Matrix:
[[2029  127]
 [ 334  510]]
Mean Squared Error (MSE): 0.1537
