### Importing required packages

In [None]:
import sqlite3

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#### Connect to database

In [None]:
# Read the entire cleaned table into a DataFrame, then release the database
# handle: sqlite3 connections stay open (holding the file) until closed explicitly.
conn = sqlite3.connect('cleaned_customer_support.db')
try:
    data = pd.read_sql_query("SELECT * FROM cleaned_customer_support", conn)
finally:
    conn.close()

# Regression target used by the train/test splits below.
y = data['CSAT_Score']

## Random Forest

#### Initializing plotting function

In [None]:
def plot(r2, rmse, y_pred, y_true=None):
    """Print summary metrics and draw an actual-vs-predicted scatter plot.

    Parameters
    ----------
    r2 : float
        R-squared score; printed as given.
    rmse : float
        Root mean squared error; printed and shown in the title with 4 decimals.
    y_pred : array-like
        Predicted CSAT scores.
    y_true : array-like, optional
        Actual CSAT scores matching ``y_pred`` element for element (for
        example ``y_test`` when ``y_pred`` holds test-set predictions).
        When omitted, the full ``data['CSAT_Score']`` column is used, which
        only lines up if ``y_pred`` has one prediction per row of ``data``.

    Returns
    -------
    float
        The ``rmse`` argument, unchanged.
    """
    # Fall back to the notebook-level frame only when no explicit target is
    # given, so existing three-argument calls keep working.
    if y_true is None:
        y_true = data['CSAT_Score']

    print(f'RMSE: {rmse:.4f},R2 Score: {r2}')

    # Plot actual vs predicted CSAT scores
    plt.scatter(y_true, y_pred, alpha=1)
    plt.xlabel('Actual CSAT Scores')
    plt.ylabel('Predicted CSAT Scores')
    plt.title(f'Actual vs Predicted CSAT Scores - RMSE: {rmse:.4f}')
    plt.show()
    return rmse

#### Split and preprocess data

In [None]:
# Column groups are named once so the encoder input and the generated
# feature names cannot drift apart.
categorical_columns = ['Agent Shift', 'Tenure Bucket', 'channel_name', 'category', 'Sub-category', 'Supervisor', 'Manager']
numerical_columns = ['issue_reported_hour_of_day', 'response_time_minutes', 'issue_reported_day_of_week', 'sentiment_score']

# Preprocess categorical features using OneHotEncoder
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(data[categorical_columns])
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Combine numerical and encoded categorical features.
# pd.concat(axis=1) aligns rows by index label, so the encoded frame reuses
# data's index; a fresh default index would misalign rows (and introduce NaNs)
# if data's index were ever not 0..n-1.
encoded_frame = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names, index=data.index)
X = pd.concat([data[numerical_columns], encoded_frame], axis=1)

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Train a Random Forest Regressor on the combined features

In [None]:
# Seed the forest: bootstrap sampling and feature selection are random, so
# without a fixed random_state the metrics and importances change on every run.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train, y_train)

# Rank features by importance, highest first, and keep the top five.
# Names come from the frame the model was actually fitted on.
feature_importances = model_rf.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]
top_features = [X_train.columns[i] for i in sorted_indices[:5]]
top_importances = feature_importances[sorted_indices[:5]]

#### Print top 5 important features for the model

In [None]:
# Report each top-ranked feature next to its importance (4 decimals).
print("\nTop 5 Important Features for the Model:")
for position in range(min(len(top_features), len(top_importances))):
    feature = top_features[position]
    importance = top_importances[position]
    print(f"{feature}: {importance:.4f}")

#### Calculate and print results for the Random Forest model using multiple features

In [None]:
# Score the fitted forest on the held-out test split.
y_pred_rf = model_rf.predict(X_test)

squared_error_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(squared_error_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print results for the Random Forest model using multiple features
print(
    "Random Forest Model using multiple features:",
    f'RMSE: {rmse_rf:.4f}',
    f'MAE: {mae_rf:.4f}',
    f'R-squared: {r2_rf:.4f}',
    sep='\n',
)

#### Plotting actual vs predicted results

In [None]:
# Scatter of test-set actuals against Random Forest predictions, with a red
# dashed y = x reference spanning the observed range of actual values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_rf, alpha=0.5)

lowest_actual, highest_actual = min(y_test), max(y_test)
ax.plot([lowest_actual, highest_actual], [lowest_actual, highest_actual], '--', color='red')

ax.set_title('Actual vs. Predicted Values (Random Forest)')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.grid(True)
plt.show()

#### Analyzing results

In [None]:
# TODO: Add analysis for random forest results 

## Multi-Layer Perceptron

#### Split and preprocess data

In [None]:
# Column groups are named once so the encoder input and the generated
# feature names cannot drift apart.
categorical_columns = ['Agent Shift', 'Tenure Bucket', 'channel_name', 'category', 'Sub-category', 'Supervisor', 'Manager']
numerical_columns = ['issue_reported_hour_of_day', 'response_time_minutes', 'issue_reported_day_of_week', 'sentiment_score']

# Preprocess categorical features using OneHotEncoder
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(data[categorical_columns])
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Combine numerical and encoded categorical features.
# The encoded frame reuses data's index because pd.concat(axis=1) aligns rows
# by index label. Numeric columns are cast to float up front so the scaled
# values written back below keep a consistent dtype.
encoded_frame = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names, index=data.index)
X = pd.concat([data[numerical_columns].astype(float), encoded_frame], axis=1)

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLPs are sensitive to feature scale, so standardise the numeric columns.
# The scaler is fitted on the training split only and then applied to the
# test split, keeping test-set statistics out of training. One-hot columns
# are already 0/1 and are left untouched.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

#### Train Multi-Layer Perceptron Regressor on the combined features

In [None]:
# MLP with two hidden layers (100 and 50 units), ReLU activations, Adam solver.
model_mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)
model_mlp.fit(X_train, y_train)

# The network output is unbounded, so predictions are clamped to [0, 5].
y_pred_mlp = np.clip(model_mlp.predict(X_test), 0, 5)

#### Calculate and print results for the MLP model using multiple features 

In [None]:
# Score the clipped MLP predictions on the held-out test split.
squared_error_mlp = mean_squared_error(y_test, y_pred_mlp)
rmse = np.sqrt(squared_error_mlp)
mae = mean_absolute_error(y_test, y_pred_mlp)
r2 = r2_score(y_test, y_pred_mlp)

print(
    "MLP Model using multiple features:",
    f'RMSE: {rmse:.4f}',
    f'MAE: {mae:.4f}',
    f'R-squared: {r2:.4f}',
    sep='\n',
)

#### Plotting actual vs predicted results

In [None]:
# Scatter of test-set actuals against MLP predictions, with a red dashed
# y = x reference spanning the observed range of actual values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_mlp, alpha=0.5)

lowest_actual, highest_actual = min(y_test), max(y_test)
ax.plot([lowest_actual, highest_actual], [lowest_actual, highest_actual], '--', color='red')

ax.set_title('Actual vs. Predicted Values (MLP)')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.grid(True)
plt.show()

#### Analyzing results

In [None]:
# TODO: Finish analysis for MLP results 

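Comparing the predicted values to the red dashed reference line (the diagonal where the predicted value equals the actual value, rather than a fitted line of best fit), we can see that there was an improvement in prediction accuracy compared to Random Forest. The predicted values here have been clipped to fit the 0-5 CSAT score range.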