## Random Forest

In [1]:
import pandas as pd
import sqlite3
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the cleaned support table from the local SQLite file into a DataFrame.
# NOTE(review): the connection is left open after this cell; call conn.close()
# once no later cell needs it.
db_path = 'cleaned_customer_support.db'
select_all_sql = "SELECT * FROM cleaned_customer_support"

conn = sqlite3.connect(db_path)
data = pd.read_sql_query(sql=select_all_sql, con=conn)

# Regression target used by the modelling cells
y = data['CSAT_Score']

In [4]:
# Build a feature matrix from numeric and one-hot-encoded categorical columns,
# fit a Random Forest regressor on an 80/20 split, then report RMSE, R-squared
# and the five largest feature importances.

# Column groups are named once so the encoder input and the generated
# feature names cannot drift apart.
categorical_cols = ['Agent Shift', 'Tenure Bucket', 'channel_name', 'category', 'Sub-category']
numeric_cols = ['issue_reported_hour_of_day', 'response_time_minutes', 'issue_reported_day_of_week']

# One seed shared by the split and the forest so that a re-run of this cell
# reproduces the same metrics and importances.
random_state = 42

# Preprocess categorical features using OneHotEncoder
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(data[categorical_cols])
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Combine numerical and encoded categorical features.
# The encoded frame reuses data's index: pd.concat(axis=1) aligns on index
# labels, so a fresh RangeIndex would misalign rows (and introduce NaNs)
# whenever data does not carry the default index.
encoded_df = pd.DataFrame(encoded_features.toarray(),
                          columns=encoded_feature_names,
                          index=data.index)
X_combined = pd.concat([data[numeric_cols], encoded_df], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=random_state
)

# Train a Random Forest Regressor on the combined features.
# random_state fixes the bootstrap sampling and feature sub-sampling, which
# are otherwise re-drawn on every fit.
model_rf = RandomForestRegressor(random_state=random_state)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

# Print results for the Random Forest model using multiple features
print("Random Forest Model using multiple features:")
print(f'RMSE: {rmse_rf:.4f}')
print(f'R-squared: {r2_rf:.4f}')

# Print top 5 important features for the model (indices sorted by
# descending importance)
feature_importances = model_rf.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]
top_features = [X_combined.columns[i] for i in sorted_indices[:5]]
top_importances = feature_importances[sorted_indices[:5]]
print("\nTop 5 Important Features for the Model:")
for feature, importance in zip(top_features, top_importances):
    print(f"{feature}: {importance:.4f}")

Random Forest Model using multiple features:
RMSE: 1.5291
R-squared: 0.0113

Top 5 Important Features for the Model:
response_time_minutes: 0.3004
issue_reported_hour_of_day: 0.1970
issue_reported_day_of_week: 0.1322
Agent Shift_Morning: 0.0215
Tenure Bucket_>90: 0.0215
