## Data preparation and cleaning



In [61]:
import sqlite3
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import  mean_squared_error, r2_score
#Connect to database
conn = sqlite3.connect('cleaned_customer_support.db')
data = pd.read_sql_query("SELECT * FROM cleaned_customer_support", conn)
y = data['CSAT_Score']

In [62]:
#First going to evaluate the hour when the issue was reported as well as the response time 
X1 = data[['issue_reported_hour_of_day']]
X2 = data[['response_time_minutes']]
#First model based on issue_reported_hour_of_day
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
model1 = LinearRegression()
model1.fit(X1_train, y_train)
y_pred1 = model1.predict(X1_test)
mse1 = mean_squared_error(y_test, y_pred1)
r2_1 = r2_score(y_test, y_pred1)
#Printing relevant data for issue_reported_hour_of_day
print("Model using 'issue_reported_hour_of_day':")
print(f'Mean Squared Error: {mse1}')
print(f'R-squared: {r2_1}')
examples = X1_test.copy()  
examples['Actual_CSAT_Score'] = y_test 
examples['Predicted_CSAT_Score'] = y_pred1  
print(examples.head(10))  
# Second model based on response_time_minutes
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)
model2 = LinearRegression()
model2.fit(X2_train, y_train)
y_pred2 = model2.predict(X2_test)
mse2 = mean_squared_error(y_test, y_pred2)
r2_2 = r2_score(y_test, y_pred2)
#Printing relevant data for response_time_minutes
print("\nModel using 'response_time_minutes':")
print(f'Mean Squared Error: {mse2}')
print(f'R-squared: {r2_2}')
examples = X2_test.copy()  
examples['Actual_CSAT_Score'] = y_test 
examples['Predicted_CSAT_Score'] = y_pred2  
print(examples.head(10))  

Model using 'issue_reported_hour_of_day':
Mean Squared Error: 2.3671577053105173
R-squared: -0.0009673798417888513
       issue_reported_hour_of_day  Actual_CSAT_Score  Predicted_CSAT_Score
3394                           13                  4              4.062751
23665                          19                  5              4.071607
8895                           21                  1              4.074559
13218                           9                  4              4.056847
12382                          13                  5              4.062751
2932                            0                  1              4.043563
20001                          18                  5              4.070131
26043                          21                  5              4.074559
3023                            7                  5              4.053895
5102                           13                  5              4.062751

Model using 'response_time_minutes':
Mean Squared Error: 2.

In [81]:
# Third model based on Agent Shift which needs one hot encoder to preprocess the data since the shifts are split into known ones
encoder = OneHotEncoder() 
X3_encoded = encoder.fit_transform(data[['Agent Shift']])
shift_categories = encoder.categories_[0] 
print(shift_categories)
# Train model
X3_train, X3_test, y_train, y_test = train_test_split(X3_encoded, y, test_size=0.2, random_state=42)
model3 = LinearRegression()
model3.fit(X3_train, y_train)
y_pred3 = model3.predict(X3_test)
mse3 = mean_squared_error(y_test, y_pred3)
r2_3 = r2_score(y_test, y_pred3)
mapped_categories = {int(i): category[0] for i, category in enumerate(shift_categories)}

# Convert test data to DataFrame and map encoded categories to their labels
X3_test_df = pd.DataFrame.sparse.from_spmatrix(X3_test)
X3_test_df = X3_test_df.rename(columns=mapped_categories)
X3_test_df = X3_test_df.astype(int)

X3_test_df.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
examples = X3_test_df.copy()  
examples['Actual_CSAT_Score'] = y_test 
examples['Predicted_CSAT_Score'] = y_pred3 

print(examples.head(10))




['Afternoon' 'Evening' 'Morning' 'Night' 'Split']
   A  E  M  N  S  Actual_CSAT_Score  Predicted_CSAT_Score
0  0  0  1  0  0                  4              4.001691
1  0  0  1  0  0                  5              4.001691
2  1  0  0  0  0                  1              4.137886
3  0  0  1  0  0                  4              4.001691
4  0  1  0  0  0                  5              4.096031
5  0  1  0  0  0                  1              4.096031
6  0  1  0  0  0                  5              4.096031
7  0  1  0  0  0                  5              4.096031
8  0  0  0  0  1                  5              4.338692
9  0  0  1  0  0                  5              4.001691
