In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold

#--------------------------
# Load datasets
# Both inputs are CSV files read from fixed absolute paths; the paths are kept
# in named constants so they are easy to locate and change.
EMAILS_CSV = r"C:/Users/linda/Desktop/labeled_textblob.csv"
MONTHLY_SCORES_CSV = r"C:/Users/linda/Desktop/monthly_sentiment_scores.csv"

emails = pd.read_csv(EMAILS_CSV)
monthly_scores = pd.read_csv(MONTHLY_SCORES_CSV)

#--------------------------
# Preprocessing: Date, Identifiers, Lowercasing
# Parse the 'date' column (strict YYYY/MM/DD format) and derive a month label
# from it by converting to a monthly period and then to a string.
parsed_dates = pd.to_datetime(emails['date'], format='%Y/%m/%d')
emails['date'] = parsed_dates
emails['month'] = parsed_dates.dt.to_period('M').astype(str)

# The sender address, stripped of surrounding whitespace and lower-cased,
# serves as the employee identifier used for grouping and merging later on.
trimmed_sender = emails['from'].str.strip()
emails['employee'] = trimmed_sender.str.lower()

# Replace missing message bodies with empty strings, and keep a lower-cased
# copy so keyword matching can be case-insensitive.
message_text = emails['message'].fillna("")
emails['message'] = message_text
emails['message_lower'] = message_text.str.lower()


# Feature Engineering
# Per-email size features: number of characters and number of
# whitespace-separated tokens in the message body.
message_text = emails['message']
emails['message_length'] = message_text.str.len()
emails['word_count'] = message_text.str.split().map(len)

# Keyword-based features
# For every phrase, count its occurrences in the lower-cased message. The phrase
# is regex-escaped so it is matched literally, and the result is stored in a
# column named after the phrase (spaces -> underscores, plus a "_count" suffix).
keywords = ['thank you', 'sorry', 'best regards']
lowered_text = emails['message_lower']
for phrase in keywords:
    col = f"{phrase.replace(' ', '_')}_count"
    emails[col] = lowered_text.str.count(re.escape(phrase))

#----------------------------
# Aggregate monthly features per employee
# One row per (employee, month): the count of non-null messages and the mean
# word count. The same grouping is reused for the keyword totals below.
group_keys = ['employee', 'month']
per_employee_month = emails.groupby(group_keys)
monthly_features = per_employee_month.agg(
    message_count=('message', 'count'),
    avg_word_count=('word_count', 'mean'),
).reset_index()

# Aggregate keyword counts
# Column names are derived from `keywords` (spaces -> underscores, "_count"
# suffix); each keyword column is summed within every (employee, month) group.
keyword_cols = [f"{phrase.replace(' ', '_')}_count" for phrase in keywords]
keyword_agg = per_employee_month[keyword_cols].sum().reset_index()

# Merge all monthly features
# Left-join the keyword totals onto the base features, then replace any
# remaining missing values with 0.
monthly_features = monthly_features.merge(keyword_agg, on=group_keys, how='left')
monthly_features = monthly_features.fillna(0)

#-----------------------------
# Prepare target data
# Bring the join keys of the score table into a canonical form: employee is
# whitespace-trimmed and lower-cased, month is parsed as a date and reduced to
# its monthly period rendered as a string.
trimmed_employee = monthly_scores['employee'].str.strip()
monthly_scores['employee'] = trimmed_employee.str.lower()
score_month = pd.to_datetime(monthly_scores['month'])
monthly_scores['month'] = score_month.dt.to_period('M').astype(str)

# Merge features with sentiment scores
# Inner join: only (employee, month) pairs present in both tables are kept.
join_keys = ['employee', 'month']
data = monthly_features.merge(monthly_scores, on=join_keys, how='inner')

#------------------------------
# Model Training and Evaluation
# Regress the monthly sentiment score on the monthly message statistics and
# keyword totals.
feature_cols = [
    'message_count',
    'avg_word_count',
    'thank_you_count',
    'sorry_count',
    'best_regards_count',
]
X = data[feature_cols]
y = data['monthly_sentiment_score']

# Train-test split
# 20% of the rows are held out for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Linear Regression model
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Evaluate performance
# Both metrics are computed on the held-out test rows only.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Model coefficients
# Pair each feature name with its fitted coefficient for easy inspection.
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print(coefficients)

#--------------------------
# Visualize Residuals
# Residual = actual minus predicted, computed on the test rows.
residuals = y_test - y_pred

# Scatter the residuals against the predictions, with a dashed red reference
# line at zero residual.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_pred, y=residuals, ax=ax)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_title("Residuals vs Predicted Sentiment Scores")
ax.set_xlabel("Predicted Sentiment Score")
ax.set_ylabel("Residual (Actual - Predicted)")
ax.grid(True)
fig.tight_layout()
plt.show()

# --------------------------
# 2. Cross-Validation (R²)
# Score the estimator on the full X / y with shuffled 5-fold cross-validation
# (fixed seed for reproducible folds), using R-squared as the metric.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
mean_cv_r2 = np.mean(cv_scores)

print("\nCross-Validation R-squared Scores:", cv_scores)
print(f"Average R-squared from CV: {mean_cv_r2:.4f}")
