In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit one linear regression per engagement target (retweets, favorites) on the
# same four tweet features and report hold-out MSE and R-squared for each.
# The split/fit/predict/score pipeline is identical for both targets, so it
# lives in a single helper instead of being written out twice.


def _fit_and_score(features, target, test_size=0.2, random_state=42):
    """Train a LinearRegression on a hold-out split and score it.

    Args:
        features: Feature matrix passed to ``train_test_split`` as X.
        target: Target values aligned with ``features``.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the shuffle, so the split is reproducible.

    Returns:
        A tuple ``(model, split, predictions, mse, r2)`` where ``split`` is
        ``(X_train, X_test, y_train, y_test)``, ``predictions`` are the model
        outputs for ``X_test``, and ``mse`` / ``r2`` are computed on the
        held-out rows only.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    # R-squared is not bounded below by 0: a negative value means the model
    # predicts the held-out rows worse than a constant mean predictor would.
    r2 = r2_score(y_test, predictions)

    return model, (X_train, X_test, y_train, y_test), predictions, mse, r2


def _report(title, mse, r2):
    """Print the evaluation metrics under the given heading line."""
    print(title)
    print("Mean Squared Error:", mse)
    print("R-squared:", r2)


# Load the prepared dataset
data_path = 'tweets_with_calculated_features.xlsx'
data = pd.read_excel(data_path)

# Define features (X) and target variables (y)
X = data[['sentiment', 'text_length', 'hashtag_count', 'mention_count']]
y_retweet = data['retweet_count']  # Model for retweets
y_favorite = data['favorite_count']  # Model for favorites

# Retweet model: split, train, predict, evaluate
(
    retweet_model,
    (X_train_rt, X_test_rt, y_train_rt, y_test_rt),
    y_pred_rt,
    mse_rt,
    r2_rt,
) = _fit_and_score(X, y_retweet)
_report("Retweet Model Performance:", mse_rt, r2_rt)

# Favorite model: same pipeline, different target
(
    favorite_model,
    (X_train_fav, X_test_fav, y_train_fav, y_test_fav),
    y_pred_fav,
    mse_fav,
    r2_fav,
) = _fit_and_score(X, y_favorite)
# Leading newline separates this report from the retweet report above.
_report("\nFavorite Model Performance:", mse_fav, r2_fav)


Retweet Model Performance:
Mean Squared Error: 528844066.0011776
R-squared: -2.610538444673564

Favorite Model Performance:
Mean Squared Error: 1614002398.6463861
R-squared: -1.2477063011493859
