In [8]:
#============================= LINEAR REGRESSION BASELINE MODEL =============================#
import time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import joblib
import os

# Load data
df = pd.read_csv('/content/dataset_cleaned.csv')

# Feature engineering
df['log_followers'] = np.log1p(df['followers_count'])

# Features and target
features = ['log_followers', 'sentiment', 'tweet_length', 'hashtag_count']
df = df[features + ['number_of_likes']].dropna()
df = df[np.isfinite(df['number_of_likes'])]

# Split
X = df[features]
y = df['number_of_likes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression
lr_model = LinearRegression()
# Measure training time
start_time = time.time()
lr_model.fit(X_train, y_train)
train_time = time.time() - start_time

# Measure inference time + predict
start_time = time.time()
y_pred = lr_model.predict(X_test)
inference_time = time.time() - start_time
inference_time_per_sample = (inference_time / len(X_test)) * 1000  # ms per sample
# Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n📊 Linear Regression Baseline Results:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.4f}")

# Coefficients (normalized importance)
coef = lr_model.coef_
coef_df = pd.DataFrame({'feature': features, 'coefficient': coef})
coef_df['importance_pct'] = np.abs(coef_df['coefficient']) / np.sum(np.abs(coef_df['coefficient'])) * 100

print("\n--- Feature Importance (based on coefficient magnitude, normalized %) ---")
for _, row in coef_df.iterrows():
    print(f"{row['feature']}: {row['importance_pct']:.2f}%")




print("\n--- System Metrics ---")

print(f"Training time: {train_time:.2f} seconds")
print(f"Inference time per sample: {inference_time_per_sample:.4f} ms")



# Save model to disk
joblib.dump(lr_model, 'model_filename.pkl')

# Get size in MB
model_size_bytes = os.path.getsize('model_filename.pkl')
model_size_mb = model_size_bytes / (1024 * 1024)

print(f"Model size: {model_size_mb:.2f} MB")



📊 Linear Regression Baseline Results:
RMSE: 6225.75
MAE: 2201.06
R²: 0.1880

--- Feature Importance (based on coefficient magnitude, normalized %) ---
log_followers: 69.04%
sentiment: 12.32%
tweet_length: 2.80%
hashtag_count: 15.83%

--- System Metrics ---
Training time: 0.02 seconds
Inference time per sample: 0.0001 ms
Model size: 0.00 MB


  df = pd.read_csv('/content/dataset_cleaned.csv')


In [17]:
# 🚀 CPU RandomForestRegressor (sklearn version - full metrics)
print("🚀 OPTION 2: Sklearn CPU Random Forest (Baseline)")

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
import time
import os
import joblib

# Load data
df = pd.read_csv("/content/dataset_cleaned.csv")
df_clean = df.dropna(subset=['followers_count', 'sentiment', 'number_of_likes', 'tweet_length', 'hashtag_count'])
df_clean = df_clean[(df_clean['followers_count'] > 0) & (df_clean['number_of_likes'] >= 0)]

# Add log_followers
df_clean['log_followers'] = np.log1p(df_clean['followers_count'])

# Features and target
features = ['log_followers', 'sentiment', 'tweet_length', 'hashtag_count']
X = df_clean[features]
y = df_clean['number_of_likes']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build model
cpu_rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

# Training time
train_start = time.time()
print("Training CPU Random Forest...")
cpu_rf_model.fit(X_train, y_train)
train_time = time.time() - train_start

# Inference time
inference_start = time.time()
y_pred = cpu_rf_model.predict(X_test)
inference_time = time.time() - inference_start
inference_time_per_sample = (inference_time / len(X_test)) * 1000  # ms per sample

# Metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Feature importance
feature_importances = cpu_rf_model.feature_importances_
total_importance = np.sum(feature_importances)
importance_pct = feature_importances / total_importance * 100
importance_dict = dict(zip(features, importance_pct))

# Print full results
print(f"\n=== Sklearn CPU Random Forest Results ===")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"Training time: {train_time:.2f} s")
print(f"Inference time per sample: {inference_time_per_sample:.3f} ms")
print(f"Feature Importances (% of total):")
for feature, pct in importance_dict.items():
    print(f"  {feature}: {pct:.2f}%")

# Save model
model_filename = 'cpu_rf_model.pkl'
joblib.dump(cpu_rf_model, model_filename)
model_size_bytes = os.path.getsize(model_filename)
model_size_mb = model_size_bytes / (1024 * 1024)

# Final summary for report
print("\n=== FINAL SUMMARY FOR REPORT ===")
print(f"Model: Sklearn CPU Random Forest")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"Training time: {train_time:.2f} s")
print(f"Inference time per sample: {inference_time_per_sample:.3f} ms")
print(f"Model size (pickled): {model_size_mb:.2f} MB")
print(f"Sentiment importance: {importance_dict.get('sentiment', 0):.2f}%")
print(f"Followers importance: {importance_dict.get('log_followers', 0):.2f}%")


🚀 OPTION 2: Sklearn CPU Random Forest (Baseline)
Training CPU Random Forest...

=== Sklearn CPU Random Forest Results ===
R²: 0.3460
RMSE: 5999.90
MAE: 941.53
Training time: 53.28 s
Inference time per sample: 0.020 ms
Feature Importances (% of total):
  log_followers: 79.20%
  sentiment: 7.57%
  tweet_length: 10.12%
  hashtag_count: 3.11%

=== FINAL SUMMARY FOR REPORT ===
Model: Sklearn CPU Random Forest
R²: 0.3460
RMSE: 5999.90
MAE: 941.53
Training time: 53.28 s
Inference time per sample: 0.020 ms
Model size (pickled): 32.10 MB
Sentiment importance: 7.57%
Followers importance: 79.20%
