In [33]:
import pandas as pd
import numpy as np

In [34]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/cleaned_data_week1.csv")

In [35]:
from textblob import TextBlob

In [36]:
df['sentiment'] = df["content"].apply(lambda x: TextBlob(x).sentiment.polarity)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

X = df[['word_count', 'char_count', 'has_media', 'hour', 'sentiment']]
y = df['likes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

preds = model.predict(X_test)
rmse = mean_squared_error(y_test, preds)**0.5
print("RMSE:", rmse)

RMSE: 3694.1616451817404


In [38]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
encodedMatrix = ohe.fit_transform(df[["day_of_week"]])

In [39]:
encodedMatrix.toarray()

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [40]:
ohe.get_feature_names_out()

array(['day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday'], dtype=object)

In [41]:
weekdayDf = pd.DataFrame(encodedMatrix.toarray(), columns=ohe.get_feature_names_out())

In [42]:
import math
X = df[['word_count', 'char_count', 'has_media', 'hour', 'sentiment']]
X = pd.concat([X, weekdayDf], axis=1)
y = df["likes"].apply(lambda x : np.log1p(x))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

preds = np.expm1(model.predict(X_test))
y_test_final = np.expm1(y_test)
rmse = mean_squared_error(y_test_final, preds)**(0.5)
print("RMSE:", rmse)

RMSE: 3365.9743373044985


In [43]:
df['company_avg_likes'] = df.groupby('inferred company')['likes'].transform('mean')

In [51]:
import math
X = df[['word_count', 'char_count', 'has_media', 'hour', 'sentiment', 'company_avg_likes']]
X = pd.concat([X, weekdayDf], axis=1)
y = df["likes"].apply(lambda x : np.log1p(x))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

preds = np.expm1(model.predict(X_test))
y_test_final = np.expm1(y_test)
rmse = mean_squared_error(y_test_final, preds)**(0.5)
print("RMSE:", rmse)

RMSE: 2978.35490063556


In [52]:
import joblib
joblib.dump(model, 'like_predictor3.pkl')

['like_predictor3.pkl']

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

X_tfidf = tfidf.fit_transform(df['content'])


In [46]:
from scipy.sparse import hstack

In [47]:
import math
X_ = df[['word_count', 'char_count', 'has_media', 'hour', 'sentiment', 'company_avg_likes']]
X_ = pd.concat([X_, weekdayDf], axis=1)
X_.astype(float)
y = df["likes"].apply(lambda x : np.log1p(x))
X = pd.concat([X_, pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

preds = np.expm1(model.predict(X_test))
y_test_final = np.expm1(y_test)
rmse = mean_squared_error(y_test_final, preds)**(0.5)
print("RMSE:", rmse)

RMSE: 2943.95948515631


In [None]:
import joblib
joblib.dump(model, 'like_predictor2.pkl')