## Model Test

In [1]:
import numpy as np
import pandas as pd

### Reading data for Title/Body

In [None]:
# Import the initial title-hyperlink data (tab-separated file)
df = pd.read_csv('../soc-redditHyperlinks-title.tsv', sep='\t')

# PROPERTIES holds one comma-separated string per row; expand it to one column per value
properties_df = df['PROPERTIES'].str.split(',', expand=True)

# These are already calculated sentiment values with VADER (expanded columns 18-20).
# str.split produces strings, so cast to float to make the scores truly numeric;
# astype also returns a new frame, so renaming its columns cannot affect properties_df.
vader_sentiments = properties_df.iloc[:, 18:21].astype(float)
vader_sentiments.columns = ['Positive Sentiment', 'Negative Sentiment', 'Compound Sentiment']

# New table showing only the link identifiers (sentiment columns are appended below)
result_df = df[['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'POST_ID', 'TIMESTAMP']].copy()

# Add the selected sentiment columns to the new df (aligned on the shared row index)
result_df = pd.concat([result_df, vader_sentiments], axis=1)

In [3]:
# Preview the first rows of the assembled title table
result_df.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,Positive Sentiment,Negative Sentiment,Compound Sentiment
0,rddtgaming,rddtrust,1u4pzzs,2013-12-31 16:39:18,0.0,0.0,0.0
1,xboxone,battlefield_4,1u4tmfs,2013-12-31 17:59:11,0.195,0.0,0.4404
2,ps4,battlefield_4,1u4tmos,2013-12-31 17:59:40,0.195,0.0,0.4404
3,fitnesscirclejerk,leangains,1u50xfs,2013-12-31 19:01:56,0.0,0.0,0.0
4,fitnesscirclejerk,lifeprotips,1u51nps,2013-12-31 21:02:28,0.0,0.0,0.0


In [4]:
# Show the number of unique source subreddits in the title data
result_df['SOURCE_SUBREDDIT'].nunique()

43695

In [None]:
# Tally how many rows each source subreddit contributes
subreddit_counts = result_df['SOURCE_SUBREDDIT'].value_counts()

# Keep only the subreddits whose count is at most 1.
# NOTE(review): the variable names say "5_or_less" but the threshold applied here is 1 —
# confirm which cut-off is intended.
subreddits_with_5_or_less = subreddit_counts.loc[lambda counts: counts <= 1]

# Flatten the Series into a two-column frame for a cleaner display
subreddits_with_5_or_less_df = subreddits_with_5_or_less.reset_index()
subreddits_with_5_or_less_df.columns = ['Subreddit', 'Count']

In [6]:
# Display the filtered subreddit/count table for the title data
subreddits_with_5_or_less_df

Unnamed: 0,Subreddit,Count
0,cascadingtests,1
1,dyke,1
2,meguns,1
3,medicalcoding,1
4,bestofoutrageculture,1
...,...,...
21205,donotjoin,1
21206,kaigainohannou,1
21207,dalian,1
21208,kgirls,1


In [None]:
# Import the initial body-hyperlink data (tab-separated file)
df_body = pd.read_csv('../soc-redditHyperlinks-body.tsv', sep='\t')

# PROPERTIES holds one comma-separated string per row; expand it to one column per value
properties_df_body = df_body['PROPERTIES'].str.split(',', expand=True)

# These are already calculated sentiment values with VADER (expanded columns 18-20).
# str.split produces strings, so cast to float to make the scores truly numeric;
# astype also returns a new frame, so renaming its columns cannot affect properties_df_body.
vader_sentiments_body = properties_df_body.iloc[:, 18:21].astype(float)
vader_sentiments_body.columns = ['Positive Sentiment', 'Negative Sentiment', 'Compound Sentiment']

# New table showing only the link identifiers (sentiment columns are appended below)
result_df_body = df_body[['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'POST_ID', 'TIMESTAMP']].copy()

# Add the selected sentiment columns to the new df (aligned on the shared row index)
result_df_body = pd.concat([result_df_body, vader_sentiments_body], axis=1)

In [8]:
# Preview the first rows of the assembled body table
result_df_body.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,Positive Sentiment,Negative Sentiment,Compound Sentiment
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,0.073,0.08,0.1748
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,0.472,0.0,0.5538
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,0.0,0.0,0.0
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,0.153,0.026,0.9672
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,0.139,0.039,0.8906


In [9]:
# Show the number of unique source subreddits in the body data
result_df_body['SOURCE_SUBREDDIT'].nunique()

27863

In [None]:
# Tally how many rows each source subreddit contributes in the body data
subreddit_counts_body = result_df_body['SOURCE_SUBREDDIT'].value_counts()

# Keep only the subreddits whose count is at most 1.
# NOTE(review): the variable names say "5_or_less" but the threshold applied here is 1 —
# confirm which cut-off is intended.
subreddits_with_5_or_less_body = subreddit_counts_body.loc[lambda counts: counts <= 1]

# Flatten the Series into a two-column frame for a cleaner display
subreddits_with_5_or_less_df_body = subreddits_with_5_or_less_body.reset_index()
subreddits_with_5_or_less_df_body.columns = ['Subreddit', 'Count']

In [11]:
# Display the filtered subreddit/count table for the body data
subreddits_with_5_or_less_df_body

Unnamed: 0,Subreddit,Count
0,japanesestreetwear,1
1,onetruerem,1
2,eili5,1
3,rpghorrorstories,1
4,cacti,1
...,...,...
12916,highqualityreviews,1
12917,sefiefythis,1
12918,testcaseforcss,1
12919,tahrox,1


### Blocker on getting input for subreddit

Test code runs against the API instead, while awaiting a fix from Reddit support for the blocking issue.

## Model Training on current data for both Title/Body

### Starting model for title

In [12]:
# Copy the processed title data so the modelling cells below (which add an
# encoded column to title_df) leave result_df unchanged
title_df = result_df.copy()

In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# Report how many missing entries each column of the title data has
print(title_df.isnull().sum())

# Ten most frequent SOURCE_SUBREDDIT values (value_counts orders by count, highest first)
top_subreddits_title = title_df['SOURCE_SUBREDDIT'].value_counts().iloc[:10]
print("\nTop 10 SOURCE_SUBREDDITs in Title Data:", top_subreddits_title, sep="\n")

SOURCE_SUBREDDIT      0
TARGET_SUBREDDIT      0
POST_ID               0
TIMESTAMP             0
Positive Sentiment    0
Negative Sentiment    0
Compound Sentiment    0
dtype: int64

Top 10 SOURCE_SUBREDDITs in Title Data:
subredditdrama      22971
bestof              21170
titlegore            9500
shitredditsays       7338
shitpost             6657
circlebroke2         6037
switcharoo           5997
shitamericanssay     5482
drama                5478
shitstatistssay      4240
Name: SOURCE_SUBREDDIT, dtype: int64


In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Possibly useful later for hyperparameter tuning (optional)
from sklearn.model_selection import GridSearchCV

In [24]:
# Fit an integer encoder on the source-subreddit names, then store the resulting
# codes as a new column so the subreddit can be fed to the model
label_encoder_title = LabelEncoder().fit(title_df['SOURCE_SUBREDDIT'])
title_df['SOURCE_SUBREDDIT_ENCODED'] = label_encoder_title.transform(title_df['SOURCE_SUBREDDIT'])

In [None]:
# Features: encoded source subreddit plus the positive/negative scores; target: compound score
X_title = title_df.loc[:, ['SOURCE_SUBREDDIT_ENCODED', 'Positive Sentiment', 'Negative Sentiment']]
y_title = title_df.loc[:, 'Compound Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(
    X_title,
    y_title,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create the model: a 100-tree random forest regressor with a fixed seed,
# then fit it on the title training split
model_title = RandomForestRegressor(n_estimators=100, random_state=42)
model_title.fit(X_train_title, y_train_title)

In [None]:
# Score the held-out title split, then report MSE and R^2 for comparison with the body model
y_pred_title = model_title.predict(X_test_title)

mse_title = mean_squared_error(y_true=y_test_title, y_pred=y_pred_title)
r2_title = r2_score(y_true=y_test_title, y_pred=y_pred_title)

print(f"Title Data - Mean Squared Error: {mse_title}")
print(f"Title Data - R^2 Score: {r2_title}")

Title Data - Mean Squared Error: 0.013046367434664332
Title Data - R^2 Score: 0.9082322853705096


## Model for body (message)

In [28]:
# Copy the processed body data so the modelling cells below (which add an
# encoded column to body_df) leave result_df_body unchanged
body_df = result_df_body.copy()

In [30]:
# Report how many missing entries each column of the body data has
print(body_df.isnull().sum())

# Ten most frequent SOURCE_SUBREDDIT values (value_counts orders by count, highest first)
top_subreddits_body = body_df['SOURCE_SUBREDDIT'].value_counts().iloc[:10]
print("\nTop 10 SOURCE_SUBREDDITs in Body Data:", top_subreddits_body, sep="\n")

SOURCE_SUBREDDIT      0
TARGET_SUBREDDIT      0
POST_ID               0
TIMESTAMP             0
Positive Sentiment    0
Negative Sentiment    0
Compound Sentiment    0
dtype: int64

Top 10 SOURCE_SUBREDDITs in Body Data:
subredditdrama          4665
circlebroke             2358
shitliberalssay         1968
outoftheloop            1958
copypasta               1824
writingprompts          1707
hailcorporate           1560
circlejerkcopypasta     1517
conspiracy              1462
bestofoutrageculture    1354
Name: SOURCE_SUBREDDIT, dtype: int64


In [31]:
# Fit an integer encoder on the source-subreddit names, then store the resulting
# codes as a new column so the subreddit can be fed to the model
label_encoder_body = LabelEncoder().fit(body_df['SOURCE_SUBREDDIT'])
body_df['SOURCE_SUBREDDIT_ENCODED'] = label_encoder_body.transform(body_df['SOURCE_SUBREDDIT'])

In [33]:
# Features: encoded source subreddit plus the positive/negative scores; target: compound score
X_body = body_df.loc[:, ['SOURCE_SUBREDDIT_ENCODED', 'Positive Sentiment', 'Negative Sentiment']]
y_body = body_df.loc[:, 'Compound Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train_body, X_test_body, y_train_body, y_test_body = train_test_split(
    X_body,
    y_body,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Create the model: a 100-tree random forest regressor with a fixed seed,
# then fit it on the body training split
model_body = RandomForestRegressor(n_estimators=100, random_state=42)
model_body.fit(X_train_body, y_train_body)

In [35]:
# Score the held-out body split, then report MSE and R^2 for comparison with the title model
y_pred_body = model_body.predict(X_test_body)

mse_body = mean_squared_error(y_true=y_test_body, y_pred=y_pred_body)
r2_body = r2_score(y_true=y_test_body, y_pred=y_pred_body)

print(f"Body Data - Mean Squared Error: {mse_body}")
print(f"Body Data - R^2 Score: {r2_body}")

Body Data - Mean Squared Error: 0.042810763943084244
Body Data - R^2 Score: 0.9041332439796781


## Prediction for both models

In [43]:
def predict_compound_sentiment_title(subreddit_name, positive_sentiment=0.0, negative_sentiment=0.0):
    """Predict the compound sentiment for one subreddit using the title model.

    Parameters
    ----------
    subreddit_name : str
        Source subreddit name; it is encoded with ``label_encoder_title``.
    positive_sentiment : float, default 0.0
        Value for the 'Positive Sentiment' feature.
    negative_sentiment : float, default 0.0
        Value for the 'Negative Sentiment' feature.

    Returns
    -------
    The first (and only) element returned by ``model_title.predict`` for the
    single-row input.

    Raises
    ------
    ValueError
        If ``label_encoder_title`` cannot encode ``subreddit_name``.
    """
    # Encode the subreddit name; surface a clearer message if the encoder rejects it
    try:
        subreddit_encoded = label_encoder_title.transform([subreddit_name])[0]
    except ValueError as err:
        raise ValueError(
            f"Subreddit {subreddit_name!r} could not be encoded by the title label encoder "
            "(was it present in the title data?)"
        ) from err

    # Build a one-row frame carrying the same column names, in the same order, as the
    # training features, so the model receives named features rather than a bare list.
    features = pd.DataFrame(
        [[subreddit_encoded, positive_sentiment, negative_sentiment]],
        columns=['SOURCE_SUBREDDIT_ENCODED', 'Positive Sentiment', 'Negative Sentiment'],
    )

    # Predict and unwrap the single result
    prediction = model_title.predict(features)
    return prediction[0]

In [44]:
# Example: query the title model for one subreddit with sample positive/negative scores.
# The helper encodes the name with label_encoder_title, so it presumably has to be a
# subreddit that appeared in the title data — verify before trying other names.
new_subreddit = 'leagueoflegends' 
predicted_sentiment_title = predict_compound_sentiment_title(new_subreddit, positive_sentiment=0.1, negative_sentiment=0.05)
print(f"Predicted Compound Sentiment (Title Data) for {new_subreddit}: {predicted_sentiment_title}")

Predicted Compound Sentiment (Title Data) for leagueoflegends: 0.4038129999999997




In [41]:
def predict_compound_sentiment_body(subreddit_name, positive_sentiment=0.0, negative_sentiment=0.0):
    """Predict the compound sentiment for one subreddit using the body model.

    Parameters
    ----------
    subreddit_name : str
        Source subreddit name; it is encoded with ``label_encoder_body``.
    positive_sentiment : float, default 0.0
        Value for the 'Positive Sentiment' feature.
    negative_sentiment : float, default 0.0
        Value for the 'Negative Sentiment' feature.

    Returns
    -------
    The first (and only) element returned by ``model_body.predict`` for the
    single-row input.

    Raises
    ------
    ValueError
        If ``label_encoder_body`` cannot encode ``subreddit_name``.
    """
    # Encode the subreddit name; surface a clearer message if the encoder rejects it
    try:
        subreddit_encoded = label_encoder_body.transform([subreddit_name])[0]
    except ValueError as err:
        raise ValueError(
            f"Subreddit {subreddit_name!r} could not be encoded by the body label encoder "
            "(was it present in the body data?)"
        ) from err

    # Build a one-row frame carrying the same column names, in the same order, as the
    # training features, so the model receives named features rather than a bare list.
    features = pd.DataFrame(
        [[subreddit_encoded, positive_sentiment, negative_sentiment]],
        columns=['SOURCE_SUBREDDIT_ENCODED', 'Positive Sentiment', 'Negative Sentiment'],
    )

    # Predict and unwrap the single result
    prediction = model_body.predict(features)
    return prediction[0]

In [42]:
# Example: query the body model for one subreddit with sample positive/negative scores.
# The helper encodes the name with label_encoder_body, so it presumably has to be a
# subreddit that appeared in the body data — verify before trying other names.
new_subreddit = 'leagueoflegends'
predicted_sentiment_body = predict_compound_sentiment_body(new_subreddit, positive_sentiment=0.1, negative_sentiment=0.05)
print(f"Predicted Compound Sentiment (Body Data) for {new_subreddit}: {predicted_sentiment_body}")

Predicted Compound Sentiment (Body Data) for leagueoflegends: 0.779230142857143




## VADER sentiment analysis

In [45]:
import nltk
import praw
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon through NLTK's downloader
# (presumably required by the SentimentIntensityAnalyzer imported above — TODO confirm)
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ernestorivera/nltk_data...


True

In [46]:
# Create the VADER sentiment analyzer instance
sia = SentimentIntensityAnalyzer()