## Sentiment Analysis on Scraped Reddit Posts

#### 1. Retrieve sentiment analyzed data from CSV

- The sentiment analysis of the scraped Reddit posts was done with OpenAI's GPT-4 model.
- Prompt engineering was used so that each analysis targets a single player whose FULL NAME is mentioned in the post.
- The result is on a scale of -1 to 1, with -1 being very negative and 1 being very positive.

In [1]:
import pandas as pd
import plotly.express as px


# Load the sentiment-scored posts from the CSV export (one row per post).
df = pd.read_csv('sentiment_analysis_results.csv')

# Parse the post dates so the `.dt` accessor can be used for filtering.
df['created_date'] = pd.to_datetime(df['created_date'])

# Keep only the posts created in 2024 or later, then display the result.
posted_since_2024 = df['created_date'].dt.year >= 2024
df = df[posted_since_2024]
df

Unnamed: 0,_id,created_date,score,combined_text,mentioned_players,mentioned_players_count,sentiment_result
0,6605c7cec120fd154e58655f,2024-03-28,0,"McDavid's case for the Hart trophy\nFirst off,...",Johnny Gaudreau,1,0.1
1,6605c7d07bcaf9812bb4a654,2024-03-27,1876,Sidney Crosby only made the NHL because his pa...,Sidney Crosby,1,-0.6
2,6605c7d07bcaf9812bb4a666,2024-03-27,51,"As requested by my Oilers fans, Here’s my late...",Zach Hyman,1,1.0
3,6605c7d2246827bb414386ae,2024-03-27,549,Jack Hughes buries the Leafs\n,Jack Hughes,1,1.0
4,6605c7d31ee4e9142a3782af,2024-03-27,178,Sidney Crosby has 7 points in the last 2 games...,Sidney Crosby,1,1.0
...,...,...,...,...,...,...,...
155,660f8e4163f0defdfbd7387e,2024-04-03,336,Slafkovsky on Nick Cousins’ cheapshot behavior\n,Nick Cousins,1,-1.0
156,6610d505ecf4d0160a2a9cb1,2024-04-06,12,Shane Wright scores back to back goals\n,Shane Wright,1,1.0
157,6610d5065b9561631f777fa6,2024-04-06,25,Zach Benson’s stick explodes as he tries to en...,Zach Benson,1,-0.2
158,6610d5074ad0f26c75e0d949,2024-04-05,55,New Artemi Panarin reference just dropped\n,Artemi Panarin,1,0.0


#### 2. Analyze sentiment score of all players

- The posts are grouped by player name and sorted by the number of posts mentioning each player, in descending order, as shown below.
- Average sentiment score is calculated for each player.

In [141]:
# Per-player summary: how many posts mention the player ('count') and the mean
# of their sentiment scores ('average_sentiment'), most-mentioned players first.
player_grouped_df = (
    df.groupby('mentioned_players')
    .agg(
        count=pd.NamedAgg(column='sentiment_result', aggfunc='size'),
        average_sentiment=pd.NamedAgg(column='sentiment_result', aggfunc='mean'),
    )
    .rename_axis('name')  # the group key becomes the 'name' column on reset_index
    .reset_index()
    .sort_values(by='count', ascending=False)
)

player_grouped_df

Unnamed: 0,name,count,average_sentiment
64,Sidney Crosby,14,0.392857
4,Auston Matthews,5,0.880000
47,Matt Rempe,5,-0.100000
52,Nikita Kucherov,4,0.275000
50,Nick Cousins,4,-0.775000
...,...,...,...
43,Kaiden Guhle,1,-0.500000
44,Kevin Hayes,1,1.000000
45,Matt Dumba,1,0.000000
46,Matt Grzelcyk,1,0.000000


In [87]:
# Histogram of the per-player average sentiment (one observation per player row).
fig = px.histogram(
    player_grouped_df,
    x='average_sentiment',
    title='Distribution of Average Sentiment',
)
fig.update_layout(
    xaxis_title='Average Sentiment',
    yaxis_title='Frequency',
    width=800,
    height=400,
)
fig.show()

In [88]:
# First ten rows of player_grouped_df. These are the ten most-mentioned players
# only as long as that frame is still sorted by 'count' in descending order.
top10_df = player_grouped_df.head(10)

# Average sentiment (x) against number of mentions (y), one marker per player.
fig = px.scatter(
    top10_df,
    x='average_sentiment',
    y='count',
    text='name',  # draws each player's name as a static label on its marker
    title='Scatter Plot of Average Sentiment vs. Count',
    hover_data=['name']  # also list the name in the hover tooltip
)

# Put the static name labels above their markers so they do not cover them.
fig.update_traces(textposition='top center')
fig.update_layout(width=800, height=500, xaxis_title='Average Sentiment', yaxis_title='Count')
fig.show()

In [89]:
# Restrict to players mentioned in more than one post, so each average is based
# on at least two scores.
more_than_1_df = player_grouped_df[player_grouped_df['count'] > 1]

sorted_df = more_than_1_df.sort_values(by='average_sentiment', ascending=False)

# Ten highest and ten lowest average sentiments.
top_10 = sorted_df.head(10)
bottom_10 = sorted_df.tail(10)

# With fewer than 20 qualifying players the head and tail slices overlap, and a
# player appearing twice would be drawn twice (stacked) in the bar chart, so
# keep each player only once.
combined_df = pd.concat([top_10, bottom_10]).drop_duplicates(subset='name')

# Ascending order so the bars run from most negative to most positive.
sorted_combined_df = combined_df.sort_values(by='average_sentiment')
# Negative averages are drawn in red, everything else in blue.
sorted_combined_df['color'] = sorted_combined_df['average_sentiment'].apply(
    lambda x: 'lightcoral' if x < 0 else 'royalblue'
)

fig = px.bar(
    sorted_combined_df,
    x='average_sentiment',
    y='name',
    title='Average Sentiment of Players with more than 1 Mention',
    color='color',
    # The 'color' column already holds CSS colour names; map each to itself.
    color_discrete_map={'lightcoral': 'lightcoral', 'royalblue': 'royalblue'}
)

fig.update_layout(xaxis_title='Average Sentiment', yaxis_title='Name', width=800, height=600, showlegend=False)
fig.show()

#### 3. More analysis on a single player

- A deep dive into how sentiment changes over time for each player who is frequently mentioned on Reddit.

In [22]:
# Posts whose mentioned player is Sidney Crosby, ordered from oldest to newest.
crosby_df = df[df['mentioned_players'] == "Sidney Crosby"].copy()
crosby_df['created_date'] = pd.to_datetime(crosby_df['created_date'])
crosby_df = crosby_df.sort_values('created_date')

# Mean sentiment per created_date, then the expanding (running) mean of those
# per-date values.
daily_avg_sentiment = crosby_df.groupby('created_date', as_index=False)['sentiment_result'].mean()
daily_avg_sentiment['cumulative_moving_avg'] = (
    daily_avg_sentiment['sentiment_result'].expanding().mean()
)

fig = px.line(
    daily_avg_sentiment,
    x='created_date',
    y='cumulative_moving_avg',
    title='Cumulative Average Sentiment for SIDNEY CROSBY Over Time',
)
fig.update_traces(mode='lines+markers')
# Fixed y-range slightly wider than the [-1, 1] score scale.
fig.update_yaxes(range=[-1.1, 1.1])
fig.update_layout(
    width=800,
    height=500,
    xaxis_title='Date',
    yaxis_title='Average Sentiment',
)
fig.show()

In [21]:
# Posts whose mentioned player is Auston Matthews, ordered from oldest to newest.
matthews_df = df.query('mentioned_players == "Auston Matthews"').copy()
matthews_df['created_date'] = pd.to_datetime(matthews_df['created_date'])
matthews_df = matthews_df.sort_values('created_date')

# Mean sentiment per created_date, then the expanding (running) mean of those
# per-date values.
daily_avg_sentiment = matthews_df.groupby('created_date')['sentiment_result'].mean().reset_index()
daily_avg_sentiment['cumulative_moving_avg'] = daily_avg_sentiment['sentiment_result'].expanding().mean()

fig = px.line(
    daily_avg_sentiment,
    x='created_date',
    y='cumulative_moving_avg',
    title='Cumulative Average Sentiment for AUSTON MATTHEWS Over Time'
)
fig.update_traces(mode='lines+markers')
fig.update_layout(xaxis_title='Date', yaxis_title='Average Sentiment', width=800, height=500)
# Fixed y-range slightly wider than the [-1, 1] score scale.
fig.update_yaxes(range=[-1.1, 1.1])
fig.show()

In [20]:
# Posts whose mentioned player is Nick Cousins, ordered from oldest to newest.
cousins_df = df[df['mentioned_players'] == "Nick Cousins"].copy()
cousins_df['created_date'] = pd.to_datetime(cousins_df['created_date'])
cousins_df = cousins_df.sort_values('created_date')

# Mean sentiment per created_date, then the expanding (running) mean of those
# per-date values.
daily_avg_sentiment = cousins_df.groupby('created_date', as_index=False)['sentiment_result'].mean()
daily_avg_sentiment['cumulative_moving_avg'] = (
    daily_avg_sentiment['sentiment_result'].expanding().mean()
)

fig = px.line(
    daily_avg_sentiment,
    x='created_date',
    y='cumulative_moving_avg',
    title='Cumulative Average Sentiment for NICK COUSINS Over Time',
)
fig.update_traces(mode='lines+markers')
# Fixed y-range slightly wider than the [-1, 1] score scale.
fig.update_yaxes(range=[-1.1, 1.1])
fig.update_layout(
    width=800,
    height=500,
    xaxis_title='Date',
    yaxis_title='Average Sentiment',
)
fig.show()

In [24]:
# Posts whose mentioned player is Matt Rempe, ordered from oldest to newest.
rempe_df = df[df['mentioned_players'] == "Matt Rempe"].copy()
rempe_df['created_date'] = pd.to_datetime(rempe_df['created_date'])
rempe_df = rempe_df.sort_values('created_date')

# Mean sentiment per created_date, then the expanding (running) mean of those
# per-date values.
daily_avg_sentiment = rempe_df.groupby('created_date', as_index=False)['sentiment_result'].mean()
daily_avg_sentiment['cumulative_moving_avg'] = (
    daily_avg_sentiment['sentiment_result'].expanding().mean()
)

fig = px.line(
    daily_avg_sentiment,
    x='created_date',
    y='cumulative_moving_avg',
    title='Cumulative Average Sentiment for MATT REMPE Over Time',
)
fig.update_traces(mode='lines+markers')
# Fixed y-range slightly wider than the [-1, 1] score scale.
fig.update_yaxes(range=[-1.1, 1.1])
fig.update_layout(
    width=800,
    height=500,
    xaxis_title='Date',
    yaxis_title='Average Sentiment',
)
fig.show()