In [None]:
import sys
sys.path.append('../..')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import matplotlib as mpl
import seaborn as sns

from scipy import stats

import src.visualizor

In [None]:
# Load the processed comment-level dataset used by every cell below.
df = pd.read_csv(filepath_or_buffer='../../data/processed/data.csv')

In [None]:
# Build one row per video from the comments posted before the title's release.
#
# X keeps only the rows of `df` whose `commentDateOffset` is negative
# (presumably an offset relative to the release date -- TODO confirm the
# unit and sign convention against the data-processing step).
#
# The rows are then grouped by 'videoId', 'tconst' and 'primaryTitle' to get:
#   - numComments:   number of non-null `commentId` values per video
#   - sentiment:     mean `sentimentPredictedScore` over the video's comments
#   - averageRating: mean IMDb `averageRating` within the group
X = df[df.commentDateOffset < 0]

agg = X.groupby(['videoId', 'tconst', 'primaryTitle']).agg(
    numComments=('commentId', 'count'),
    sentiment=('sentimentPredictedScore', 'mean'),
    averageRating=('averageRating', 'mean'),
).reset_index()

# Keep only the videos that have at least MIN_COMMENTS pre-release comments.
MIN_COMMENTS = 100
agg = agg[agg.numComments >= MIN_COMMENTS]

# Split the videos at the median of their mean sentiment.
# `mask` marks the videos whose sentiment is at or below the median.
threshold = agg.sentiment.median()
mask = agg.sentiment <= threshold

# Use an explicit `>` rather than `~mask` for the upper group: a NaN sentiment
# compares False with `<=`, so `~mask` would silently file videos without a
# sentiment value under "above the median". With `>` such rows fall in
# neither group; when no NaN is present the result is identical to `~mask`.
agg_hi = agg[agg.sentiment > threshold]
agg_lo = agg[mask]

In [None]:
# Keyword arguments shared by the histograms drawn further down:
# `alpha` makes the bars semi-transparent and `density` requests
# density normalisation.
hist_params = dict(alpha=.4, density=True)

In [None]:
# Overlay the rating histograms of the above-median (agg_hi) and
# at-or-below-median (agg_lo) sentiment groups, both drawn with the
# keyword arguments collected in `hist_params`.
# NOTE(review): each `ax.hist` call derives its bin edges from its own data,
# so the two histograms are not guaranteed to share bins -- consider passing a
# common `bins` argument if a bar-by-bar comparison matters.

fig, ax = plt.subplots()
# The labels are raw f-strings: the mathtext command `\leq` contains a
# backslash that is not a valid Python escape sequence, which triggers an
# invalid-escape warning in a normal string. The rendered text is unchanged.
ax.hist(agg_hi.averageRating, **hist_params, label=rf'Sentiment $>$ {round(threshold, 2)}')
ax.hist(agg_lo.averageRating, **hist_params, label=rf'Sentiment $\leq$ {round(threshold, 2)}')
ax.set_title('Distribution of ratings for above/below\nmedian sentiments pre release')
ax.set_xlabel('IMDb rating')
ax.set_ylabel('Density')
# The trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();

In [None]:
# Write the histogram figure to the reports folder as an SVG file.
fig.savefig(fname='../../reports/figs/stat_hist.svg')

In [None]:
# One-sided t-test for the means of two independent samples of ratings.
# Three arguments are passed to `stats.ttest_ind`:
#   1) sample 1: the ratings of agg_lo
#   2) sample 2: the ratings of agg_hi
#   3) alternative='less': the alternative hypothesis is that the mean of the
#      distribution underlying the first sample is less than the mean of the
#      distribution underlying the second sample.
# NOTE(review): `equal_var` is left at its default -- confirm that the
# equal-variance form of the test is what is intended here.
stats.ttest_ind(
    agg_lo['averageRating'],
    agg_hi['averageRating'],
    alternative='less',
)