# Analyzing 4 million movie comments

In [None]:
import pandas as pd 
import numpy as np 

import statsmodels.api as sm
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

# visualization and standard library packages
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from tqdm import tqdm
from collections import Counter
import os

%matplotlib inline

In [None]:
# Columns to read from the cleaned comment dump.
cols = ['index', 'comment_time', 'comment_id', 'movie_id', 
        'user_md5', 'rating', 'content', 'pred_label']

# Load the comments, parse the timestamp and use it as the index.
# `comments_unsort` keeps the file order; `comments` is the time-sorted view.
comments_unsort = (
    pd.read_csv(f'{os.getcwd()}/comments_cleaned.csv',
                index_col='index',
                usecols=cols,
                dtype={'content': str})
    .assign(comment_time=lambda frame: pd.to_datetime(frame['comment_time']))
    .set_index('comment_time')
)
comments = comments_unsort.sort_index()
comments

## Comments info.

Ratings are quality assessments

In [None]:
# Calendar year of every comment, taken from the DatetimeIndex.
year_info = {'year': comments.index.year}
comments['year'] = year_info['year']

# Number of rows per calendar year.
# size() is used instead of count(): count() skips rows whose 'content' is NaN,
# but get_rating_by_year / get_sentiment_by_year consume these numbers as
# positional slice lengths, so they must equal the true number of rows per year.
comments_per_year = comments['content'].groupby(comments.index.year).size()
comments_count = comments_per_year.tolist()
comments_count_df = comments_per_year.to_frame(name='content')

In [None]:
def get_rating_by_year(comments, comments_count, start_year=2005):
    '''
    Summarise ratings and user activity for consecutive yearly slices.

    Parameters
    ----------
    comments : DataFrame with 'rating' and 'user_md5' columns, ordered so that
        the rows of each year are contiguous (the function slices by position).
    comments_count : iterable of int, number of rows in each consecutive slice.
    start_year : label given to the first slice; each following slice is
        labelled with the next integer. This assumes the slices are
        consecutive years with no gap.

    Returns
    -------
    (dic, total_users_dic)
        dic maps year -> [count of rating 1..5, count of other/NaN ratings,
        the six matching proportions, rating mean, rating std, number of
        distinct users, mean and std of comments per user].
        total_users_dic maps user_md5 -> number of rows over all slices.
    '''
    valid_ratings = (1.0, 2.0, 3.0, 4.0, 5.0)
    nan = float('nan')

    cur = 0
    year = start_year
    dic = {}
    total_users_dic = {}
    for year_num in comments_count:
        rating_counts = dict.fromkeys(valid_ratings, 0)
        rating_nan = 0
        score_lst = []   # valid ratings only
        users_dic = {}   # comments per user within this slice

        # Slice once and reuse it for both columns.
        year_slice = comments.iloc[cur: cur + year_num]
        for x, md5 in tqdm(zip(year_slice['rating'], year_slice['user_md5']),
                           total=year_num):
            users_dic[md5] = users_dic.get(md5, 0) + 1
            total_users_dic[md5] = total_users_dic.get(md5, 0) + 1

            # NaN never compares equal to a key, so it lands in the else branch.
            if x in rating_counts:
                rating_counts[x] += 1
                score_lst.append(x)
            else:
                rating_nan += 1

        counts = [rating_counts[r] for r in valid_ratings] + [rating_nan]
        # An empty slice yields NaN statistics instead of a ZeroDivisionError.
        portions = [c / year_num if year_num else nan for c in counts]

        scores = np.array(score_lst, dtype=float)
        rating_mean = scores.mean() if scores.size else nan
        rating_std = scores.std() if scores.size else nan

        per_user = np.array(list(users_dic.values()), dtype=float)
        user_num = len(users_dic)
        user_avg_comments = per_user.mean() if per_user.size else nan
        user_std_comments = per_user.std() if per_user.size else nan

        dic[year] = counts + portions + [rating_mean, rating_std, user_num,
                                         user_avg_comments, user_std_comments]
        cur += year_num
        year += 1
    return dic, total_users_dic

In [None]:
# Per-year rating statistics, plus the number of comments per user over all years.
dic, user_dic = get_rating_by_year(comments, comments_count)

In [None]:
# One column per year, one row per statistic returned by get_rating_by_year.
df = pd.DataFrame(dic, index = ['rating=1.0','rating=2.0', 'rating=3.0', 'rating=4.0', 'rating=5.0', 'rating=NaN',
                               'portion_1.0','portion_2.0', 'portion_3.0', 'portion_4.0', 'portion_5.0', 'portion_NaN',
                               'rating_mean', 'rating_std', 'user_num', 'user_avg_comments', 'user_std_comments'])
x = df.columns

fig, ax = plt.subplots(figsize=(12,8))

# (values, colour, legend label) for each curve, drawn in this order.
_rating_curves = [
    (df.loc['portion_1.0', :] + df.loc['portion_2.0', :], 'royalblue', 'rating=1.0 & rating=2.0'),
    (df.loc['portion_3.0', :], 'olivedrab', 'rating=3.0'),
    (df.loc['portion_4.0', :] + df.loc['portion_5.0', :], 'red', 'rating=4.0 & rating=5.0'),
    (df.loc['portion_NaN', :], 'k', 'rating=NaN'),
]
for _values, _colour, _legend in _rating_curves:
    ax.plot(x, _values, color=_colour, linewidth=3, alpha=0.5, label=_legend)

plt.title("Rating Distribution",fontsize=25)
plt.ylabel('Proportion',fontsize=20)
plt.ylim([0, 0.6])
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(loc = 'upper right', fontsize=15)
plt.savefig('figs/rating_distribution_2005_2019.png', dpi=600)

In [None]:
comments_plt = comments.reset_index()
# Only rows that carry one of the five valid star ratings enter the mean.
_rated_rows = comments_plt[comments_plt['rating'].isin([1.0, 2.0, 3.0, 4.0, 5.0])]

# default CI: 95%
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
sns.lineplot(data=_rated_rows, x="year", y="rating", ci=95)
ax.set_yticks([3.0, 3.2, 3.4, 3.6, 3.8, 4.0]);
plt.title("Mean rating(CI: 95%)", fontsize=25)
plt.xlabel('year', fontsize=20)
plt.ylabel('rating', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.savefig('figs/mean_rating_2005_2019.png', dpi=600)

In [None]:
# Plot the number of comments per year (bars, left axis) together with the
# number of distinct users per year (line, right axis).
fig = plt.figure()
fig.set_size_inches(12, 9)
ax1 = fig.add_subplot(111)
# No yerr: the previous constant yerr=0.5 was not derived from the data.
ax1.bar(x = comments_count_df.index, 
        height = comments_count_df['content'], 
        width = 0.8, 
        edgecolor = 'black', 
        linewidth = 2, 
        align = 'center', 
        color = 'royalblue',
        alpha=0.2,
        label='comments')

ax2 = ax1.twinx()
ax2.plot(x, df.loc['user_num', :], label = 'users',linewidth=3, alpha=0.6)

plt.title("Number of comments and appeared users by year", fontsize=25)

ax1.set_xlabel('year', fontsize=20)
ax1.set_ylabel('Number of Comments', fontsize=20)
ax2.set_ylabel('Number of appeared users', fontsize=20)

# Style the ticks without overwriting their text. The previous
# set_xticklabels / set_yticklabels calls replaced the labels with hard-coded
# ranges that were not tied to the tick positions, so the axes could show
# values that did not match the plotted data.
ax1.set_xticks(list(comments_count_df.index)[::2])
ax1.tick_params(axis='both', labelsize=20)
ax2.tick_params(axis='y', labelsize=20)
# Show plain counts instead of a scientific-notation offset.
ax1.ticklabel_format(style='plain', axis='y')
ax2.ticklabel_format(style='plain', axis='y')

h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax1.legend(h1+h2, l1+l2, loc='upper left', fontsize=20)

plt.savefig('figs/num_comments_2005_2019.png', dpi=600)

### Movie rating distribution

In [None]:
# Distinct movie ids that appear in the comments.
movie_ids = set(comments['movie_id'])
f'number of movies: {len(movie_ids)}'

In [None]:
# Movie metadata; the two columns used for joins are renamed to lower case.
movies_df = (
    pd.read_csv(f'{os.getcwd()}/movies.csv')
    .rename(columns={'MOVIE_ID': 'movie_id', 'YEAR': 'year'})
)
movies_df.head()

In [None]:
# movie_id -> release year lookup, built directly from the two columns.
# This avoids DataFrame.iterrows(), which constructs a Series per row.
# For duplicated movie ids the last row wins, as with the row-by-row loop.
movie_year_dic = dict(zip(movies_df['movie_id'], movies_df['year']))

In [None]:
# movie_id -> [list of ratings given in comments, release year].
# Movies without an entry in movie_year_dic are collected in missing_movies.
missing_movies = set()
movie_rating_year_dic = {}
for _, comment_row in tqdm(comments.iterrows()):
    commented_movie = comment_row['movie_id']
    if commented_movie not in movie_year_dic:
        missing_movies.add(commented_movie)
        continue
    if commented_movie not in movie_rating_year_dic:
        movie_rating_year_dic[commented_movie] = [[], movie_year_dic[commented_movie]]
    movie_rating_year_dic[commented_movie][0].append(comment_row['rating'])
'miss {} movies'.format(len(missing_movies))

In [None]:
# Flatten movie_rating_year_dic into parallel lists:
# movie id, NaN-ignoring mean of its comment ratings, release year.
movie_idx = list(movie_rating_year_dic.keys())
rating = [np.nanmean(entry[0]) for entry in movie_rating_year_dic.values()]
year = [entry[1] for entry in movie_rating_year_dic.values()]

movie_info = {
    'movie_id': movie_idx,
    'movie_rating': rating,
    'release_year': year,
}
df_movie_info = pd.DataFrame(movie_info)
# Movies whose comments carry no rating at all end up with a NaN mean.
nan_rating_mask = df_movie_info['movie_rating'].isnull()

In [None]:
# Keep movies released between 2005 and 2019 that have a mean rating.
year_mask = (df_movie_info['release_year'] >= 2005) & (df_movie_info['release_year'] <= 2019)
_rated_recent_movies = df_movie_info[~nan_rating_mask & year_mask]

fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
sns.lineplot(data=_rated_recent_movies,
             x='release_year',
             y='movie_rating',
             ci=95)
plt.title("Movie rating infomation (CI: 95%)",fontsize=25)
plt.xlabel('release year',fontsize=20)
plt.ylabel('movie rating', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.savefig('figs/movie_rating_info(2005-2019).png', dpi=600)

## Check the Ranking Algorithm

In this section, we check whether the ranking algorithm systematically prioritizes highly positive or highly negative comments.

In [None]:
# Rank of each comment within its movie, in file order (1 = first listed).
movie_comm_rank_dic = {}
ranks = []
for _, listed_comment in tqdm(comments_unsort.iterrows()):
    listed_movie = listed_comment['movie_id']
    position = movie_comm_rank_dic.get(listed_movie, 0) + 1
    movie_comm_rank_dic[listed_movie] = position
    ranks.append(position)
comments_unsort['rank'] = ranks

In [None]:
# Regress predicted sentiment on the within-movie rank (plus intercept).
_rank_design = sm.add_constant(comments_unsort['rank'])
est = sm.OLS(endog=comments_unsort['pred_label'], exog=_rank_design).fit()

# Render the fit as a LaTeX table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
_latex_table = stargazer.render_latex()
print(_latex_table)

In [None]:
# Release the unsorted copy that was only needed for the ranking check.
del comments_unsort

## Sentiment in the community

In [None]:
def get_sentiment_by_year(comments, comments_count, start_year=2005):
    '''
    Get the sentiment trend for consecutive yearly slices.

    Parameters
    ----------
    comments : DataFrame with 'pred_label' and 'user_md5' columns, ordered so
        that the rows of each year are contiguous (slicing is positional).
    comments_count : iterable of int, number of rows in each consecutive slice.
    start_year : label given to the first slice; each following slice is
        labelled with the next integer. This assumes the slices are
        consecutive years with no gap.

    Returns
    -------
    (dic, total_users_dic)
        dic maps year -> [positive count, neutral count, negative count,
        the three matching proportions, sentiment mean, sentiment std].
        A label equal to 1 counts as positive, 0 as neutral, and anything
        else (including NaN) as negative.
        total_users_dic maps user_md5 -> number of rows over all slices.
    '''
    nan = float('nan')

    cur = 0
    year = start_year
    dic = {}
    total_users_dic = {}
    for year_num in comments_count:
        pos = neu = neg = 0
        sentiment_lst = []

        # Slice once and reuse it for both columns.
        year_slice = comments.iloc[cur: cur + year_num]
        for sentiment, md5 in tqdm(zip(year_slice['pred_label'], year_slice['user_md5']),
                                   total=year_num):
            total_users_dic[md5] = total_users_dic.get(md5, 0) + 1

            if sentiment == 1:
                pos += 1
            elif sentiment == 0:
                neu += 1
            else:
                neg += 1
            sentiment_lst.append(sentiment)

        # An empty slice yields NaN statistics instead of a ZeroDivisionError.
        pos_portion, neu_portion, neg_portion = (
            (c / year_num if year_num else nan) for c in (pos, neu, neg)
        )

        labels = np.array(sentiment_lst)
        sentiment_mean = labels.mean() if labels.size else nan
        sentiment_std = labels.std() if labels.size else nan

        dic[year] = [pos, neu, neg,
                     pos_portion, neu_portion, neg_portion,
                     sentiment_mean, sentiment_std]
        cur += year_num
        year += 1
    return dic, total_users_dic

In [None]:
# NOTE: rebinds `dic` and `user_dic` (previously holding the rating results)
# to the per-year sentiment statistics and the per-user comment totals.
dic, user_dic = get_sentiment_by_year(comments, comments_count)

In [None]:
# One column per year, one row per statistic returned by get_sentiment_by_year.
df = pd.DataFrame(dic, index = ['positive','neutral', 'negative',
                               'positive_portion','neutral_portion', 'negative_portion',
                               'sentiment_mean', 'sentiment_std'])
x = df.columns

fig, ax = plt.subplots(figsize=(12,8))

# (row in df, colour, legend label) for each curve, drawn in this order.
_sentiment_curves = [
    ('negative_portion', 'royalblue', 'negative'),
    ('neutral_portion', 'olivedrab', 'neutral'),
    ('positive_portion', 'red', 'positive'),
]
for _row_name, _colour, _legend in _sentiment_curves:
    ax.plot(x, df.loc[_row_name, :], color=_colour, linewidth=3, alpha=0.5, label=_legend)

plt.title("Sentiment Distribution",fontsize=25)
plt.xlabel('year',fontsize=20)
plt.ylabel('Proportion',fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.ylim([0.05, 0.65])
plt.legend(loc = 'upper right', fontsize=20)
plt.savefig('figs/sentiment_distribution_2005_2019.png', dpi=600)

In [None]:
# Mean predicted sentiment per comment year with a 95% confidence band.
comments_plt = comments.reset_index()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
sns.lineplot(data=comments_plt, x="year", y="pred_label", ci=95)

plt.title("Mean sentiment(CI: 95%)",fontsize=25)
plt.xlabel('year',fontsize=20)
plt.ylabel('mean sentiment', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.savefig('figs/mean_sentiment_2005_2019.png', dpi=600)
# The plotting copy is no longer needed once the figure is saved.
del comments_plt

### Comparison between popular and less popular movies

In [None]:
# movie_id -> number of comments on that movie.
movie_dic = {}
for _, comment_row in tqdm(comments.iterrows()):
    commented_movie = comment_row['movie_id']
    movie_dic[commented_movie] = movie_dic.get(commented_movie, 0) + 1

In [None]:
# Movies with at least 220 comments count as popular.
_popular_sizes = [n_comments for n_comments in tqdm(movie_dic.values()) if n_comments >= 220]
popular_movies = len(_popular_sizes)
comments_from_popular = sum(_popular_sizes)

In [None]:
# Per-comment popularity label of the commented movie (same 220 threshold).
movie_popularity = [
    'popular' if movie_dic[comment_row['movie_id']] >= 220 else 'less popular'
    for _, comment_row in tqdm(comments.iterrows())
]

In [None]:
# Attach the popularity label as an extra column, then expose the time index
# as a regular column for plotting.
_popularity_col = pd.DataFrame({'popularity': movie_popularity}, index=comments.index)
comments_movie = pd.concat([comments, _popularity_col], axis=1)
comments_movie_plt = comments_movie.reset_index()

In [None]:
# Mean sentiment per year, split by movie popularity (default CI: 95%).
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
sns.lineplot(data=comments_movie_plt,
             x="year",
             y="pred_label",
             hue="popularity",
             hue_order=['popular', 'less popular'],
             ci=95)
plt.title("Popular moives vs. Less popular movies(CI: 95%)", fontsize=25)
plt.xlabel('year',fontsize=20)
plt.ylabel('mean sentiment',fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.savefig('figs/mean_sentiment_comparison_2005_2019.png', dpi=600)

In [None]:
# Release the plotting copy; `comments_movie` itself is kept.
del comments_movie_plt

## Add movie info to comments 

In [None]:
# Index the movie table by movie_id so rows can be fetched with .loc[movie_id].
# Not idempotent: re-running this cell fails because 'movie_id' is no longer a column.
movies_df = movies_df.set_index('movie_id')

In [None]:
# First '/'-separated entry of REGIONS and GENRES for every movie in
# df_movie_info. When either field is not a string (AttributeError on
# .split), both values become the string 'NaN' and the row label is recorded
# in error_movie.
_first_entry = lambda field: field.split('/')[0].strip()

main_region = []
main_genre = []
error_movie = []
for idx, movie in tqdm(df_movie_info.iterrows()):
    movie_info = movies_df.loc[movie['movie_id']]
    try:
        region, genre = _first_entry(movie_info['REGIONS']), _first_entry(movie_info['GENRES'])
    except AttributeError:
        region = genre = 'NaN'
        error_movie.append(idx)
    main_region.append(region)
    main_genre.append(genre)

In [None]:
# Attach the main genre / region lists (one entry per row of df_movie_info).
df_movie_info['genre'] = main_genre
df_movie_info['region'] = main_region

In [None]:
# Remove the movies whose region/genre could not be parsed and renumber the rows.
df_movie_info = df_movie_info.drop(index=error_movie).reset_index(drop=True)

In [None]:
# Store genre and region as pandas categoricals.
CATEGORICAL_COLS = ['genre', 'region']
df_movie_info[CATEGORICAL_COLS] = df_movie_info[CATEGORICAL_COLS].astype('category')
df_movie_info

In [None]:
# Index by movie_id so the per-comment lookup below can use .loc[movie_id].
df_movie_info = df_movie_info.set_index('movie_id')
df_movie_info

In [None]:
# Move the comment_time index back into a column; rows get a positional index.
comments = comments.reset_index()

In [None]:
# For every comment, look up its movie's rating, release year, genre and
# region. Comments whose movie is missing from df_movie_info get 'NaN'
# placeholders and their row label is recorded in idx_list.
_movie_fields = ('movie_rating', 'release_year', 'genre', 'region')

table = []
idx_list = []
for idx, comment in tqdm(comments.iterrows()):
    try:
        info = df_movie_info.loc[comment['movie_id']]
        table.append([info[field] for field in _movie_fields])
    except KeyError:
        idx_list.append(idx)
        table.append(['NaN'] * 4)

In [None]:
# Attach the looked-up movie attributes, then drop the comments whose movie
# was not found (their rows hold 'NaN' placeholders).
_movie_attr_cols = ['movie_rating', 'movie_release_year', 'genre', 'region']
comments[_movie_attr_cols] = table
comments.drop(index=idx_list, inplace=True)

## Community sentiment

Sentiment change over time, controlling for movie characteristics

In [None]:
# Numeric regressors as float64, genre/region as categoricals.
_float_cols = ['year','movie_rating', 'movie_release_year']
comments[_float_cols] = comments[_float_cols].astype('float64')
_category_cols = ['genre', 'region']
comments[_category_cols] = comments[_category_cols].astype('category')

In [None]:
# Drop comments whose movie has no mean rating (NaN) and renumber the rows.
# The reset_index() result is assigned here; previously it was computed,
# displayed in full and then discarded, leaving the index unchanged.
comments = comments.dropna(subset=['movie_rating']).reset_index(drop=True)
comments.head()

In [None]:
# Re-index the comments by time and sort them chronologically.
comments['comment_time'] = pd.to_datetime(comments['comment_time'])
comments = comments.set_index('comment_time').sort_index()

# ISO week number of every comment. DatetimeIndex.weekofyear was deprecated in
# pandas 1.1 and removed in 2.0; isocalendar().week is its replacement.
# The result is converted to a plain NumPy array (int64, or float64 with NaN
# when timestamps are missing) so the column keeps an ordinary numeric dtype.
iso_week = comments.index.isocalendar().week
if iso_week.isna().any():
    comments['week'] = iso_week.to_numpy(dtype='float64', na_value=np.nan)
else:
    comments['week'] = iso_week.to_numpy(dtype='int64')

In [None]:
# Running index of (year, week) periods, in the order rows are iterated:
# the counter advances by one whenever the year or the week of the current
# row differs from the previous one.
week_by_year_idx = []
idx = 1
# Starting point of the counter: rows in week 23 of 2005 get index 1.
# NOTE(review): presumably the first comment falls in that week; if it does
# not, the first row already advances the counter — confirm against the data.
year = 2005
week = 23
for _, row in tqdm(comments.iterrows()):
    if row['year'] != year:
        # New year: resync the week too, so the check below does not count twice.
        year = row['year']
        week = row['week']
        idx += 1
    if week != row['week']:
        idx += 1
        week = row['week']
    week_by_year_idx.append(idx)
comments['commented_week'] = week_by_year_idx

In [None]:
# One-hot encode genre and region; the resulting columns are prefixed with
# 'genre_' / 'region_' (pandas' default prefix is the source column name).
comm_encoded = pd.get_dummies(comments, columns=['genre', 'region'])
comm_encoded

In [None]:
# Regressors: comment year, movie rating, movie release year and every
# genre/region dummy. Columns are selected by name rather than by position
# (previously columns[6:9] + columns[11:]), because positions depend on the
# column order of the input CSV and on any column added earlier.
reg_cols = ['year', 'movie_rating', 'movie_release_year'] + [
    col for col in comm_encoded.columns
    if str(col).startswith(('genre_', 'region_'))
]

In [None]:
# Fit on a reproducible 10% sample, restricted to comments made up to 2015.
comm_encoded_sample = comm_encoded.sample(frac=0.1, random_state=42)
year_mask_1 = (comm_encoded_sample['year'] <=2015)
_sample_rows = comm_encoded_sample[year_mask_1]
est = sm.OLS(endog=_sample_rows['pred_label'],
             exog=sm.add_constant(_sample_rows[reg_cols])).fit()

# Render the fit as a LaTeX table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
_latex_table = stargazer.render_latex()
print(_latex_table)

In [None]:
# Sentiment regressed on comment year, movie rating and release year,
# using all comments made up to 2015.
year_mask_1 = (comments['year'] <=2015)
_rows_to_2015 = comments[year_mask_1]
_regressors = _rows_to_2015[['year', 'movie_rating', 'movie_release_year']]
est = sm.OLS(endog=_rows_to_2015['pred_label'],
             exog=sm.add_constant(_regressors)).fit()

# Render the fit as a LaTeX table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
_latex_table = stargazer.render_latex()
print(_latex_table)

In [None]:
# Sentiment regressed on comment year only, using all comments made up to 2015.
year_mask_1 = (comments['year'] <=2015)
_rows_to_2015 = comments[year_mask_1]
est = sm.OLS(endog=_rows_to_2015['pred_label'],
             exog=sm.add_constant(_rows_to_2015[['year']])).fit()

# Render the fit as a LaTeX table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
_latex_table = stargazer.render_latex()
print(_latex_table)

## User sentiment

In [None]:
"95 percenitle user's number of comments is %.1f"% np.percentile(list(user_dic.values()), 95)

In [None]:
# Users at or above the 95th percentile of comment counts are 'active',
# users with exactly one comment are 'least active', the rest 'less active'.
threshold_comments = int(np.percentile(list(user_dic.values()), 95))
is_active = []
for _, comment_row in tqdm(comments.iterrows()):
    n_user_comments = user_dic[comment_row['user_md5']]
    if n_user_comments >= threshold_comments:
        activity = 'active'
    elif n_user_comments == 1:
        activity = 'least active'
    else:
        activity = 'less active'
    is_active.append(activity)

In [None]:
# Move comment_time back into a column, attach the activity label and store
# the comment year as an integer in the user-level frame.
comments.reset_index(inplace=True)
user_info = pd.DataFrame(data={'active_state': is_active}, index=comments.index)
comments_user = pd.concat([comments, user_info], axis=1)
comments_user[['year']] = comments_user[['year']].astype('int64')

In [None]:
# Mean sentiment per year for each activity group (default CI: 95%).
fig, ax = plt.subplots()
fig.set_size_inches(16, 12)
sns.barplot(data=comments_user,
            x="year",
            y="pred_label",
            hue="active_state",
            hue_order=['active', 'less active', 'least active'],
            palette="Set2",
            capsize=.1,
            ci=95)
plt.title("sentiment disparity between users(CI: 95%)", fontsize=25)
plt.xlabel('year',fontsize=20)
plt.ylabel('mean sentiment',fontsize=20)
plt.xticks(rotation=60,fontsize=20);
plt.yticks(fontsize=20);
plt.legend(loc = 'upper right',fontsize=20);
plt.savefig('figs/sentiment_disparity_2005_2019.png', dpi=600)

### Sentiment in user life cycle

In [None]:
# Split each user's comments into a first and a second stage.
# Note that this dataframe has been sorted in chronological order, so the
# running count per user follows the order in which the user commented.
time_period = []
user_appearance_dic = {}

for _, comment_row in tqdm(comments.iterrows()):
    md5 = comment_row['user_md5']
    seen_so_far = user_appearance_dic.get(md5, 0) + 1
    user_appearance_dic[md5] = seen_so_far

    # The first half (rounded down) of a user's total comments is stage one.
    in_first_half = seen_so_far <= int(user_dic[md5]/2)
    time_period.append('first stage' if in_first_half else 'second stage')
comments_user['stage'] = time_period

#### Sentiment change by year

In [None]:
# Active users only: mean sentiment per year in each stage (default CI: 95%).
user_mask =(comments_user['active_state'] == 'active')
_active_rows = comments_user[user_mask]

fig, ax = plt.subplots()
fig.set_size_inches(16, 12)
sns.pointplot(data=_active_rows,
              x="year",
              y="pred_label",
              hue="stage",
              hue_order=['first stage', 'second stage'],
              palette="Set2",
              capsize=.1,
              ci=95)
plt.title("Sentiment disparity of active users in different stages(CI: 95%)",fontsize=25)
plt.xlabel('year',fontsize=20)
plt.ylabel('sentiment',fontsize=20)

plt.xticks(rotation=30, fontsize=20);
plt.yticks(fontsize=20);
plt.legend(loc = 'upper right',fontsize=20);
plt.savefig('figs/sentiment_user_difference_stage_change_2005_2019.png', dpi=600)

In [None]:
# Index by comment time so the cells below can group on `.index.year`.
comments_user.set_index('comment_time', inplace=True)

In [None]:
# Yearly sentiment statistics for the first-stage comments of active users.
_is_active_user = (comments_user['active_state'] == 'active')
fisrt_stage_mask = _is_active_user & (comments_user['stage'] == 'first stage')
second_stage_mask = _is_active_user & (comments_user['stage'] == 'second stage')

_first_stage_rows = comments_user[fisrt_stage_mask]
comments_user_count_first = (
    _first_stage_rows['content']
    .groupby(_first_stage_rows.index.year)
    .count()
    .tolist()
)
dic_user_first, _ = get_sentiment_by_year(_first_stage_rows, comments_user_count_first)

_sentiment_stat_rows = ['positive','neutral', 'negative',
                        'positive_portion','neutral_portion', 'negative_portion',
                        'sentiment_mean', 'sentiment_std']
df_user_first = pd.DataFrame(dic_user_first, index=_sentiment_stat_rows)
x = df_user_first.columns

In [None]:
# Yearly sentiment statistics for the second-stage comments of active users.
_second_stage_rows = comments_user[second_stage_mask]
comments_user_count_second = (
    _second_stage_rows['content']
    .groupby(_second_stage_rows.index.year)
    .count()
    .tolist()
)
dic_user_second, _ = get_sentiment_by_year(_second_stage_rows, comments_user_count_second)

_sentiment_stat_rows = ['positive','neutral', 'negative',
                        'positive_portion','neutral_portion', 'negative_portion',
                        'sentiment_mean', 'sentiment_std']
df_user_second = pd.DataFrame(dic_user_second, index=_sentiment_stat_rows)

In [None]:
# Three stacked panels (negative / neutral / positive share), each comparing
# the first and second stage of active users.
fig, axes = plt.subplots(3)
plt.subplots_adjust(hspace=0.25)
fig.set_size_inches(8, 12)

# (row in the stage frames, line colour, panel title), top to bottom.
_panels = [
    ('negative_portion', 'purple', "Proportion of negative sentiment"),
    ('neutral_portion', 'darkgreen', "Proportion of neutral sentiment"),
    ('positive_portion', 'red', "Proportion of positive sentiment"),
]
for _panel_ax, (_row_name, _colour, _panel_title) in zip(axes, _panels):
    _panel_ax.plot(x, df_user_first.loc[_row_name, :],
                   color=_colour, label='first stage', marker='*', alpha=0.3)
    _panel_ax.plot(x, df_user_second.loc[_row_name, :],
                   color=_colour, label='second stage', marker='.')
    _panel_ax.set_title(_panel_title, fontsize=15)
    _panel_ax.set_ylabel('Proportion', fontsize=15)
    _panel_ax.set_xticks(range(2006, 2020, 2))
    _panel_ax.tick_params(labelsize=15)
    _panel_ax.legend(fontsize=12)

plt.savefig('figs/sentiment_proprotion_between_stages_aggregate.png', dpi=600)

In [None]:
# Back up the annotated comments.
# csv.writer is used instead of DataFrame.to_csv() because, according to the
# original author, plain to_csv() yielded wrongly formatted CSV files here.
def write_csv(example_df, file_dir):
    '''
    Write the annotated comments DataFrame to the CSV file at `file_dir`.

    The first output column ('index') holds the row label of each row; the
    remaining columns are copied from `example_df`. The genre and region are
    written under the headers 'movie_genre' / 'movie_region' and are read
    from columns of the same name if present, otherwise from 'genre' /
    'region' (the names used when the columns are created in this notebook).

    Raises KeyError if a required column is missing.
    '''
    # Previously row['movie_genre'] / row['movie_region'] were read
    # unconditionally, which raised KeyError for frames that only carry
    # 'genre' / 'region'.
    genre_col = 'movie_genre' if 'movie_genre' in example_df.columns else 'genre'
    region_col = 'movie_region' if 'movie_region' in example_df.columns else 'region'

    header = ('index',
              'comment_time',
              'comment_id',
              'movie_id',
              'user_md5',
              'rating',
              'content',
              'pred_label',
              'year',
              'movie_rating',
              'movie_release_year',
              'movie_genre',
              'movie_region',
              'week',
              'commented_week')
    source_cols = ['comment_time', 'comment_id', 'movie_id', 'user_md5',
                   'rating', 'content', 'pred_label', 'year', 'movie_rating',
                   'movie_release_year', genre_col, region_col,
                   'week', 'commented_week']

    # newline='' is required by the csv module so that line endings (and
    # newlines embedded in comment text) are written correctly on every
    # platform; the explicit encoding keeps non-ASCII text portable.
    with open(file_dir, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(header)
        for idx, row in tqdm(example_df.iterrows()):
            writer.writerow([idx] + [row[col] for col in source_cols])
            
# reset_index() moves the current index into a regular column before the dump.
comments = comments.reset_index()
file_dir = 'comments_annotated.csv'
write_csv(comments, file_dir)

### Sentiment polarization of all users

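The paper from Nature Scientific Reports also defines a metric to identify sentiment polarization. In the formula below, as implemented in the code that follows, $N_i$ is the total number of comments of user $i$, $k_i$ the number of negative comments and $h_i$ the number of neutral comments.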

$$ 
  \rho_{\sigma}(i) = \frac{(N_i -2k_i - h_i)(N_i - h_i)}{N_i^2}
$$


**OLS**

In [None]:
# user_md5 -> [total comments, positive, neutral, negative].
# The total comes from user_dic; the three tallies are counted from
# comments_user (label 1 = positive, 0 = neutral, anything else = negative).
# NOTE(review): user_dic was built before some comments were dropped, so the
# total may exceed the sum of the three tallies — confirm this is intended.
user_polar_all = {}
for _, comment_row in tqdm(comments_user.iterrows()):
    md5 = comment_row['user_md5']
    if md5 not in user_polar_all:
        user_polar_all[md5] = [user_dic[md5], 0, 0, 0]

    label = comment_row['pred_label']
    slot = 1 if label == 1 else (2 if label == 0 else 3)
    user_polar_all[md5][slot] += 1

In [None]:
# Polarization score per user: (N - 2*neg - neu) * (N - neu) / N**2.
users, total, polar_scores = [], [], []

for md5, (n_total, _n_pos, n_neu, n_neg) in user_polar_all.items():
    users.append(md5)
    total.append(n_total)
    polar_scores.append((n_total - 2*n_neg - n_neu)*(n_total - n_neu)/(n_total ** 2))

dic_for_reg_polar_all = {
    'user_md5': users,
    'number_of_comments': total,
    'user_sentiment_polarization': polar_scores,
}
df_user_polar_all = pd.DataFrame(dic_for_reg_polar_all)

In [None]:
# Scatter of polarization against number of comments with a log-x fit
# and a 95% confidence band, drawn on a logarithmic x axis.
fig, ax = plt.subplots()
fig.set_size_inches(16, 12)
_fit_line_style = {'color':'red',
                   'alpha':0.5}
sns.regplot(data=df_user_polar_all,
            x='number_of_comments',
            y='user_sentiment_polarization',
            logx=True,
            ci=95,
            fit_reg=True,
            scatter=True,
            label=True,
            marker=">",
            line_kws=_fit_line_style)
plt.title("Sentiment polarization as a function of Comments (CI: 95%)",fontsize=25)
plt.xscale('log')

plt.xlabel('number of comments',fontsize=20)
plt.ylabel('user sentiment polarization',fontsize=20)
plt.xticks(fontsize=20);
plt.yticks(fontsize=20);

plt.savefig('figs/regression_of_sentiment_polarization_all.png', dpi=600)

In [None]:
# Polarization regressed on the natural log of the number of comments.
X = np.log(np.array(df_user_polar_all['number_of_comments'])).reshape(-1,1)
df_user_polar_all['log comments'] = X
_log_design = sm.add_constant(df_user_polar_all['log comments'])
est = sm.OLS(endog=df_user_polar_all['user_sentiment_polarization'],
             exog=_log_design).fit()

# Render the fit as a LaTeX table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
_latex_table = stargazer.render_latex()
print(_latex_table)

In [None]:
# backup data
# we don't use to_csv() here is because simply use to_csv() would yield 
# wrongly formatted csv files.
def write_csv(example_df, file_dir):
    '''
    Write the per-user polarization columns of a DataFrame to a CSV file.

    Parameters
    ----------
    example_df : pandas.DataFrame
        Must contain the columns ``user_md5``, ``number_of_comments`` and
        ``user_sentiment_polarization``; any other columns are ignored.
    file_dir : str
        Path of the output file (despite the name, a file path rather than a
        directory). An existing file is overwritten.

    The output starts with a header row, followed by one row per DataFrame
    row; the first field of each row is the DataFrame index.
    '''
    columns = ('user_md5', 'number_of_comments', 'user_sentiment_polarization')
    # newline='' is required for files handed to csv.writer: the writer emits its
    # own "\r\n" terminator, and without this flag text-mode newline translation
    # turns it into "\r\r\n" on Windows (a blank line after every row).
    with open(file_dir, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        # write column names
        writer.writerow(('index',) + columns)
        # itertuples(name=None) yields plain (index, value, ...) tuples, which is
        # exactly the row layout, without building a Series per row as iterrows does.
        for record in tqdm(example_df[list(columns)].itertuples(index=True, name=None)):
            writer.writerow(record)
            
# Persist the per-user polarization table; a relative path, so the file lands in
# the current working directory.
file_dir = 'user_polar_info.csv'
write_csv(example_df=df_user_polar_all, file_dir=file_dir)

**OLS multiple variables**

In [None]:
# Per-user history collected in a single pass over the comments.
# The three needed columns are zipped instead of using DataFrame.iterrows(),
# which builds a Series for every row and is very slow on millions of comments;
# the values visited, and their order, are the same.
user_movie_ratings = {}   # user_md5 -> movie_rating of each of the user's comments, in row order
enter_time = {}           # user_md5 -> commented_week of the first row seen for the user
comment_fields = zip(comments['user_md5'],
                     comments['commented_week'],
                     comments['movie_rating'])
for md5, week, movie_rating in tqdm(comment_fields, total=len(comments)):
    if md5 not in user_movie_ratings:
        user_movie_ratings[md5] = []
        # This is the user's earliest week only if `comments` is in
        # chronological order -- TODO confirm it is still sorted at this point.
        enter_time[md5] = week
    user_movie_ratings[md5].append(movie_rating)

# np.mean does not skip NaN: a user with any missing movie_rating gets a NaN average.
user_avg_movie_rating = {md5: np.mean(ratings)
                         for md5, ratings in user_movie_ratings.items()}

In [None]:
# Collect each user's average movie rating and entry week, following the row
# order of df_user_polar_all.
user_movie_rating = []
user_enter_time = []
for _, md5 in tqdm(df_user_polar_all['user_md5'].items()):
    user_movie_rating.append(user_avg_movie_rating[md5])
    user_enter_time.append(enter_time[md5])

In [None]:
# Attach the per-user lookups as new columns. Both lists were built by iterating
# over df_user_polar_all row by row, so they line up with its rows positionally.
df_user_polar_all['enter_time'] = user_enter_time
df_user_polar_all['avg_movie_rating'] = user_movie_rating

In [None]:
# Two-regressor OLS:
# polarization ~ const + log(number of comments) + average movie rating.
X = np.log(np.asarray(df_user_polar_all['number_of_comments'])).reshape(-1, 1)
df_user_polar_all['log comments'] = X

regressors = df_user_polar_all[['log comments', 'avg_movie_rating']]
est = sm.OLS(endog=df_user_polar_all['user_sentiment_polarization'],
             exog=sm.add_constant(regressors)).fit()

# Print the fitted model as a LaTeX regression table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
latex_table = stargazer.render_latex()
print(latex_table)

In [None]:
# Three-regressor OLS:
# polarization ~ const + log(number of comments) + entry time + average movie rating.
X = np.log(np.asarray(df_user_polar_all['number_of_comments'])).reshape(-1, 1)
df_user_polar_all['log comments'] = X

regressors = df_user_polar_all[['log comments', 'enter_time', 'avg_movie_rating']]
est = sm.OLS(endog=df_user_polar_all['user_sentiment_polarization'],
             exog=sm.add_constant(regressors)).fit()

# Print the fitted model as a LaTeX regression table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
latex_table = stargazer.render_latex()
print(latex_table)

We did not include movie genre, region, or release year in the regression, because previous results showed that they typically do not have a significant influence on comment sentiment, although including them would help improve $R^2$.

**User watch sequence**

In [None]:
# user_md5 -> {movie_id: 1-based position}, numbered in the order the rows of
# comments_user are visited.
user_movie_sequence = {}
user_movie_pairs = zip(comments_user['user_md5'], comments_user['movie_id'])
for md5, movie_id in tqdm(user_movie_pairs):
    seen_movies = user_movie_sequence.setdefault(md5, {})
    # The position is computed before the insert: current number of movies plus one.
    seen_movies[movie_id] = len(seen_movies) + 1

In [None]:
watch_sequence = []
movie_rating_lst = []

# One entry per row of comments_user, in row order.
user_movie_pairs = zip(comments_user['user_md5'], comments_user['movie_id'])
for md5, movie_id in tqdm(user_movie_pairs):
    # Position recorded for this (user, movie) pair in user_movie_sequence.
    watch_sequence.append(user_movie_sequence[md5][movie_id])
    # Rating of the movie itself, looked up by movie_id in df_movie_info's index.
    movie_rating_lst.append(df_movie_info.loc[movie_id]['movie_rating'])

In [None]:
# Two aligned columns: watch position and the rating of the corresponding movie.
dic_for_watch_sequence_rating = dict(watch_sequence=watch_sequence,
                                     movie_rating=movie_rating_lst)
df_watch_sequence_rating = pd.DataFrame(dic_for_watch_sequence_rating)
# True for rows whose movie rating is missing.
nan_rating_mask = df_watch_sequence_rating['movie_rating'].isna()

In [None]:
# Fit of movie rating on log(watch sequence); only the fitted line and its CI
# band are drawn (scatter=False), and rows without a rating are excluded.
fig, ax = plt.subplots(figsize=(12, 8))
rated_rows = df_watch_sequence_rating[~nan_rating_mask]
sns.regplot(data=rated_rows,
            x='watch_sequence',
            y='movie_rating',
            logx=True,
            ci=95,
            fit_reg=True,
            scatter=False,
            label=True,
            marker=">",
            line_kws=dict(color='red', alpha=0.5),
            ax=ax)
ax.set_title("Movie rating ~ watching sequence (CI: 95%)", fontsize=25)
ax.set_xscale('log')
# Fixed y ticks; must be set before the tick font size is applied below.
ax.set_yticks([3.1, 3.2, 3.3, 3.4, 3.5]);

ax.set_xlabel('watching sequence', fontsize=20)
ax.set_ylabel('movie rating', fontsize=20)
plt.xticks(fontsize=20);
plt.yticks(fontsize=20);

plt.savefig('figs/regression_of_movie_rating_watch_sequence_no_scatter.png', dpi=600)

## For the reverse after 2015

### mean sentiment vs. proportion of comments from new users by week

In [None]:
# Index the comments by timestamp so they can be resampled by week.
# Guarded so the cell can be re-run: set_index() moves 'comment_time' out of the
# columns, so an unconditional second pass would raise KeyError.
if 'comment_time' in comments.columns:
    comments['comment_time'] = pd.to_datetime(comments['comment_time'])
    comments = comments.set_index('comment_time')

In [None]:
# Weekly aggregates of the sentiment label: its mean and the number of labelled comments.
sentiment_mean = comments['pred_label'].resample('W').mean()
comments_num = comments['pred_label'].resample('W').count()
sent_comm = pd.DataFrame({'sentiment_mean':sentiment_mean, 'comments_num': comments_num})

new_user_by_week = []                  # distinct first-time users per week
comments_from_new_user_by_week = []    # comments written by those first-time users
comments_from_old_user_by_week = []    # comments written by users seen in an earlier week
user_set = set()                       # every user seen in any earlier week

for week_label, item in tqdm(comments.resample('W')):
    new_user = set()
    comments_from_new_user = 0
    comments_from_old_user = 0
    # Comments in each week. Only the user column is needed, so iterate it
    # directly rather than building a Series per row with iterrows().
    for md5 in item['user_md5']:
        # user_set is only updated after the week is finished, so every comment a
        # first-time user writes during their first week counts as "new".
        if md5 not in user_set:
            new_user.add(md5)
            comments_from_new_user += 1
        else:
            comments_from_old_user += 1
    new_user_by_week.append(len(new_user))
    comments_from_new_user_by_week.append(comments_from_new_user)
    comments_from_old_user_by_week.append(comments_from_old_user)
    # In-place update; union() would copy the whole ever-growing set each week.
    user_set.update(new_user)

In [None]:
# Per-week counts collected by the loop over the weekly groups.
user_comments_week_df = pd.DataFrame({'new users': new_user_by_week,
                                      'new comments': comments_from_new_user_by_week,
                                      'old comments': comments_from_old_user_by_week})

# Fraction of each week's comments written by users first seen in that week
# (NaN for a week with no comments at all: 0 / 0).
new_count = user_comments_week_df['new comments']
old_count = user_comments_week_df['old comments']
new_comments_portion = new_count / (new_count + old_count)

# .values makes the assignment positional: user_comments_week_df has a default
# integer index while sent_comm is indexed by week.
sent_comm['comments_from_new_users_proportion'] = new_comments_portion.values

In [None]:
# Inspect the weekly table: mean sentiment, comment count and new-user comment share.
sent_comm

In [None]:
# Keep weeks in which at most 40% of the comments come from new users.
comm_mask_new = sent_comm['comments_from_new_users_proportion'].le(0.4)
# For plot clarity only: restrict the y range to weeks with mean sentiment in [0, 0.5].
sent_mask = sent_comm['sentiment_mean'].between(0, 0.5)

fig, ax = plt.subplots(figsize=(16, 12))
plotted_weeks = sent_comm[comm_mask_new & sent_mask]
sns.regplot(data=plotted_weeks,
            x='comments_from_new_users_proportion',
            y='sentiment_mean',
            ci=95,
            fit_reg=True,
            scatter=True,
            label=True,
            robust=True,
            marker=">",
            line_kws=dict(color='red', alpha=0.5),
            ax=ax)
ax.set_title("Mean Sentiment vs. proportion of comments from new users (CI: 95%)", fontsize=25)
ax.set_xlabel('proportion of comments from new users by week', fontsize=20)
ax.set_ylabel('mean sentiment by week', fontsize=20)
plt.xticks(fontsize=20);
plt.yticks(fontsize=20);
plt.savefig('figs/regression_of_proportion_of_new_comments_mean_sentiment.png', dpi=600)

In [None]:
# OLS on the weekly table: mean sentiment ~ const + share of comments from new users.
# Only comm_mask_new is applied here; the sentiment-range mask is not.
selected_weeks = sent_comm[comm_mask_new]
design = sm.add_constant(selected_weeks['comments_from_new_users_proportion'])
est = sm.OLS(endog=selected_weeks['sentiment_mean'], exog=design).fit()

# Print the fitted model as a LaTeX regression table.
stargazer = Stargazer([est])
stargazer.show_model_numbers(False)
stargazer.significant_digits(3)
stargazer.show_confidence_intervals(True)
stargazer.show_degrees_of_freedom(False)
latex_table = stargazer.render_latex()
print(latex_table)

In [None]:
fig, ax = plt.subplots(figsize=(12,8))
# NOTE(review): 479 is a positional week offset into sent_comm; the output file
# name suggests it marks the start of 2014 -- confirm.
recent_weeks = sent_comm.iloc[479:]
# Smooth the weekly share with a 4-week rolling mean (the first 3 points are NaN).
smoothed_share = recent_weeks['comments_from_new_users_proportion'].rolling(4).mean()
ax.plot(recent_weeks.index, smoothed_share,
        linewidth=3,
        alpha=0.6)
ax.set_title("Proportion of comments from new users", fontsize=25)
ax.set_ylabel('Proportion', fontsize=25)
ax.set_xlabel('# by week', fontsize=25)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.savefig('figs/proportion_of_new_comments_2014_2019.png', dpi=600)