In [2]:
import sys
sys.path.append('..')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns

from sentiment import sentiment_vader
from imdb_api.imdbscraper import ImdbScraper

from scipy import stats

### Trailers

In [3]:
# Read every network's trailer listing and tag its rows with the network name
# (the file stem doubles as the network label).
file_list = ['hbo', 'amazon', 'netflix', 'disney']
dfs = [
    pd.read_csv('../data/' + network + '.csv').assign(network=network)
    for network in file_list
]

# Stack the per-network frames and keep only the trailer columns used below.
# The concatenated frame keeps each file's own row index (no re-numbering).
cols = ['channelId', 'network', 'videoId', 'videoTitle', 'publishTime']
trailers = pd.concat(dfs).loc[:, cols]

### Comments and sentiment

In [4]:
# Gather the per-network comment dumps into one frame.
file_list = ['hbo_comments', 'amazon_comments', 'netflix_comments', 'disney_comments']
dfs = [pd.read_csv('../comments/' + stem + '.csv') for stem in file_list]

# Only these comment fields are carried forward into the joined dataset.
cols = ['videoId', 'commentId', 'textOriginal', 'likeCount', 'publishedAt']
comments = pd.concat(dfs)[cols]

### Mapping IMDb and YouTube

In [5]:
# Load the semicolon-separated match tables, one per network; the later joins
# use their 'videoId' and 'tconst' columns.
file_list = ['hbo', 'amazon', 'netflix', 'disney']
dfs = [
    pd.read_csv('../data/match/' + network + '_match.csv', sep=';')
    for network in file_list
]

# dropna() discards every row that has a missing value in any column
# (presumably trailers without an IMDb match; confirm against the match files).
match = pd.concat(dfs).dropna()

### IMDb

In [6]:
# Load the IMDb title metadata and ratings dumps and join them on the title id.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. With the default chunked inference a column can
# end up holding a mix of parsed numbers and raw strings, which pandas reports
# as a DtypeWarning and which makes later comparisons on that column unreliable.
# NOTE(review): the IMDb dumps are documented to use '\N' as their missing-value
# marker; if numeric columns are needed downstream, consider na_values='\\N'.
imdb = pd.read_csv('../imdb/title.basics.tsv', delimiter='\t', low_memory=False)
ratings = pd.read_csv('../imdb/title.ratings.tsv', delimiter='\t', low_memory=False)

# Inner join: titles without a ratings row are dropped.
imdb = imdb.merge(ratings, on='tconst')

  exec(code_obj, self.user_global_ns, self.user_ns)


### IMDb release dates

In [8]:
# Disabled scrape of IMDb release dates for every matched title. The notebook
# loads release dates from a CSV further down instead of re-running this.
# scraper = ImdbScraper()
# scraper.scrape_dates(match.tconst, verbose=True, timeout=1)



  soup = BeautifulSoup(response.text)


tt11609976
['26 November 2020']
tt11188206
['7 October 2021']
tt11540284
['24 November 2021']
tt11015214
['20 August 2020']
tt1924245
['17 September 2021']
tt1160419
['7 October 2021']
tt11188392
['9 July 2020']
tt13249596
['16 June 2022']
tt5034838
['31 March 2021']
tt10653784
['8 July 2021']
tt7658402
['29 November 2019']
tt16116174
['1 January 2022']
tt11210146
['3 December 2020']
tt11198330
['21 August 2022']
tt10846464
['19 November 2020']
tt15141288
['25 August 2021']
tt9140342
['18 February 2021']
tt8594276
['14 June 2019']
tt10975574
['31 March 2022']
tt14128670
['10 February 2022']
tt9620288
['2 September 2021']
tt13061914
['14 January 2021']
tt13622290
['24 November 2022']
tt7808566
['17 March 2021']
tt11947418
['17 March 2022']
tt12585076
['31 March 2022']
tt11525644
['18 June 2021']
tt11394650
['25 January 2020']
tt11000902
['3 March 2022']
tt13146488
['13 January 2022']
tt10569810
tt13075042
['28 July 2022']
tt9170108
['3 September 2020']
tt3272066
['17 August 2021']
tt227

tt21379574
['24 August 2022']
tt15494864
['27 January 2022']
tt14315756
['2 December 2021']
tt10795658
['10 December 2020']
tt20560404
['8 June 2022']
tt11126994
['6 November 2021']
tt19637852
['8 June 2022']
tt14278524
['16 September 2022']
tt21867596
['14 September 2022']
tt11464826
['26 January 2020']
tt21811526
['6 September 2022']
tt21839470
['7 September 2022']
tt22227040
['21 October 2022']
tt21031054
['9 December 2022']
tt14817272
['3 March 2022']
tt15083184
['31 August 2022']
tt11278476
['17 March 2022']
tt20449034
tt14589904
['15 July 2022']
tt14992922
['2 February 2022']
tt4729430
['8 November 2019']
tt1536537
['18 August 2017']
tt14664414
['1 July 2021']
tt9421570
['24 September 2021']
tt13056008
['29 September 2021']
tt22988228
['16 November 2022']
tt9196192
['23 January 2020']
tt11897478
['19 October 2022']
tt14715170
['10 November 2022']
tt14300912
['3 September 2021']
tt22771372
['10 November 2022']
tt10731768
['20 August 2021']
tt12312250
['27 May 2020']
tt7985576
['2 

In [10]:
# Disabled: builds a frame from the scraper's collected data and parses the
# 'release_date_us' strings into a datetime column. Needs a live `scraper`
# object, so it only works right after the (also disabled) scrape has run.
# release_dates = pd.DataFrame(scraper.data)
# release_dates['releaseDateUS'] = pd.to_datetime(release_dates.release_date_us)

In [11]:
# Disabled: writes the scraped release dates to 'release_dates.csv' in the
# current working directory.
# NOTE(review): the following cell reads '../data/release_dates/release_dates.csv',
# not this path; presumably the file was moved by hand, so confirm.
# release_dates.to_csv('release_dates.csv', index=False)

In [12]:
# Load the cached release dates, keep only the title id and the US release
# date, and rename the id column to 'tconst' so it lines up with the join key
# used for the IMDb tables.
cols = ['id', 'release_date_us']
release_dates = (
    pd.read_csv('../data/release_dates/release_dates.csv')[cols]
    .rename(columns={'id': 'tconst'})
)

### Annotations

In [13]:
# Manually annotated comments: keep the comment id, the assigned sentiment
# label and who assigned it.
annot = pd.read_csv('../comments/annotated.csv').loc[
    :, ['commentId', 'sentiment', 'annotator']
]

In [14]:
def _majority_label(labels):
    """Return the most frequent label in ``labels``.

    Uses pandas' own ``Series.mode`` rather than ``scipy.stats.mode``:
    SciPy deprecated non-numeric input to ``stats.mode`` in 1.9 and later
    releases reject it, and the shape of its result changed along the way,
    so unpacking it with ``.str[0].str[0]`` is version-dependent.

    Ties resolve to the first mode pandas returns (modes come back sorted).
    A group with no non-missing labels yields NaN instead of raising.
    """
    modes = labels.mode()
    return modes.iat[0] if not modes.empty else np.nan


# Collapse the annotations to a single majority label per comment.
annot_agg = (
    annot.groupby('commentId')
    .agg(sentimentLabel=('sentiment', _majority_label))
    .reset_index()
)

# Numeric encoding of the label; labels missing from the map become NaN.
sentiment_map = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
annot_agg['sentimentScore'] = annot_agg.sentimentLabel.map(sentiment_map)

### The big join

In [15]:
# Assemble the comment-level dataset in one chain.
# The first three merges are inner joins, so a trailer survives only if it has
# comments, a match row and an IMDb entry. The last two are left joins, so rows
# without a release date or a manual annotation are kept with missing values.
df = (
    trailers
    .merge(comments, on='videoId')
    .merge(match, on='videoId')
    .merge(imdb, on='tconst')
    .merge(release_dates, on='tconst', how='left')
    .merge(annot_agg, on='commentId', how='left')
)

In [16]:
# Checkpoint the joined data before the predicted-sentiment columns are added;
# the row index is not written.
df.to_csv(path_or_buf='dataset_no_sentiment.csv', index=False)

### Add sentiment

In [None]:
# Score every comment with the project's VADER wrapper. The text is cast to
# str first, so non-string cells (e.g. missing values) reach the scorer as
# their string form rather than as a float.
comment_text = df['textOriginal'].astype(str)
df['sentimentPredictedRaw'] = comment_text.apply(sentiment_vader)

In [33]:
# Turn the raw prediction into a numeric score: take the last element of each
# raw result (presumably the label; confirm against sentiment_vader) and look
# it up in the label-to-score table. Values not in the table become NaN.
sentiment_map = dict(Positive=1, Neutral=0, Negative=-1)
predicted_label = df['sentimentPredictedRaw'].str.get(-1)
df['sentimentPredictedScore'] = predicted_label.map(sentiment_map)

In [38]:
# Write the finished dataset, including the predicted-sentiment columns;
# the row index is not written.
df.to_csv(path_or_buf='final_data.csv', index=False)