In [1]:
import sys
sys.path.append('..')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns

from sentiment import sentiment_vader
from imdb_api.imdbscraper import ImdbScraper

from scipy import stats

### Trailers

In [2]:
# Trailer metadata: one CSV per entry below, read from ../data/.
networks = ['hbo', 'amazon', 'netflix', 'disney']
trailer_columns = ['channelId', 'network', 'videoId', 'videoTitle', 'publishTime']

# The file stem doubles as the value of the `network` column, so each frame is
# tagged before the files are stacked and trimmed to the columns listed above.
trailers = pd.concat(
    [
        pd.read_csv('../data/' + network + '.csv').assign(network=network)
        for network in networks
    ]
)[trailer_columns]

### Comments and sentiment

In [4]:
# Comment dumps: one CSV per entry below, read from ../comments/.
comment_files = ['hbo_comments', 'amazon_comments', 'netflix_comments', 'disney_comments']
comment_columns = ['videoId', 'commentId', 'textOriginal', 'likeCount', 'publishedAt']

# Stack all files, then keep only the columns listed above.
comments = pd.concat(
    [pd.read_csv('../comments/' + stem + '.csv') for stem in comment_files]
)[comment_columns]

### Mapping IMDb and YouTube

In [9]:
# The match files are semicolon-delimited, one '<name>_match.csv' per entry.
match_sources = ['hbo', 'amazon', 'netflix', 'disney']

# Stack the files and drop every row that has a missing value in any column.
match = pd.concat(
    [
        pd.read_csv('../data/match/' + source + '_match.csv', delimiter=';')
        for source in match_sources
    ]
).dropna()

### IMDb

In [11]:
# Load the IMDb ratings table; it is joined onto the title table below.
ratings = pd.read_csv('../imdb/title.ratings.tsv', delimiter='\t')

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a warning
# raised during the read -- most likely the mixed-type DtypeWarning (TODO
# confirm) -- which this option addresses.
# NOTE(review): IMDb dumps reportedly mark missing values with the literal
# string '\N'; those are left as-is here so downstream values do not change.
imdb = pd.read_csv(
    '../imdb/title.basics.tsv', delimiter='\t', low_memory=False
).merge(ratings, on='tconst')  # inner join: only titles with a ratings row survive

  exec(code_obj, self.user_global_ns, self.user_ns)


### IMDb release dates

In [None]:
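# One-off scrape, kept for reference only. The cell that reads
# ../data/release_dates/release_dates.csv presumably loads a saved copy of its
# output (TODO confirm), so this does not need to be re-run.
# scraper = ImdbScraper()
# scraper.scrape_dates(match.tconst, verbose=True, timeout=1)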

In [None]:
# release_dates = pd.DataFrame(scraper.data)
# release_dates['dt'] = pd.to_datetime(release_dates.release_date_us)

In [12]:
# Load the release-date CSV, keep the id and US release date, and rename the
# columns so that `tconst` can serve as the join key.
release_dates = (
    pd.read_csv('../data/release_dates/release_dates.csv')[['id', 'release_date_us']]
    .rename(columns={'id': 'tconst', 'release_date_us': 'releaseDateUS'})
)

### Annotations

In [14]:
# Manually annotated comments; only the comment id, the label and the
# annotator column are kept.
annot = pd.read_csv('../comments/annotated.csv').loc[
    :, ['commentId', 'sentiment', 'annotator']
]

In [17]:
def _majority_label(labels):
    """Return the most frequent value in ``labels``.

    Replaces ``scipy.stats.mode``, which no longer accepts non-numeric
    (string) input in current SciPy releases and whose result shape changed
    across versions. ``Series.mode`` returns the modes sorted, so taking the
    first keeps the same tie-break (smallest value wins). Missing values are
    ignored; a group with no labels at all yields NaN.
    """
    modes = labels.mode()
    return modes.iat[0] if not modes.empty else np.nan


# One row per comment: the majority label across all annotations of that comment.
annot_agg = (
    annot.groupby('commentId')
    .agg(sentimentLabel=('sentiment', _majority_label))
    .reset_index()
)

# Numeric encoding of the label; labels outside the map become NaN.
sentiment_map = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
annot_agg['sentimentScore'] = annot_agg.sentimentLabel.map(sentiment_map)

### The big join

In [35]:
# Build the comment-level table in one chain. The first three joins are inner
# joins (pandas' default), so a row must match in each; release dates and
# annotations are left-joined and may therefore be missing.
df = (
    trailers
    .merge(comments, on='videoId')
    .merge(match, on='videoId')
    .merge(imdb, on='tconst')
    .merge(release_dates, on='tconst', how='left')
    .merge(annot_agg, on='commentId', how='left')
)

In [29]:
# Write the current `df` to disk without the index column.
df.to_csv(path_or_buf='dataset_no_sentiment.csv', index=False)

### Add sentiment

In [30]:
# Run sentiment_vader on every comment text.
# NOTE(review): astype(str) turns missing comments into the literal string
# 'nan', which is then scored like any other text -- confirm this is intended.
df['sentimentPredictedRaw'] = df['textOriginal'].astype(str).apply(sentiment_vader)

In [33]:
# Same label-to-score encoding as used for the manual annotations.
sentiment_map = dict(Positive=1, Neutral=0, Negative=-1)

# The last element of each raw prediction is looked up in the map
# (presumably the predicted label -- verify against sentiment_vader).
df['sentimentPredictedScore'] = (
    df['sentimentPredictedRaw'].str.get(-1).map(sentiment_map)
)

In [38]:
# Write the table, including the predicted-sentiment columns, without the index column.
df.to_csv(path_or_buf='final_data.csv', index=False)