# This notebook merges all the different data sources that we have acquired into a single dataset.

In [2]:
import sys
sys.path.append('../../')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns

from src.sentiment import sentiment_vader
from src.imdb_api.imdbscraper import ImdbScraper

from scipy import stats

### Trailers

In [3]:
'''
Build the 'trailers' dataframe from the per-network trailer files.
Each of the 4 network files is read and tagged with a 'network' column holding the network name.
The tagged frames are collected in 'dfs', concatenated, and reduced to the columns listed in 'cols'.
'''
file_list = ['hbo', 'amazon', 'netflix', 'disney']
dfs = [
    pd.read_csv('../../data/interim/trailers/' + network + '.csv').assign(network=network)
    for network in file_list
]

cols = ['channelId', 'network', 'videoId', 'videoTitle', 'publishTime']
trailers = pd.concat(dfs).loc[:, cols]

### Comments

In [17]:
'''
Build the 'comments' dataframe: one raw comment file per network is read,
the frames are concatenated, and only the columns listed in 'cols' are kept.
'''
file_list = ['hbo_comments', 'amazon_comments', 'netflix_comments', 'disney_comments']
dfs = [pd.read_csv('../../data/raw/comments/' + name + '.csv') for name in file_list]

cols = ['videoId', 'commentId', 'textOriginal', 'likeCount', 'publishedAt']
comments = pd.concat(dfs).loc[:, cols]

### Mapping IMDb and YouTube

In [20]:
'''
Build the 'match' dataframe from the per-network '<network>_match.csv' files
(semicolon-separated). Rows containing any missing value are dropped after concatenation.
'''
file_list = ['hbo', 'amazon', 'netflix', 'disney']
dfs = [
    pd.read_csv('../../data/interim/match/' + network + '_match.csv', sep=';')
    for network in file_list
]

match = pd.concat(dfs).dropna()

### IMDb

In [35]:
'''
The data used here is from an external source: datasets downloaded from the IMDb website. https://www.imdb.com/interfaces/
We get the movie/show titles from 'title.basics.tsv' and the ratings from 'title.ratings.tsv'.
--> They are joined on 'tconst' (IMDb's ID for a show/movie)
'''

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns do not end up with mixed value types
# (the cause of pandas' DtypeWarning on large files).
imdb = pd.read_csv('../../data/external/imdb/title.basics.tsv', delimiter='\t', low_memory=False)
ratings = pd.read_csv('../../data/external/imdb/title.ratings.tsv', delimiter='\t', low_memory=False)

# Inner join: only titles that have a ratings row are kept.
imdb = imdb.merge(ratings, on='tconst')

  exec(code_obj, self.user_global_ns, self.user_ns)


### IMDb release dates

In [None]:
'''
Scraper: Initializes the ImdbScraper() class. 
It scrapes the release dates, based on the 'tconst' values it is given.
A timeout of 1 second is implemented, to not get locked out from their website. 
'''

# scraper = ImdbScraper()
# scraper.scrape_dates_alternate(match.tconst, verbose=True, timeout=1)

In [None]:
'''
Creating a dataframe from the scraped data.
The column 'releaseDateUS' is created from the scraped data and converted to a datetime object.
'''

# release_dates = pd.DataFrame(scraper.data)
# release_dates['releaseDateUS'] = pd.to_datetime(release_dates.release_date_us)

In [None]:
# release_dates.to_csv('release_dates.csv', index=False)

In [36]:
# Load the previously saved release dates, keep the ID and US release date,
# and rename both columns to the names used by the rest of the notebook.
cols = ['id', 'release_date_us']
release_dates = (
    pd.read_csv('../../data/interim/release_dates.csv')
    .loc[:, cols]
    .rename(columns={'id':'tconst', 'release_date_us': 'releaseDateUS'})
)

### Annotations

In [7]:
# Read the annotated comments and keep only the comment ID, the sentiment label
# and the annotator who assigned it.
annot = pd.read_csv('../../data/interim/annotated.csv').loc[
    :, ['commentId', 'sentiment', 'annotator']
]

In [38]:
def _most_common_label(labels):
    """Return the most frequent label in `labels`.

    Ties resolve to the first label in sorted order (Series.mode() returns its
    modes sorted). Returns NaN when the group contains no non-missing label.
    """
    modes = labels.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per comment, labelled with the majority vote of its annotators.
# pandas' Series.mode is used instead of scipy.stats.mode, which does not
# accept string labels in current SciPy releases and whose result layout
# (unpacked before with .str[0].str[0]) differs between versions.
annot_agg = (
    annot.groupby('commentId')
    .agg(sentimentLabel=('sentiment', _most_common_label))
    .reset_index()
)

# Numeric encoding of the label; labels outside the map become NaN.
sentiment_map = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
annot_agg['sentimentScore'] = annot_agg.sentimentLabel.map(sentiment_map)

### Return YouTube Dislikes

In [39]:
# Read the Return YouTube Dislikes export and keep the per-video
# like, dislike and view counts.
ryd = pd.read_csv('../../data/raw/returnyoutubedislikes.csv').loc[
    :, ['videoId', 'likes', 'dislikes', 'viewCount']
]

### The big join

In [40]:
# Combine every source into one frame, one row per comment.
# Inner joins (trailers/comments, match, imdb) drop rows without a partner;
# left joins (ryd, release_dates, annot_agg) keep the row and leave the new columns empty.
df = (
    trailers
    .merge(comments, on='videoId')
    .merge(ryd, on='videoId', how='left')
    .merge(match, on='videoId')
    .merge(imdb, on='tconst')
    .merge(release_dates, on='tconst', how='left')
    .merge(annot_agg, on='commentId', how='left')
)

### Calculate comment date offset

In [47]:
# add a comment date offset column
# Parse the comment timestamps, then keep only the calendar-date part.
dt = pd.to_datetime(df['publishedAt'])
comment_date = dt.dt.date

In [48]:
# Convert the US release date column to datetimes so it can be subtracted from.
df['releaseDateUS'] = pd.to_datetime(df['releaseDateUS'])

In [49]:
# Days between the comment date and the US release date
# (negative when the comment was posted before the release).
# .dt.days is used because astype('timedelta64[D]') is rejected by pandas >= 2.0,
# which only supports s/ms/us/ns timedelta resolutions.
# NOTE(review): assumes releaseDateUS carries no time-of-day component, so the
# difference is always a whole number of days - confirm against release_dates.csv.
comment_offset = pd.to_datetime(comment_date) - df.releaseDateUS
# Cast to float so rows without a release date hold NaN.
df['commentDateOffset'] = comment_offset.dt.days.astype('float')

In [52]:
# Save the merged dataset as it stands before the sentiment columns are added.
df.to_csv(path_or_buf='../../data/processed/dataset_no_sentiment.csv', index=False)

### Add sentiment

In [None]:
# Run sentiment_vader on every comment text (cast to str first) and store its raw result.
comment_texts = df['textOriginal'].astype(str)
df['sentimentPredictedRaw'] = comment_texts.apply(sentiment_vader)

In [None]:
# The last element of the raw prediction is mapped to a numeric score
# via sentiment_map; labels outside the map become NaN.
sentiment_map = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
predicted_label = df['sentimentPredictedRaw'].str.get(-1)
df['sentimentPredictedScore'] = predicted_label.map(sentiment_map)

In [None]:
# Write the dataset, including the sentiment columns, to the current working directory.
df.to_csv(path_or_buf='final_data.csv', index=False)