In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Clean YouTube video data

In [None]:
'''
Load the raw scraped trailer tables, one CSV file per streaming platform.
'''
hbo, netflix, disney, amazon = map(
    pd.read_csv,
    (
        '../../data/raw/trailers/hbo_trailers.csv',
        '../../data/raw/trailers/netflix_trailers.csv',
        '../../data/raw/trailers/disney_trailers.csv',
        '../../data/raw/trailers/amazon_trailers.csv',
    ),
)

In [None]:
'''
We clean all the scraped YouTube trailers and keep only official trailers:

1) lowercase every video title (str)
2) mark whether a video is a trailer, i.e. its title contains the word 'trailer' (bool)
3) build 5 masks and combine them into a final one. A row is kept when:
    3.1) 'is_trailer' is true
    3.2) the title contains the string 'official'
    3.3) the title does not contain the word 'season' followed by a number bigger than 1
    3.4) the title does not contain the word 'teaser'
    3.5) the title does not contain the word 'trailer' directly followed by a number
         bigger than 1 (e.g. 'trailer 2', 'trailer #3'); a year such as 'trailer (2019)'
         does not count as a trailer number
4) apply the final mask, drop the first column and duplicate rows, and save the
   result in new .csv files.
'''

# Raw strings avoid invalid-escape warnings; non-capturing groups avoid the
# pandas "pattern has match groups" warning emitted by str.contains.
LATER_SEASON_PATTERN = r'season\s(?:[2-9]|\d{2})'
# A 1-2 digit number (2-99) right after 'trailer', optionally behind whitespace,
# '#' or '('. The trailing \b keeps years ('2019') and resolutions ('4k') from
# being mistaken for a trailer number.
NUMBERED_TRAILER_PATTERN = r'trailer\s*[#(]?\s*(?:[2-9]|[1-9]\d)\b'

for name, df in zip(['hbo', 'netflix', 'disney', 'amazon'], [hbo, netflix, disney, amazon]):
    # The helper columns are added to the frame in place, so they are also
    # part of the saved output (same as before).
    df['title_lowered'] = df.videoTitle.str.lower()
    titles = df['title_lowered']

    # na=False: a missing title counts as "no match" instead of producing NaN,
    # which would make the `~` negations below raise a TypeError.
    df['is_trailer'] = titles.str.contains('trailer', na=False)

    trailer_mask = df['is_trailer']
    official_mask = titles.str.contains('official', na=False)
    season_mask = ~titles.str.contains(LATER_SEASON_PATTERN, na=False)
    teaser_mask = ~titles.str.contains('teaser', na=False)
    numbered_trailer_mask = ~titles.str.contains(NUMBERED_TRAILER_PATTERN, na=False)
    mask = trailer_mask & official_mask & season_mask & teaser_mask & numbered_trailer_mask

    # Skip the first column of the raw file (presumably a scraped index column
    # -- TODO confirm) and select rows/columns in a single .loc call.
    official_trailers = df.loc[mask, df.columns[1:]].drop_duplicates()
    official_trailers.to_csv(f'../../data/interim/trailers/{name}_trailers.csv', index=False)