In [1]:
import pandas as pd
import re

In [2]:
# Load the raw Netflix-branded titles file that the rest of the notebook matches against IMDb.
netflix_data = pd.read_csv(
    filepath_or_buffer='../data/raw/netflix_branded_titles_with_release_year.csv',
)

In [3]:
# Split combined_title ("<title>: Season <n>") into a lower-cased title and a season label.
netflix_data = netflix_data.rename(columns={'title_desc': 'combined_title'})

# n=1 splits only on the first ': Season ' so a title that repeats the marker cannot
# yield a third column, and reindex guarantees a season column even when no row has
# the marker. Either case made the original two-column assignment raise.
title_parts = (
    netflix_data['combined_title']
    .str.split(': Season ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
netflix_data['title'] = title_parts[0].str.lower()
netflix_data['season'] = title_parts[1]

# A row with no season part is treated as a movie by the later cells.
netflix_data['is_tv_show'] = netflix_data['season'].notnull()
# NOTE(review): astype('int') raises on missing values — assumes release_year is fully populated.
netflix_data['release_year'] = netflix_data['release_year'].astype('int')

In [12]:
# Load the processed IMDb movie table (first-pass lookup for Netflix movies).
imdb_movie_data = pd.read_csv(
    filepath_or_buffer='../data/processed/imdb_movies.csv',
)

In [148]:
# Load the existing title-to-IMDb matches, keeping only the id and the merge-key columns.
matched_data = (
    pd.read_csv('../data/raw/matched_wIMDB.tsv', sep='\t')
    .loc[:, ['tconst', 'title', 'release_year', 'season']]
)

In [15]:
# Reduce the IMDb movie table to the id plus merge keys, renaming startYear so it
# lines up with the Netflix release_year column.
imdb_clean_movie = (
    imdb_movie_data
    .loc[:, ['tconst', 'title', 'startYear']]
    .rename(columns={'startYear': 'release_year'})
)

In [16]:
# Stack the IMDb movie rows on top of the previously matched rows to form one lookup table.
# matched_data also carries a season column, which is NaN for the IMDb movie rows.
imdb_clean2 = pd.concat(
    objs=[imdb_clean_movie, matched_data],
    axis='index',
)

In [45]:
# Left-join Netflix movies (rows not flagged as TV shows) to the lookup table on
# title + year; the _merge indicator records whether each row found a partner.
movie_merge = pd.merge(
    netflix_data.loc[~netflix_data['is_tv_show']],
    imdb_clean2,
    how='left',
    on=['title', 'release_year'],
    indicator=True,
)

In [65]:
# First-pass movie matches: rows that found an IMDb id, restricted to titles that
# appear exactly once among those rows (i.e. no competing candidates).
movie_hits = movie_merge.loc[
    movie_merge['_merge'].eq('both') & movie_merge['tconst'].notna()
]
good_batch_1 = (
    movie_hits
    .groupby('title', as_index=False)
    .filter(lambda grp: len(grp) == 1)
    .loc[:, ['tconst', 'title', 'release_year']]
)
good_batch_1.to_csv('../data/processed/good_batch_1.csv', index=False)
#824 unique matches on title alone
#Drops to 818 if match on year

In [68]:
# Movies whose title matched more than one IMDb row in the first pass; these need
# manual review.
# This must be derived from movie_merge, not good_batch_1: good_batch_1 only keeps
# titles with exactly one matched row, so filtering it for len > 1 is always empty.
movie_candidates = movie_merge[
    (movie_merge['_merge'] == 'both') & (movie_merge['tconst'].notnull())
]
manual_match_1 = (
    movie_candidates
    .groupby('title', as_index=False)
    .filter(lambda x: len(x) > 1)[['tconst', 'title', 'release_year']]
    .drop_duplicates()
)
manual_match_1.to_csv('../data/processed/manual_match_1.csv', index=False)

In [69]:
# Netflix movies that found no partner at all in the first-pass lookup.
fails = movie_merge.loc[movie_merge['_merge'].eq('left_only')]

In [73]:
# Load the imdb_all table for the second pass, renaming startYear to the shared
# release_year merge key.
imdb_all = (
    pd.read_csv('../data/processed/imdb_all.csv')
    .rename(columns={'startYear': 'release_year'})
)

In [74]:
# Second pass: retry the first-pass failures against imdb_all. Only the merge keys are
# carried over, so the old _merge column does not clash with the new indicator.
movie_merge2 = pd.merge(
    fails.loc[:, ['title', 'release_year']],
    imdb_all,
    how='left',
    on=['title', 'release_year'],
    indicator=True,
)

In [78]:
# Second-pass matches: rows that found an IMDb id in imdb_all, restricted to titles
# with exactly one matched row.
second_pass_hits = movie_merge2.loc[
    movie_merge2['_merge'].eq('both') & movie_merge2['tconst'].notna()
]
good_batch_2 = (
    second_pass_hits
    .groupby('title', as_index=False)
    .filter(lambda grp: len(grp) == 1)
    .loc[:, ['tconst', 'title', 'release_year']]
)
good_batch_2.to_csv('../data/processed/good_batch_2.csv', index=False)

In [79]:
# Second-pass titles that matched more than one imdb_all row; written out for manual review.
second_pass_matched = movie_merge2.loc[
    movie_merge2['_merge'].eq('both') & movie_merge2['tconst'].notna()
]
manual_match_2 = (
    second_pass_matched
    .groupby('title', as_index=False)
    .filter(lambda grp: len(grp) > 1)
    .loc[:, ['tconst', 'title', 'release_year']]
    .drop_duplicates()
)
manual_match_2.to_csv('../data/processed/manual_match_2.csv', index=False)

In [96]:
# Movies that found no partner in either pass; only the merge keys are kept.
manual_match_3 = movie_merge2.loc[
    movie_merge2['_merge'].eq('left_only'),
    ['title', 'release_year'],
]
manual_match_3.to_csv('../data/processed/manual_match_3.csv', index=False)

In [102]:
# Summarise the movie matching: how many Netflix movies are still unmatched after both
# automatic batches, then the size of each manual-review bucket.
# The first figure is (all movies - good_batch_1 - good_batch_2), i.e. the remainder;
# it used to be printed under the misleading label "Correctly Matched".
n_movies = netflix_data[~netflix_data['is_tv_show']].shape[0]
n_auto_matched = good_batch_1.shape[0] + good_batch_2.shape[0]
print(f"Remaining Unmatched: {n_movies - n_auto_matched}\n")
print(f"Incorrectly Matched Duplicates Movies: {len(manual_match_1['title'].unique())}")
print(f"Incorrectly Matched Duplicate Non-movies: {len(manual_match_2['title'].unique())}")
print(f"Incorrectly Matched No Match: {len(manual_match_3['title'].unique())}")

#Remaining 119
#23 in manual_match_1
#6 in manual_match_2
#90 in manual_match_3


Correctly Matched: 119

Incorrectly Matched Duplicates Movies: 23
Incorrectly Matched Duplicate Non-movies: 6
Incorrectly Matched No Match: 90


## Repeat for TV Shows

In [104]:
# Load the processed IMDb TV table, keeping the id plus merge keys and renaming
# startYear to release_year.
imdb_tv_data = (
    pd.read_csv('../data/processed/imdb_tv.csv')
    .loc[:, ['tconst', 'title', 'startYear']]
    .rename(columns={'startYear': 'release_year'})
)

In [110]:
# Left-join Netflix TV rows to IMDb TV on title only. Both sides have release_year,
# so the result carries release_year_x (Netflix) and release_year_y (IMDb).
tv_merge1 = pd.merge(
    netflix_data.loc[netflix_data['is_tv_show']],
    imdb_tv_data,
    how='left',
    on=['title'],
    indicator=True,
)

In [155]:
# TV rows that found at least one IMDb candidate by title, grouped per (title, season).
tv_candidates = tv_merge1[(tv_merge1['_merge'] == 'both') & (tv_merge1['tconst'].notnull())]
tv_candidate_groups = tv_candidates.groupby(['title', 'season'], as_index=False)

# Unambiguous matches: exactly one IMDb candidate for the (title, season) pair.
good_batch_tv_1 = (
    tv_candidate_groups
    .filter(lambda x: len(x) == 1)[['tconst', 'title', 'season', 'release_year_x']]
    .rename(columns={'release_year_x': 'release_year'})
)

# Ambiguous titles are computed here rather than read from manual_match_tv_1: that frame
# is only built in a later cell, which itself needs already_matched, so referencing it
# here raised NameError on a fresh kernel (circular dependency between the two cells).
ambiguous_tv_titles = tv_candidate_groups.filter(lambda x: len(x) > 1)['title'].unique()

# Previously matched rows for those ambiguous titles are accepted as good matches.
# NOTE(review): assumes matched_data titles use the same (lower-cased) form as the
# Netflix titles — confirm against the TSV.
already_matched = matched_data[matched_data['title'].isin(ambiguous_tv_titles)]
good_batch_tv_1 = pd.concat([good_batch_tv_1, already_matched], axis=0)
good_batch_tv_1.to_csv('../data/processed/good_batch_tv_1.csv', index=False)
#847 unique matches

In [167]:
# TV rows with an IMDb candidate whose title is not already covered by already_matched,
# restricted to (title, season) pairs that have more than one candidate.
tv_needs_review = (
    tv_merge1['_merge'].eq('both')
    & tv_merge1['tconst'].notna()
    & ~tv_merge1['title'].isin(already_matched['title'])
)
manual_match_tv_1 = (
    tv_merge1.loc[tv_needs_review]
    .groupby(['title', 'season'], as_index=False)
    .filter(lambda grp: len(grp) > 1)
)

# Export the review list, using the Netflix-side year as release_year.
(
    manual_match_tv_1
    .loc[:, ['tconst', 'title', 'season', 'release_year_x']]
    .rename(columns={'release_year_x': 'release_year'})
    .to_csv('../data/processed/manual_match_tv_1.csv', index=False)
)

In [173]:
# Ambiguous TV rows where the Netflix release year equals the IMDb start year.
same_year = manual_match_tv_1['release_year_x'].eq(manual_match_tv_1['release_year_y'])
manual_match_tv_season_1 = manual_match_tv_1.loc[same_year]

# Every ambiguous row (any season) belonging to a title that had such a year match.
year_matched_titles = manual_match_tv_season_1['title'].unique()
otherseasons = manual_match_tv_1.loc[manual_match_tv_1['title'].isin(year_matched_titles)]

# Count the distinct Netflix combined titles covered.
len(otherseasons['combined_title'].drop_duplicates())
# 98 from using 1st season year match

98