In [1]:
import pandas as pd
import re

In [2]:
netflix_data = pd.read_csv('../data/raw/netflix_branded_titles_with_release_year.csv')

In [4]:
# Split combined_title into a title and an optional season.
netflix_data = netflix_data.rename(columns={'title_desc': 'combined_title'})
# n=1 caps the split at two pieces, so a title that contains ': Season '
# more than once cannot produce extra columns and break the two-column
# assignment below.
netflix_data[['title', 'season']] = netflix_data['combined_title'].str.split(': Season ', n=1, expand=True)
netflix_data['title'] = netflix_data['title'].str.lower()
# Rows without a ': Season ' separator get a null season and are flagged as non-TV.
netflix_data['is_tv_show'] = netflix_data['season'].notnull()
netflix_data['release_year'] = netflix_data['release_year'].astype('int')

In [134]:
# Regex alternative for deriving title and season from combined_title.
# Each pattern has exactly one capture group and expand=False, so str.extract
# returns a Series and every assignment sets a single column.
netflix_data['title'] = netflix_data['combined_title'].str.extract(r'(.+?)(?:$|: Season \d+)', expand=False)
netflix_data['title'] = netflix_data['title'].str.lower()
# season is the captured digit string; it is null when there is no ': Season N' part.
netflix_data['season'] = netflix_data['combined_title'].str.extract(r': Season (\d+)', expand=False)

In [2]:
def clean_title(title):
    """Normalise a Series of titles for exact-match joins.

    Parameters
    ----------
    title : pandas.Series
        String titles; missing values pass through unchanged.

    Returns
    -------
    pandas.Series
        Titles with every character that is neither a word character nor
        whitespace removed, surrounding whitespace stripped, and all
        letters lower-cased. Interior whitespace is left as-is.
    """
    return (
        title.str.replace(r'[^\w\s]', '', regex=True)  # drop punctuation/symbols
        .str.strip()                                    # trim both ends
        .str.lower()                                    # case-fold
    )

In [145]:
#apply clean title
netflix_data['title'] = clean_title(netflix_data['title'])

In [146]:
netflix_data[netflix_data['title'].str.contains('journeys')]

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show
1336,Pokémon Journeys: The Series: Part 2,2020,20200911,True,pokémon journeys the series part 2,,False
1337,Pokémon Master Journeys: The Series: Part 1,2021,20210910,True,pokémon master journeys the series part 1,,False
1338,Pokémon Master Journeys: The Series: Part 3,2022,20220526,True,pokémon master journeys the series part 3,,False


In [2]:
yendata = pd.read_csv("../data/interim/movies_matched.csv")

In [124]:
# Hand-entered mapping from a lower-cased title string to an integer
# (apparently the season/part number for titles that do not follow the
# ': Season N' naming pattern — TODO confirm).
# NOTE(review): these keys keep punctuation such as ':' and a leading
# zero-width space, while clean_title() strips such characters — presumably the
# lookup is meant to run against titles before cleaning; verify where this is used.
manual_tv_shows = {'pokémon journeys: the series: part 2': 2,
    'pokémon master journeys: the series: part 1': 1,
    'pokémon master journeys: the series: part 3': 3,
    'pretend it’s a city: limited series': 1,
    "rilakkuma's theme park adventure: シーズン１": 1,
    '\u200bsaint seiya: knights of the zodiac: part ii': 2,
    'tiger & bunny 2': 2}

In [140]:
netflix_data[netflix_data['title'].str.contains('pretend it')]

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show
1348,Pretend It’s a City: Limited Series,2021,20210108,False,pretend it’s a city: limited series,,False


In [3]:
netflix_data = pd.read_csv("../data/interim/netflix_branded_titles_test.csv")

In [4]:
manual = pd.read_csv("../data/interim/multimatch_manual.csv")

In [5]:
movies_and_season_1 = netflix_data[(netflix_data['is_tv_show'] == False) | (netflix_data['season'] == 1)]
tv_shows_and_high_seasons = netflix_data[(netflix_data['is_tv_show'] == True) & (netflix_data['season'] > 1)]

In [6]:
movies_only = netflix_data[(netflix_data['is_tv_show'] == False)]
tv_shows_only = netflix_data[(netflix_data['is_tv_show'] == True)]

In [16]:
imdb_all = pd.read_csv("../data/processed/imdb_all.csv").rename(columns={'startYear':'release_year'})
imdb_all['title'] = clean_title(imdb_all['title'])

In [20]:
# Left-join the Netflix movies to IMDb on cleaned title and release year,
# excluding IMDb rows whose titleType is a series, video, video game or pilot.
movie_merge = movies_only.merge(
    imdb_all[~imdb_all['titleType'].isin(['tvSeries', 'video', 'videoGame', 'tvPilot', 'tvMiniSeries'])],
    how='left',
    on=['title', 'release_year'],
    indicator=True,
)

In [17]:
import numpy as np

In [21]:
# Group the rows that found an IMDb candidate by Netflix title so they can be
# partitioned by candidate count.
_movie_hits = movie_merge.loc[movie_merge['_merge'] == 'both'].groupby('combined_title')
# Exactly one candidate per title.
movie_good = _movie_hits.filter(lambda grp: len(grp) == 1)
# Several candidates per title.
movie_multi = _movie_hits.filter(lambda grp: len(grp) > 1)['combined_title'].unique()
# No candidate at all.
movie_unmatched = movie_merge.loc[movie_merge['_merge'] == 'left_only', 'combined_title'].unique()
# Copy the rows of the manual list covering every multi-match or unmatched title.
_movie_needs_manual = np.append(movie_multi, movie_unmatched)
manual[manual['combined_title'].isin(_movie_needs_manual)].to_clipboard(index=False)

In [22]:
tv_show_merge = pd.merge(tv_shows_only, imdb_all[imdb_all['titleType'].isin(['tvSeries', 'tvMiniSeries'])], on=['title'], how='left', indicator=True)

In [23]:
# Group the rows that found an IMDb candidate by Netflix title so they can be
# partitioned by candidate count.
_tv_show_hits = tv_show_merge.loc[tv_show_merge['_merge'] == 'both'].groupby('combined_title')
# Exactly one candidate per title.
tv_show_good = _tv_show_hits.filter(lambda grp: len(grp) == 1)
# Several candidates per title.
tv_show_multi = _tv_show_hits.filter(lambda grp: len(grp) > 1)['combined_title'].unique()
# No candidate at all.
tv_show_unmatched = tv_show_merge.loc[tv_show_merge['_merge'] == 'left_only', 'combined_title'].unique()
# Copy the rows of the manual list covering every multi-match or unmatched title.
_tv_show_needs_manual = np.append(tv_show_multi, tv_show_unmatched)
manual[manual['combined_title'].isin(_tv_show_needs_manual)].to_clipboard(index=False)

In [24]:
automated_tconsts = pd.concat([movie_good[['combined_title', 'tconst']], tv_show_good[['combined_title', 'tconst']]], axis=0)
manual_tconsts = pd.read_csv('../data/interim/manual.csv')

tconst_joiner = pd.concat([automated_tconsts, manual_tconsts], axis=0)

In [35]:
pd.merge(netflix_data, tconst_joiner, on=['combined_title'], how='left', indicator=True).to_csv('../data/final_data/joseph_check.csv', index=False)

In [36]:
imdb_all[imdb_all['tconst']=='tt13666778']

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,release_year,endYear,runtimeMinutes,genres,title


In [14]:
imdb_all[imdb_all['tconst']=='tt19187166']

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


In [13]:
movies_and_season_1_merge = pd.merge(movies_and_season_1, imdb_all, on=['release_year', 'title'], how='left', indicator=True)

In [59]:
#1256 everything is good
good_merges_1 = movies_and_season_1_merge[movies_and_season_1_merge['_merge'] == 'both'].groupby('combined_title').filter(lambda x: len(x) == 1)
#good_merges.to_csv("../data/interim/1256_good_matches.csv")
#good_merges_1[(~good_merges_1['is_tv_show'])]['titleType'].value_counts()

In [15]:
movies_and_season_1_merge[movies_and_season_1_merge['_merge'] == 'both'].groupby('combined_title').filter(lambda x: len(x) > 1)['combined_title'].unique()


array(['Alba: Season 1', 'All Together Now', 'Alone', 'Animal: Season 1',
       'As the Crow Flies: Season 1', 'Ava', 'Away: Season 1',
       'Baby Fever: Season 1', 'Bad Trip', 'Beats', 'Becoming',
       'Brave New World: Season 1', 'Break Point: Season 1', 'Bubble',
       'Canvas', 'Carter', 'Catching Killers: Season 1', 'Cobalt Blue',
       'Connected: Season 1', 'Cops and Robbers', 'Crush',
       'Cursed: Season 1', 'Cuties', 'DNA', 'Dangerous Liaisons',
       'Dark Desire: Season 1', 'Darlings', 'Dealer: Season 1', 'Deep',
       'Disclosure', 'Dracula: Season 1', 'Eden: Season 1',
       'Equinox: Season 1', 'Ethos: Season 1',
       'Everything Will Be Fine: Season 1', 'Extraction', 'Fatherhood',
       'Feel Good: Season 1', 'Fever Dream', 'Fierce',
       'Followers: Season 1', 'Forgive Us Our Trespasses', 'Found',
       'Godspeed', 'Guilty', 'HOMUNCULUS', 'Heist: Season 1',
       'Hollywood: Season 1', 'Horse Girl', 'Incantation',
       'Intimacy: Season 1', 'Into t

In [60]:
movies_and_season_1_merge[movies_and_season_1_merge['_merge'] == 'both'].groupby('combined_title').filter(lambda x: len(x) > 1)['combined_title'].isin(manual['combined_title']).all()

True

In [61]:
bad_merges_1 = movies_and_season_1_merge[movies_and_season_1_merge['_merge'] != 'both']



In [63]:
bad_merges_1['combined_title'].isin(manual['combined_title']).all()

True

In [66]:
tv_shows_and_high_seasons_merge = pd.merge(tv_shows_and_high_seasons, imdb_all, on=['title'], how='left', indicator=True)

In [70]:
good2 = tv_shows_and_high_seasons[tv_shows_and_high_seasons['combined_title'].isin(manual['combined_title'])]

In [73]:
high_seasons_merge = tv_shows_and_high_seasons[~tv_shows_and_high_seasons['combined_title'].isin(manual['combined_title'])].merge(imdb_all, on=['title'], how='left', indicator=True)

In [75]:
only1match_high_seasons = high_seasons_merge.groupby('combined_title').filter(lambda x: len(x) == 1)

In [81]:
multiple_matches_high_seasons = high_seasons_merge.groupby('combined_title').filter(lambda x: len(x) > 1)

multiple_matches_high_seasons[multiple_matches_high_seasons['title'].isin(good_merges_1['title'])]

Unnamed: 0,combined_title,release_year_x,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,release_year_y,endYear,runtimeMinutes,genres,_merge
41,Alien TV: Season 2,2021,20210319,False,alien tv,2.0,True,tt10910330,short,Alien TV,Alien TV,0.0,2019.0,\N,9,"Animation,Short",both
42,Alien TV: Season 2,2021,20210319,False,alien tv,2.0,True,tt12765530,tvSeries,Alien TV,Alien TV,0.0,2020.0,2021,\N,"Animation,Comedy,Family",both
102,Blood & Water: Season 2,2021,20210924,True,blood water,2.0,True,tt14810192,short,Blood & Water,Blood & Water,0.0,2021.0,\N,\N,"Drama,Short",both
103,Blood & Water: Season 2,2021,20210924,True,blood water,2.0,True,tt9839146,tvSeries,Blood & Water,Blood & Water,0.0,2020.0,\N,53,"Drama,Mystery",both
159,Control Z: Season 2,2021,20210804,True,control z,2.0,True,tt11937662,tvSeries,Control Z,Control Z,0.0,2020.0,2022,37,"Drama,Thriller",both
160,Control Z: Season 2,2021,20210804,True,control z,2.0,True,tt3954682,short,Control Z,Control Z,0.0,2015.0,\N,15,"Animation,Drama,Romance",both
161,Control Z: Season 3,2022,20220706,True,control z,3.0,True,tt11937662,tvSeries,Control Z,Control Z,0.0,2020.0,2022,37,"Drama,Thriller",both
162,Control Z: Season 3,2022,20220706,True,control z,3.0,True,tt3954682,short,Control Z,Control Z,0.0,2015.0,\N,15,"Animation,Drama,Romance",both
255,Locke & Key: Season 2,2021,20211022,True,locke key,2.0,True,tt11577050,tvMovie,Locke & Key,Locke & Key,0.0,2017.0,\N,\N,Fantasy,both
256,Locke & Key: Season 2,2021,20211022,True,locke key,2.0,True,tt3007572,tvSeries,Locke & Key,Locke & Key,0.0,2020.0,2022,48,"Drama,Fantasy,Horror",both


In [119]:
good_merges_1[good_merges_1['title'].str.contains("rupaul")]

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,endYear,runtimeMinutes,genres,_merge
1334,RuPaul's Secret Celebrity Drag Race: Season 1,2020,20200424,False,rupauls secret celebrity drag race,1.0,True,tt11187480,tvSeries,RuPaul's Secret Celebrity Drag Race,RuPaul's Secret Celebrity Drag Race,0.0,\N,63,"Game-Show,Reality-TV",both


In [112]:
# Restrict the multi-match candidates to TV-typed IMDb rows and keep the titles
# left with exactly one candidate. 'tvMiniSeries' is the titleType spelling that
# appears in the IMDb data; the spelling 'miniTVSeries' matches no rows.
TV_only_1 = multiple_matches_high_seasons[multiple_matches_high_seasons['titleType'].isin(['tvSeries', 'tvMiniSeries'])].groupby('combined_title').filter(lambda x: len(x) == 1)

In [117]:
# Titles that still have more than one TV-typed IMDb candidate.
# 'tvMiniSeries' is the titleType spelling that appears in the IMDb data.
multiple_matches_high_seasons[multiple_matches_high_seasons['titleType'].isin(['tvSeries', 'tvMiniSeries'])].groupby('combined_title').filter(lambda x: len(x) > 1)

Unnamed: 0,combined_title,release_year_x,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,release_year_y,endYear,runtimeMinutes,genres,_merge
133,Can You Hear Me?: Season 2,2020,20200113,False,can you hear me,2.0,True,tt15228538,tvSeries,Can You Hear Me?,Can You Hear Me?,0.0,2020.0,\N,20,"Drama,Thriller",both
142,Can You Hear Me?: Season 2,2020,20200113,False,can you hear me,2.0,True,tt9747016,tvSeries,Can You Hear Me,M'entends-tu?,0.0,2018.0,2021,30,"Comedy,Drama",both
362,RuPaul's Secret Celebrity Drag Race: Season 2,2022,20220812,False,rupauls secret celebrity drag race,2.0,True,tt11187480,tvSeries,RuPaul's Secret Celebrity Drag Race,RuPaul's Secret Celebrity Drag Race,0.0,2020.0,\N,63,"Game-Show,Reality-TV",both
363,RuPaul's Secret Celebrity Drag Race: Season 2,2022,20220812,False,rupauls secret celebrity drag race,2.0,True,tt21342754,tvSeries,RuPaul's Secret Celebrity Drag Race,RuPaul's Secret Celebrity Drag Race,0.0,2022.0,\N,\N,Family,both
415,The Neighbor: Season 2,2021,20210521,False,the neighbor,2.0,True,tt21810202,tvSeries,The Neighbor,Ham Sayeh,0.0,2021.0,2021,\N,Drama,both
423,The Neighbor: Season 2,2021,20210521,False,the neighbor,2.0,True,tt9731222,tvSeries,The Neighbor,El vecino,0.0,2019.0,2021,30,"Action,Comedy",both


In [28]:
dupes1[~dupes1['combined_title']

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,endYear,runtimeMinutes,genres,_merge


In [98]:
movies_matched = pd.read_csv("../data/interim/movies_matched.csv")
tv_matched = pd.read_csv("../data/interim/tv_matched.csv")
final_match = pd.read_csv("../data/final_data/netflix_final_data.csv")

In [99]:
pd.merge(movies_matched, final_match, on=['title'])[['combined_title', 'tconst_y']].to_csv("../data/interim/multimatch_manual2.csv", index=False)
pd.merge(tv_matched, final_match, on=['title'])[['combined_title', 'tconst_y']].to_csv("../data/interim/multimatch_manual3.csv", index=False)

In [101]:
good_merges_1.titleType.value_counts()

tvSeries        519
movie           512
tvSpecial       116
tvMiniSeries     50
short            39
tvMovie          17
tvShort           2
video             1
Name: titleType, dtype: int64

In [110]:
good_merges_1[(good_merges_1['titleType']=='tvMiniSeries') & (good_merges_1['is_tv_show'] == False)]

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,endYear,runtimeMinutes,genres,_merge
617,Furioza,2022,20220406,True,furioza,,False,tt18286666,tvMiniSeries,Furioza,Furioza,0.0,2022,40,Crime,both
1018,Measure for Measure,2021,20210101,False,measure for measure,,False,tt22456470,tvMiniSeries,Measure for Measure,Measure for Measure,0.0,2021,\N,Drama,both
1668,The Haunting of Bly Manor,2020,20201009,True,the haunting of bly manor,,False,tt10970552,tvMiniSeries,The Haunting of Bly Manor,The Haunting of Bly Manor,0.0,2020,494,"Drama,Horror,Mystery",both
1876,Tiger King: The Doc Antle Story,2021,20211210,True,tiger king the doc antle story,,False,tt16307970,tvMiniSeries,Tiger King: The Doc Antle Story,Tiger King: The Doc Antle Story,0.0,2021,127,"Biography,Crime,Documentary",both
2079,jeen-yuhs: A Kanye Trilogy,2022,20220216,True,jeenyuhs a kanye trilogy,,False,tt14599438,tvMiniSeries,Jeen-yuhs: A Kanye Trilogy,Jeen-yuhs: A Kanye Trilogy,0.0,2022,90,"Documentary,Music",both


In [38]:
multimatch1 = movies_and_season_1_merge[movies_and_season_1_merge['_merge'] == 'both'].groupby('combined_title').filter(lambda x: len(x) > 1)

In [52]:
movies_and_season_1[movies_and_season_1['combined_title'].str.contains("The One:")]

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show
1879,The One: Season 1,2021,20210616,True,the one,1.0,True
1880,The One: Season 1,2021,20210312,True,the one,1.0,True


In [42]:
pd.DataFrame({"combined_title":multimatch1['combined_title'].unique()}).to_csv("../data/interim/multimatch_manual.csv")

In [53]:
nomatch = movies_and_season_1_merge[movies_and_season_1_merge['_merge'] == 'left_only']

In [54]:
nomatch

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,endYear,runtimeMinutes,genres,_merge
2,10 Days With Santa Claus,2020,20201204,False,10 days with santa claus,,False,,,,,,,,,left_only
3,100 Humans: Season 1,2020,20200313,False,100 humans,1.0,True,,,,,,,,,left_only
11,37 Seconds,2020,20200131,False,37 seconds,,False,,,,,,,,,left_only
16,7SEEDS: Part 2,2020,20200326,True,7seeds part 2,,False,,,,,,,,,left_only
17,8,2020,20200612,False,8,,False,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2066,Young Wallander: Killer's Shadow,2022,20220217,True,young wallander killers shadow,,False,,,,,,,,,left_only
2086,the goop lab with Gwyneth Paltrow: Season 1,2020,20200124,False,the goop lab with gwyneth paltrow,1.0,True,,,,,,,,,left_only
2087,"tick, tick...BOOM!",2021,20211119,True,tick tickboom,,False,,,,,,,,,left_only
2088,Òlòtūré,2020,20201002,False,òlòtūré,,False,,,,,,,,,left_only


In [40]:
multimatch1.to_csv("../data/interim/multimatch1.csv")

In [37]:
good_merges_1[good_merges_1['titleType']=='short']

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,endYear,runtimeMinutes,genres,_merge
46,A StoryBots Space Adventure,2021,20210914,False,a storybots space adventure,,False,tt15166478,short,A StoryBots Space Adventure,A StoryBots Space Adventure,0.0,\N,12,"Adventure,Animation,Comedy",both
170,Audible,2021,20210701,False,audible,,False,tt12771540,short,Audible,Audible,0.0,\N,39,"Documentary,Short,Sport",both
218,Behind the Scenes With Jane Campion,2022,20220127,True,behind the scenes with jane campion,,False,tt18184270,short,Behind the Scenes with Jane Campion,Behind the Scenes with Jane Campion,0.0,\N,17,"Documentary,Short",both
246,Blood Ties: Season 1,2022,20220418,False,blood ties,1.0,True,tt20508036,short,Blood Ties,Blood Ties,0.0,\N,\N,"Mystery,Short",both
274,Bruised,2021,20211124,True,bruised,,False,tt15407686,short,Bruised,Bruised,0.0,\N,\N,Short,both
294,Camp Confidential: America's Secret Nazis,2021,20211102,True,camp confidential americas secret nazis,,False,tt15470770,short,Camp Confidential: America's Secret Nazis,Camp Confidential: America's Secret Nazis,0.0,\N,36,"Animation,Documentary,History",both
302,Captain Underpants Epic Choice-o-Rama,2020,20200211,False,captain underpants epic choiceorama,,False,tt11604750,short,Captain Underpants: Epic Choice-o-Rama,Captain Underpants: Epic Choice-o-Rama,0.0,\N,37,"Action,Adventure,Animation",both
306,Carmen Sandiego: To Steal or Not to Steal,2020,20200310,True,carmen sandiego to steal or not to steal,,False,tt11767524,short,Carmen Sandiego: To Steal or Not to Steal,Carmen Sandiego: To Steal or Not to Steal,0.0,\N,31,"Action,Adventure,Animation",both
311,Cat Burglar,2022,20220222,False,cat burglar,,False,tt17321170,short,Cat Burglar,Cat Burglar,0.0,\N,12,"Animation,Comedy,Short",both
322,Chadwick Boseman: Portrait of an Artist,2021,20210417,False,chadwick boseman portrait of an artist,,False,tt14476720,short,Chadwick Boseman: Portrait of an Artist,Chadwick Boseman: Portrait of an Artist,0.0,\N,21,"Documentary,Short",both


In [98]:
len(movies_and_season_1_merge[movies_and_season_1_merge['_merge'] == 'both'].groupby('combined_title').filter(lambda x: len(x) > 1)['combined_title'].unique()) #125 titles have multiple matches oh no!

#multimatch = merge[merge['_merge'] == 'both'].groupby('combined_title').filter(lambda x: len(x) > 1)

125

In [75]:
manual_tv_matches = pd.read_csv("../data/interim/tv_matched.csv")
manual_tv_matches['title'] = clean_title(manual_tv_matches['title'])    

manual_movie_matches = pd.read_csv("../data/interim/movies_matched.csv")
manual_movie_matches['title'] = clean_title(manual_movie_matches['title'])

manual_other_matches = pd.read_csv('../data/raw/matched_wIMDB.tsv', sep='\t')
manual_other_matches['title'] = clean_title(manual_other_matches['title'])

In [77]:
multimatch['title'].drop_duplicates().isin(manual_tv_matches['title']).sum() #29
multimatch['title'].drop_duplicates().isin(manual_movie_matches['title']).sum() #28

28

In [85]:
netflix_data[netflix_data['title']=='after life']

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show
67,After Life: Season 2,2020,20200424,True,after life,2.0,True
68,After Life: Season 3,2022,20220114,True,after life,3.0,True


In [87]:
imdb_all[imdb_all['tconst']=='tt8398600']

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,release_year,endYear,runtimeMinutes,genres,title
651397,tt8398600,tvSeries,After Life,After Life,0,2019.0,2022,30,"Comedy,Drama",after life


In [66]:
multimatch.tail(20)

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show,tconst,titleType,primaryTitle,originalTitle,isAdult,endYear,runtimeMinutes,genres,_merge
2493,We Can Be Heroes,2020,20201225,True,we can be heroes,,False,tt11937426,short,We Can Be Heroes,We Can Be Heroes,0.0,\N,\N,"Drama,Short",both
2497,Wedding Season,2022,20220804,True,wedding season,,False,tt11426572,movie,Wedding Season,Wedding Season,0.0,\N,98,"Comedy,Romance",both
2498,Wedding Season,2022,20220804,True,wedding season,,False,tt15114768,tvSeries,Wedding Season,Wedding Season,0.0,\N,\N,"Action,Adventure,Comedy",both
2508,Whipped,2020,20200918,False,whipped,,False,tt11754464,movie,Whipped,Bucin,0.0,\N,97,"Comedy,Drama,Romance",both
2509,Whipped,2020,20200918,False,whipped,,False,tt12792836,short,Whipped,Whipped,0.0,\N,7,"Comedy,Drama,Short",both
2563,You've Got This,2020,20201002,False,youve got this,,False,tt12635086,short,You've Got This,You've Got This,0.0,\N,\N,"Music,Short",both
2564,You've Got This,2020,20201002,False,youve got this,,False,tt13118012,movie,You've Got This,Ahí te Encargo,0.0,\N,111,"Comedy,Romance",both
2565,You: Season 3,2021,20211015,True,you,3.0,True,tt12497058,short,You,You,0.0,\N,\N,"Drama,Short",both
2566,You: Season 3,2021,20211015,True,you,3.0,True,tt13849012,video,You,You,0.0,\N,\N,"Music,Short",both
2567,You: Season 3,2021,20211015,True,you,3.0,True,tt14530626,short,You!,You!,0.0,\N,7,"Adventure,Short",both


In [64]:
imdb_all

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,release_year,endYear,runtimeMinutes,genres,title
0,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019.0,\N,\N,"Action,Crime",tötet nicht mehr
1,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,\N,133,Documentary,istoriya grazhdanskoy voyny
2,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021.0,\N,6,Short,color rhapsodie
3,tt0044326,short,Abstronic,Abstronic,0,2021.0,\N,6,Short,abstronic
4,tt0044879,short,Mandala,Mandala,0,2021.0,\N,3,Short,mandala
...,...,...,...,...,...,...,...,...,...,...
728908,tt9916724,short,Hay Que Ser Paciente,Hay Que Ser Paciente,0,2015.0,\N,3,"Documentary,Short",hay que ser paciente
728909,tt9916730,movie,6 Gunn,6 Gunn,0,2017.0,\N,116,\N,6 gunn
728910,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019.0,\N,\N,Short,pretty pretty black girl
728911,tt9916764,short,38,38,0,2018.0,\N,\N,Short,38


In [5]:
##read imdb movie data
imdb_movie_data = pd.read_csv('../data/processed/imdb_movies.csv')

In [8]:
imdb_movie_data['title'] = clean_title(imdb_movie_data['title'])

In [9]:
#read matched data
matched_data = pd.read_csv('../data/raw/matched_wIMDB.tsv', sep='\t')[['tconst', 'title', 'release_year', 'season']]

In [13]:
imdb_clean_movie = imdb_movie_data[['tconst', 'title', 'startYear']].rename(columns={'startYear':'release_year'})

In [20]:
imdb_movie_data[imdb_movie_data['tconst']=='tt13353456']

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,title


In [18]:
imdb_clean_movie[imdb_clean_movie['tconst']=='tt13353456']

Unnamed: 0,tconst,title,release_year


In [16]:
merge

Unnamed: 0,combined_title,release_year,release_date,is_top10,title,season,is_tv_show,tconst,_merge
0,#blackAF: Season 1,2020,20200417,False,blackaf,1.0,True,,left_only
1,(Un)Well: Season 1,2020,20200812,False,unwell,1.0,True,,left_only
2,10 Days With Santa Claus,2020,20201204,False,10 days with santa claus,,False,,left_only
3,100 Humans: Season 1,2020,20200313,False,100 humans,1.0,True,,left_only
4,1000 Miles from Christmas,2021,20211224,True,1000 miles from christmas,,False,tt13782052,both
...,...,...,...,...,...,...,...,...,...
2256,jeen-yuhs: A Kanye Trilogy,2022,20220216,True,jeenyuhs a kanye trilogy,,False,,left_only
2257,the goop lab with Gwyneth Paltrow: Season 1,2020,20200124,False,the goop lab with gwyneth paltrow,1.0,True,,left_only
2258,"tick, tick...BOOM!",2021,20211119,True,tick tickboom,,False,,left_only
2259,Òlòtūré,2020,20201002,False,òlòtūré,,False,,left_only


In [7]:
imdb_clean2 = pd.concat([imdb_clean_movie, matched_data], axis=0)

In [8]:
movie_merge = netflix_data[~netflix_data['is_tv_show']].merge(imdb_clean2, on=['title', 'release_year'], how='left', indicator=True)

In [9]:
good_batch_1 = movie_merge[(movie_merge['_merge']=='both') & (movie_merge['tconst'].notnull())].groupby('title', as_index=False).filter(lambda x: len(x) == 1)[['tconst', 'title', 'release_year']]
good_batch_1.to_csv('../data/processed/good_batch_1.csv', index=False)
#824 unique matches on title alone
#Drops to 818 if match on year

In [11]:
manual_match_1 = movie_merge[(movie_merge['_merge']=='both') & (movie_merge['tconst'].notnull())].groupby('title', as_index=False).filter(lambda x: len(x) > 1)[['tconst', 'title', 'release_year']]
manual_match_1.to_csv('../data/processed/manual_match_1.csv', index=False)

In [12]:
manual_match_1.to_clipboard()

In [314]:
fails = movie_merge[movie_merge['_merge']=='left_only']

In [315]:
imdb_all = pd.read_csv('../data/processed/imdb_all.csv').rename(columns={'startYear':'release_year'})

In [316]:
movie_merge2 = fails[['title', 'release_year']].merge(imdb_all, on=['title', 'release_year'], how='left', indicator=True)

In [317]:
good_batch_2 = movie_merge2[(movie_merge2['_merge']=='both') & (movie_merge2['tconst'].notnull())].groupby('title', as_index=False).filter(lambda x: len(x) == 1)[['tconst', 'title', 'release_year']]
good_batch_2.to_csv('../data/processed/good_batch_2.csv', index=False)

In [318]:
manual_match_2 = movie_merge2[(movie_merge2['_merge']=='both') & (movie_merge2['tconst'].notnull())].groupby('title', as_index=False).filter(lambda x: len(x) > 1)[['tconst', 'title', 'release_year']].drop_duplicates()
manual_match_2.to_csv('../data/processed/manual_match_2.csv', index=False)

In [319]:
manual_match_3 = movie_merge2[movie_merge2['_merge']=='left_only'][['title', 'release_year']]
manual_match_3.to_csv('../data/processed/manual_match_3.csv', index=False)

In [102]:
# Total movies minus the two automatically matched batches is the number of
# movies still needing a manual match, so it is labelled as remaining rather
# than as correctly matched.
print(f"Remaining Unmatched: {netflix_data[~netflix_data['is_tv_show']].shape[0] - good_batch_1.shape[0] - good_batch_2.shape[0]}\n")
print(f"Incorrectly Matched Duplicates Movies: {len(manual_match_1['title'].unique())}")
print(f"Incorrectly Matched Duplicate Non-movies: {len(manual_match_2['title'].unique())}")
print(f"Incorrectly Matched No Match: {len(manual_match_3['title'].unique())}")

#Remaining 119
#23 in manual_match_1
#6 in manual_match_2
#90 in manual_match_3


Correctly Matched: 119

Incorrectly Matched Duplicates Movies: 23
Incorrectly Matched Duplicate Non-movies: 6
Incorrectly Matched No Match: 90


## Repeat for TV Shows

In [192]:
#read imdb tv data
imdb_tv_data = pd.read_csv('../data/processed/imdb_tv.csv')[['tconst', 'title', 'startYear']].rename(columns={'startYear':'release_year'})

1021 Shows

In [210]:
#netflix_data[netflix_data['is_tv_show']].shape

(1121, 7)

In [234]:
tv_merge1 = netflix_data[netflix_data['is_tv_show']].merge(imdb_tv_data, on=['title'], how='left', indicator=True)

Somehow we already lost 4. I know 3 are because of duplicate title + season, but not sure where the last 1 went.

Left with 1117

In [235]:
#tv_merge1[['title', 'season']].drop_duplicates().shape

In [236]:
# Rows where a Netflix show found an IMDb candidate carrying a tconst, grouped
# per (title, season).
_tv_merge1_hits = tv_merge1[(tv_merge1['_merge']=='both') & (tv_merge1['tconst'].notnull())].groupby(['title', 'season'], as_index=False)
_tv_keep_cols = ['tconst', 'title', 'season', 'release_year_x']
_tv_year_rename = {'release_year_x':'release_year'}
# One candidate per (title, season): accepted as-is.
good_batch_tv_1 = _tv_merge1_hits.filter(lambda grp: len(grp) == 1)[_tv_keep_cols].rename(columns=_tv_year_rename)
# Several candidates per (title, season): one row per candidate, for manual review.
manual_match_tv_1 = _tv_merge1_hits.filter(lambda grp: len(grp) > 1)[_tv_keep_cols].rename(columns=_tv_year_rename)

In [245]:
print(f"Matched 1:1: {good_batch_tv_1.shape[0]}")
print(f"Matched 1:M: {manual_match_tv_1[['title', 'season']].drop_duplicates().shape[0]}")

Matched 1:1: 878
Matched 1:M: 154


Immediately 878 match 1:1, 154 match 1:M, 85 don't match. Everything OK up to here

In [247]:
#chip away at some of the manual_tv_1 matches using zach match
#already_matched_titles = manual_match_tv_1[manual_match_tv_1['title'].isin(matched_data['title'])]['title'].unique()
#manual_match_tv_2 = manual_match_tv_1[~manual_match_tv_1['title'].isin(already_matched_titles)]
#manual_match_tv_2.to_csv('../data/processed/manual_match_tv_1.csv', index=False)

In [218]:
#add the already matched to the good batch
#already_matched_df = matched_data[matched_data['title'].isin(already_matched_titles)][['tconst', 'title', 'season', 'release_year']]
#good_batch_tv_1 = pd.concat([good_batch_tv_1, already_matched_df], axis=0)
#good_batch_tv_1.to_csv('../data/processed/good_batch_tv_1.csv', index=False)

Of the 154 1:M, 9 are already matched via Zach's list. 145 remain

In [253]:
#9 matches
good_batch_tv_2 = manual_match_tv_1[['title', 'season', 'release_year']].merge(matched_data, on=['title', 'season', 'release_year'], how='inner').drop_duplicates()
good_batch_tv_2.to_csv('../data/processed/good_batch_tv_2.csv', index=False)

Of the 145 remaining, 83 match on season 1 release year. 62 remaining

In [291]:
#Clean more manual matches using release year with season 1
good_batch_tv_3 = manual_match_tv_1[['title', 'season', 'release_year']].drop_duplicates().merge(imdb_tv_data, on=['title', 'release_year'], how='inner')
good_batch_tv_3 = good_batch_tv_3.groupby(['title', 'season'], as_index=False).filter(lambda x: len(x) == 1)[['tconst', 'title', 'season', 'release_year']]
good_batch_tv_3.to_csv('../data/processed/good_batch_tv_3.csv', index=False)
print(good_batch_tv_3.shape)

(83, 4)


62 remaining manual matches with potential ttvalues

In [298]:
# Stack the two resolved batches and outer-merge them with every 1:M candidate
# row on (title, season); indicator=True adds a _merge column recording which
# side each row came from.
checker = pd.concat([good_batch_tv_2, good_batch_tv_3], axis=0).merge(manual_match_tv_1, on=['title', 'season'], how='outer', indicator=True)
# 'right_only' rows are (title, season) pairs present in manual_match_tv_1 but in
# neither resolved batch; the _y-suffixed columns are the manual_match_tv_1 side.
remaining_manual_1 = checker[checker['_merge']=='right_only'][['title', 'season', 'release_year_y', 'tconst_y']].rename(columns={'release_year_y':'release_year', 'tconst_y':'tconst'})

# Save the still-ambiguous candidates.
remaining_manual_1.to_csv('../data/processed/manual_match_tv_1_updated.csv', index=False)

In [221]:
# 1121 tv shows
# 887 matched 
# 145 multiple ttcodes


Move on to the 85 non-matches

In [246]:
fails_tv = tv_merge1[tv_merge1['_merge']=='left_only']
print(fails_tv.shape)

(85, 10)


In [300]:
#Now try matching failed matches using zach's data
# Left-merge the shows with no IMDb candidate against matched_data on (title, season).
tv_merge2 = fails_tv[['title', 'season', 'release_year_x']].merge(matched_data, on=['title', 'season'], how='left', indicator=True)
# NOTE(review): tv_merge2 carries both release_year_x (from fails_tv) and
# release_year (from matched_data). The selection below takes release_year, so the
# trailing rename of release_year_x has nothing to rename — confirm which year
# column was intended.
good_batch_tv_4 = tv_merge2[(tv_merge2['_merge']=='both') & (tv_merge2['tconst'].notnull())].groupby(['title', 'season'], as_index=False).filter(lambda x: len(x) == 1)[['tconst', 'title', 'season', 'release_year']].rename(columns={'release_year_x':'release_year'})
# Rows with no entry in matched_data either; these keep the year column from fails_tv.
manual_match_tv_2 = tv_merge2[(tv_merge2['_merge']=='left_only')][['title', 'season', 'release_year_x']].rename(columns={'release_year_x':'release_year'})

66 Match with Zach's list. 19 remaining

In [302]:
# Save batch 4 under its own name. The previous target, good_batch_tv_2.csv, is
# the file good_batch_tv_2 is saved to, so writing here replaced that batch on disk.
good_batch_tv_4.to_csv('../data/processed/good_batch_tv_4.csv', index=False)
#66 good matches

19 Remaining need a manual match

In [304]:
#manual_match_tv_2.shape
manual_match_tv_2.to_csv('../data/processed/manual_match_tv_2.csv', index=False)

Combine the remaining titles that still need a manual match into one file

In [305]:
pd.concat([remaining_manual_1, manual_match_tv_2], axis=0).to_csv('../data/interim/tv_need_matches.csv', index=False)

In [320]:
pd.concat([manual_match_1, manual_match_2, manual_match_3], axis=0).to_csv('../data/interim/movies_need_matches.csv', index=False)

In [None]:
pd.read_csv('../data/interim/movies_need_matches.csv').shape