In [1]:
import numpy as np
import pandas as pd

### EDA + cleaning

In [2]:
## Read filmweb data
df_filmweb = pd.read_csv("data/filmweb.csv")
df_filmweb.head()

Unnamed: 0,title,originalTitle,description,genre
0,Zielona mila,The Green Mile,Emerytowany strażnik więzienny opowiada przyja...,Dramat
1,Skazani na Shawshank,The Shawshank Redemption,Adaptacja opowiadania Stephena Kinga. Niesłusz...,Dramat
2,Forrest Gump,,"Historia życia Forresta, chłopca o niskim ilor...",Dramat / Komedia
3,Leon zawodowiec,Léon,Płatny morderca ratuje dwunastoletnią dziewczy...,Dramat / Kryminał
4,Requiem dla snu,Requiem for a Dream,"Historia czwórki bohaterów, dla których używki...",Dramat


In [3]:
## Read train imdb data
df_imdb_1 = pd.read_csv('data/train_data.txt', sep=":::", header=None, engine='python')
df_imdb_1.columns = ['id', 'title', 'genre', 'description']
df_imdb_1 = df_imdb_1.drop(columns=["id"])

In [4]:
## Read test imdb data
df_imdb_2 = pd.read_csv('data/test_data_solution.txt', sep=":::", header=None, engine='python')
df_imdb_2.columns = ['id', 'title', 'genre', 'description']
df_imdb_2 = df_imdb_2.drop(columns=["id"])

In [5]:
## Combine the imdb datasets to create one big dataset
df_imdb_all = pd.concat([df_imdb_1, df_imdb_2], axis=0).reset_index(drop=True)

In [6]:
df_imdb_all

Unnamed: 0,title,genre,description
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...
108409,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
108410,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
108411,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
108412,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [7]:
## Check for Nan's

In [8]:
df_imdb_all.isna().sum()

title          0
genre          0
description    0
dtype: int64

In [9]:
## We can get the "originalTitle" from the title column
df_filmweb.isna().sum()

title               0
originalTitle    2366
description         0
genre               0
dtype: int64

In [10]:
## If the "originalTitle" is None then the "title" is the "originalTitle"
df_filmweb["originalTitle"][df_filmweb["originalTitle"].isna()] = df_filmweb["title"][df_filmweb["originalTitle"].isna()]

## Let's also drop the "title" column because it's no longer needed and rename the "originalTitle" column to "title"
df_filmweb = df_filmweb.drop(columns="title").rename({"originalTitle": "title"}, axis=1)

In [11]:
## Check unique genres in each move database

In [12]:
df_imdb_all.genre.unique()

array([' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',
       ' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',
       ' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',
       ' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',
       ' history ', ' news ', ' biography ', ' romance ', ' game-show ',
       ' musical ', ' war '], dtype=object)

In [13]:
## Some movies belong to multiply genres, let's split them so each movie with multiple genres
## occupy the proper number of rows (that later)
df_filmweb.genre.unique()

array(['Dramat', 'Dramat / Komedia', 'Dramat / Kryminał',
       'Akcja / Sci-Fi', 'Thriller', 'Dramat historyczny', 'Sci-Fi',
       'Animacja / Familijny / Komedia', 'Dramat / Wojenny',
       'Thriller / Psychologiczny', 'Gangsterski',
       'Melodramat / Katastroficzny', 'Biograficzny / Dramat / Komedia',
       'Dramat / Gangsterski', 'Surrealistyczny / Thriller / Sci-Fi',
       'Kryminał / Thriller', 'Dramat / Thriller',
       'Biograficzny / Dramat / Wojenny', 'Fantasy / Przygodowy',
       'Komedia', 'Wojenny', 'Biograficzny / Dramat', 'Komedia / Sci-Fi',
       'Dramat / Komedia / Psychologiczny', 'Fantasy / Melodramat',
       'Dramat / Psychologiczny', 'Thriller / Sci-Fi', 'Horror',
       'Western', 'Dramat / Komedia / Sci-Fi', 'Animacja / Familijny',
       'Biograficzny / Komedia kryminalna', 'Horror / Sci-Fi',
       'Kryminał / Przygodowy', 'Dramat / Fantasy', 'Kryminał',
       'Dramat / Komedia / Wojenny', 'Fantasy / Komedia',
       'Biograficzny / Dramat / Przygo

In [14]:
## preprocessing 
def clean_titles(ser):
    ser = ser.copy()


    ser = (ser
    .str
    .replace(pat=r"(\([a-zA-Z0-9./?]+\))|([\'\"])", repl="", regex=True)
    .str
    .strip()
    .str
    .normalize("NFKD")
    .str
    .encode("ascii", errors="ignore")
    .str
    .decode("utf-8")
    .str
    .lower()
    )
    return ser

def explode_genres_filmweb(df_):
    
    ## Słowniczek
    mapping = {"dramat": "drama", "komedia": "comedy", "dokumentalny": "documentary", "krótkometrażowy": "short",
              "akcja": "action", "melodramat": "drama", "familijny": "family", "przygodowy": "adventure",
              "muzyczny": "music", "romans": "romance",  "sensacyjny": "thriller", "biograficzny": "biography",
              "animacja": "animation", "dramat obyczajowy": "drama", "dramat historyczny": "drama",
              "dramat sądowy": "drama", "wojenny": "war", "kryminał": "crime", "psychologiczny": "thriller"}
    
    df_ = df_.copy()
    
    df_ = (df_
          .assign(genre=lambda df_d: df_d
                  ["genre"]
                  .str
                  .split(" / "))
          .explode("genre")
          .assign(genre=lambda df_d: df_d
                 ["genre"]
                 .str
                 .lower()
                 .replace(mapping)))
    
    
    return df_

In [15]:
## Cleanin'
df_filmweb["title"] = df_filmweb["title"].pipe(clean_titles)

In [16]:
df_imdb_all["title"] = df_imdb_all["title"].pipe(clean_titles)

In [17]:
df_filmweb

Unnamed: 0,title,description,genre
0,the green mile,Emerytowany strażnik więzienny opowiada przyja...,Dramat
1,the shawshank redemption,Adaptacja opowiadania Stephena Kinga. Niesłusz...,Dramat
2,forrest gump,"Historia życia Forresta, chłopca o niskim ilor...",Dramat / Komedia
3,leon,Płatny morderca ratuje dwunastoletnią dziewczy...,Dramat / Kryminał
4,requiem for a dream,"Historia czwórki bohaterów, dla których używki...",Dramat
...,...,...,...
9938,dark angel,Policyjny detektyw zostaje wplątany w zagrażaj...,Thriller / Sci-Fi
9939,pozegnanie z maria,Grupa młodych ludzi próbuje zachować resztki n...,Melodramat / Wojenny
9940,rh+,Rh (+) to opowieść o grupie dwudziestokilkule...,Kryminał / Sensacyjny
9941,wet hot american summer,"Grupa opiekunów letniego obozu, podczas ostatn...",Komedia rom.


In [18]:
df_imdb_all = df_imdb_all.iloc[:, [0,2,1]]
df_imdb_all["genre"] = df_imdb_all["genre"].str.strip()
df_imdb_all

Unnamed: 0,title,description,genre
0,oscar et la dame rose,Listening in to a conversation between his do...,drama
1,cupid,A brother and sister with a past incestuous r...,thriller
2,"young, wild and wonderful",As the bus empties the students for their fie...,adult
3,the secret sin,To help their unemployed father make ends mee...,drama
4,the unrecovered,The film's title refers not only to the un-re...,drama
...,...,...,...
108409,tales of light & dark,"Covering multiple genres, Tales of Light & Da...",horror
108410,der letzte mohikaner,As Alice and Cora Munro attempt to find their...,western
108411,oliver twink,A movie 169 years in the making. Oliver Twist...,adult
108412,slipstream,"Popular, but mysterious rock D.J Mike Mallard...",drama


In [19]:
## Explodin'
df_filmweb = df_filmweb.pipe(explode_genres_filmweb)

In [20]:
df_filmweb

Unnamed: 0,title,description,genre
0,the green mile,Emerytowany strażnik więzienny opowiada przyja...,drama
1,the shawshank redemption,Adaptacja opowiadania Stephena Kinga. Niesłusz...,drama
2,forrest gump,"Historia życia Forresta, chłopca o niskim ilor...",drama
2,forrest gump,"Historia życia Forresta, chłopca o niskim ilor...",comedy
3,leon,Płatny morderca ratuje dwunastoletnią dziewczy...,drama
...,...,...,...
9940,rh+,Rh (+) to opowieść o grupie dwudziestokilkule...,thriller
9941,wet hot american summer,"Grupa opiekunów letniego obozu, podczas ostatn...",komedia rom.
9942,the sicilian,Wyjęty spod prawa Salvatore Giuliano razem z g...,biography
9942,the sicilian,Wyjęty spod prawa Salvatore Giuliano razem z g...,drama


In [21]:
unique_genres_imdb = df_imdb_all.genre.str.strip().unique()

In [22]:
boolean_mask = df_filmweb.genre.isin(unique_genres_imdb)

In [23]:
df_filmweb = df_filmweb[boolean_mask].reset_index(drop=True)

In [24]:
df_filmweb.groupby("genre").size().sort_values(ascending=False)

genre
drama          4522
comedy         2162
thriller       1818
action          794
horror          783
sci-fi          761
adventure       720
crime           549
biography       539
fantasy         500
family          462
romance         417
animation       393
war             304
documentary     278
music           252
musical         149
short           135
western          95
dtype: int64

In [25]:
df_imdb_all.groupby("genre").size().sort_values(ascending=False)

genre
drama          27225
documentary    26192
comedy         14893
short          10145
horror          4408
thriller        3181
action          2629
western         2064
reality-tv      1767
family          1567
adventure       1550
music           1462
romance         1344
sci-fi          1293
adult           1180
crime           1010
animation        996
sport            863
talk-show        782
fantasy          645
mystery          637
musical          553
biography        529
history          486
game-show        387
news             362
war              264
dtype: int64

In [37]:
def down_sample_both_dfs(df1, df2):
    same_genres = set(df1.genre) & set(df2.genre)
    print(same_genres)
    groupped_df1 = df1.groupby("genre")
    groupped_df2 = df2.groupby("genre")
    
    
    groupped_df1_sizes = groupped_df1.size()
    groupped_df2_sizes = groupped_df2.size()
    
    mins_for_every_genre = {key: min(groupped_df1_sizes[key], groupped_df2_sizes[key]) for key in same_genres}
    print(mins_for_every_genre)
    
    df1_new_beggining = pd.DataFrame()
    df2_new_beggining = pd.DataFrame()
    
    for key, value in mins_for_every_genre.items():
#         print(groupped_df1[key].sample(n=value))
        df1_new_beggining = pd.concat([df1_new_beggining, df1[df1["genre"] == key].sample(n=value)], axis=0)
        df2_new_beggining = pd.concat([df2_new_beggining, df2[df2["genre"] == key].sample(n=value)], axis=0)
    return df1_new_beggining.reset_index(drop=True), df2_new_beggining.reset_index(drop=True)
    

In [38]:
df_filmweb_sampled, df_imdb_all_sampled = down_sample_both_dfs(df_filmweb, df_imdb_all)

{'drama', 'musical', 'adventure', 'thriller', 'comedy', 'romance', 'sci-fi', 'biography', 'crime', 'western', 'short', 'family', 'action', 'documentary', 'music', 'animation', 'war', 'horror', 'fantasy'}
{'drama': 4522, 'musical': 149, 'adventure': 720, 'thriller': 1818, 'comedy': 2162, 'romance': 417, 'sci-fi': 761, 'biography': 529, 'crime': 549, 'western': 95, 'short': 135, 'family': 462, 'action': 794, 'documentary': 278, 'music': 252, 'animation': 393, 'war': 264, 'horror': 783, 'fantasy': 500}


In [39]:
(df_filmweb_sampled.groupby("genre").size() == df_imdb_all_sampled.groupby("genre").size()).all()

True

In [40]:
df_filmweb_sampled

Unnamed: 0,title,description,genre
0,cleo de 5 a 7,"Cleo (Corinne Marchand), młoda dziewczyna, pi...",drama
1,shot caller,"Jacob opuszcza więzienie, lecz nie może wrócić...",drama
2,au revoir la-haut,"Gdy, krótko po zakończeniu I wojny światowej, ...",drama
3,backdraft,Dwaj strażacy próbują odnaleźć sprawcę serii z...,drama
4,u pana boga za piecem,Młoda Rosjanka zostaje okradziona po przekrocz...,drama
...,...,...,...
15578,rekopis znaleziony w saragossie,"Młody oficer wyrusza do Madrytu, by objąć stan...",fantasy
15579,practical magic,Dwie siostry wychowane przez ciotki w domu peł...,fantasy
15580,ugetsu monogatari,Opowieści księżycowe to romantyczna i posępna...,fantasy
15581,what dreams may come,Po tragicznej śmierci dzieci Annie popełnia sa...,fantasy


In [41]:
df_imdb_all_sampled

Unnamed: 0,title,description,genre
0,hello & goodbye,With wounds still open from the recent end of...,drama
1,where eskimos live,"Sharkey, part of the sinister world of child ...",drama
2,love is war,I en botanisk hage mřter Gro sin Espen. Det f...,drama
3,lauras toys,Archaeologist Walter and his wife Laura are w...,drama
4,santa mesa,"Following his mother's death, Hector arrives ...",drama
...,...,...,...
15578,the mystical adventures of billy owens,Billy Owens is turning 11. Turns out this lit...,fantasy
15579,primal man: artist in motion - naked sketch ar...,"In the tradition of Edward Muybridge, Directo...",fantasy
15580,the last dragon,"An alternate scientific history, in which dra...",fantasy
15581,mahou sentai magirenja,Forces from Infershia attack the surface worl...,fantasy
