In [89]:
# Importing the required libraries for preparing metadata dataframe

import os
import sys

import numpy as np 
import pandas as pd 

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

In [90]:
# While running on Kaggle, the following paths are used
movies_df = pd.read_csv('../input/movie-lens-dataset/movies.csv')
ratings_df = pd.read_csv('../input/movie-lens-dataset/ratings.csv')
tags_df = pd.read_csv('../input/movie-lens-dataset/tags.csv')

# While running on system, use these paths (uncomment these & comment the above ones)
# movies_df = pd.read_csv('../../datasets/movies.csv')
# ratings_df = pd.read_csv('../../datasets/ratings.csv')
# tags_df = pd.read_csv('../../datasets/tags.csv')

In [91]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [92]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [93]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [94]:
movies_df.info

<bound method DataFrame.info of       movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Dra

In [95]:
tags_df.info

<bound method DataFrame.info of       userId  movieId               tag   timestamp
0          2    60756             funny  1445714994
1          2    60756   Highly quotable  1445714996
2          2    60756      will ferrell  1445714992
3          2    89774      Boxing story  1445715207
4          2    89774               MMA  1445715200
...      ...      ...               ...         ...
3678     606     7382         for katie  1171234019
3679     606     7936           austere  1173392334
3680     610     3265            gun fu  1493843984
3681     610     3265  heroic bloodshed  1493843978
3682     610   168248  Heroic Bloodshed  1493844270

[3683 rows x 4 columns]>

In [96]:
# Joining the genre by replacing the '|' symbol with ' '
movies_df['genres'] = movies_df['genres'].str.replace('|',' ')

In [97]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


Merging the **Movies** & the **Tags** dataframe and creating a **metadata** tag for each movie:

In [98]:
# Merging all the columns of the Movies and Tags dataframe to be processed further
merged = pd.merge(movies_df, tags_df, on='movieId', how='left')
merged.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0


In [99]:
# Creating Metadata

merged.fillna("", inplace=True)
merged = pd.DataFrame(merged.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))

post_merge = pd.merge(movies_df, merged, on='movieId', how='left')

# Joining genre & tag column
post_merge ['metadata'] = post_merge[['tag','genres']].apply(lambda x: ' '.join(x), axis = 1)
post_merge[['movieId','title','metadata']].head()

Unnamed: 0,movieId,title,metadata
0,1,Toy Story (1995),pixar pixar fun Adventure Animation Children C...
1,2,Jumanji (1995),fantasy magic board game Robin Williams game A...
2,3,Grumpier Old Men (1995),moldy old Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),pregnancy remake Comedy


In [100]:
# Deleting the columns which are no longer required
del post_merge['movieId']
del post_merge['genres']
del post_merge['tag']

In [101]:
metadata_df = post_merge
metadata_df.head(10)

Unnamed: 0,title,metadata
0,Toy Story (1995),pixar pixar fun Adventure Animation Children C...
1,Jumanji (1995),fantasy magic board game Robin Williams game A...
2,Grumpier Old Men (1995),moldy old Comedy Romance
3,Waiting to Exhale (1995),Comedy Drama Romance
4,Father of the Bride Part II (1995),pregnancy remake Comedy
5,Heat (1995),Action Crime Thriller
6,Sabrina (1995),remake Comedy Romance
7,Tom and Huck (1995),Adventure Children
8,Sudden Death (1995),Action
9,GoldenEye (1995),Action Adventure Thriller


In [102]:
metadata_df.info

<bound method DataFrame.info of                                           title  \
0                              Toy Story (1995)   
1                                Jumanji (1995)   
2                       Grumpier Old Men (1995)   
3                      Waiting to Exhale (1995)   
4            Father of the Bride Part II (1995)   
...                                         ...   
9737  Black Butler: Book of the Atlantic (2017)   
9738               No Game No Life: Zero (2017)   
9739                               Flint (2017)   
9740        Bungo Stray Dogs: Dead Apple (2018)   
9741        Andrew Dice Clay: Dice Rules (1991)   

                                               metadata  
0     pixar pixar fun Adventure Animation Children C...  
1     fantasy magic board game Robin Williams game A...  
2                              moldy old Comedy Romance  
3                                  Comedy Drama Romance  
4                               pregnancy remake Comedy  
...    

In [103]:
# Removing duplicate tags and genres if present
# For example. in Toy Story, tags contained multiple occurence of 'pixar'

from collections import OrderedDict

metadata_df['Metadata'] = (metadata_df['metadata'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x)
                              .keys()).str.join(' '))
del metadata_df['metadata']


In [104]:
metadata_df.rename(columns = {'Metadata': 'metadata'})

metadata_df

Unnamed: 0,title,Metadata
0,Toy Story (1995),pixar fun Adventure Animation Children Comedy ...
1,Jumanji (1995),fantasy magic board game Robin Williams Advent...
2,Grumpier Old Men (1995),moldy old Comedy Romance
3,Waiting to Exhale (1995),Comedy Drama Romance
4,Father of the Bride Part II (1995),pregnancy remake Comedy
...,...,...
9737,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9738,No Game No Life: Zero (2017),Animation Comedy Fantasy
9739,Flint (2017),Drama
9740,Bungo Stray Dogs: Dead Apple (2018),Action Animation
