In [1]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics.pairwise as pw
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
movies=pd.read_csv('./data/movies.csv')
ratings = pd.read_csv('./data/ratings.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings['movieId'].nunique()

9724

In [5]:
#checking duplicate movie titles
movies['title'].value_counts().sort_values(ascending=False)

Emma (1996)                               2
Saturn 3 (1980)                           2
Eros (2004)                               2
Confessions of a Dangerous Mind (2002)    2
War of the Worlds (2005)                  2
                                         ..
Lone Star (1996)                          1
Vampire Hunter D (1985)                   1
What's Up, Doc? (1972)                    1
Jaws 3-D (1983)                           1
My Science Project (1985)                 1
Name: title, Length: 9737, dtype: int64

In [6]:
#checking duplicate ids
movies['movieId'].value_counts().sort_values(ascending=False).head()

83969     1
101577    1
26629     1
45062     1
79879     1
Name: movieId, dtype: int64

In [7]:
duplicate_movies = movies.groupby('title').filter(lambda x: len(x) == 2)
duplic_ids = duplicate_movies['movieId'].values

In [8]:
duplicate_movies

Unnamed: 0,movieId,title,genres
650,838,Emma (1996),Comedy|Drama|Romance
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
5601,26958,Emma (1996),Romance
5854,32600,Eros (2004),Drama
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
6932,64997,War of the Worlds (2005),Action|Sci-Fi
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
9135,147002,Eros (2004),Drama|Romance
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller


In [9]:
duplic_ids

array([   838,   2851,   6003,  26958,  32600,  34048,  64997, 144606,
       147002, 168358], dtype=int64)

In [10]:
duplicate_movies = duplicate_movies[['movieId','title']]

In [11]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
review_count = pd.DataFrame(ratings[ratings['movieId'].isin(duplic_ids)]['movieId'].value_counts())

In [13]:
review_count

Unnamed: 0,movieId
34048,50
838,30
6003,15
2851,4
64997,2
144606,1
32600,1
26958,1
147002,1
168358,1


In [14]:
review_count.reset_index(inplace=True)

In [15]:
review_count

Unnamed: 0,index,movieId
0,34048,50
1,838,30
2,6003,15
3,2851,4
4,64997,2
5,144606,1
6,32600,1
7,26958,1
8,147002,1
9,168358,1


In [16]:
review_count.columns = ['movieId','count']
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId')

In [17]:
#Getting duplicates with low review count
duplicated_df.sort_values(by=['title','count'],ascending=[True,False])

Unnamed: 0,movieId,title,count
2,6003,Confessions of a Dangerous Mind (2002),15
7,144606,Confessions of a Dangerous Mind (2002),1
0,838,Emma (1996),30
3,26958,Emma (1996),1
4,32600,Eros (2004),1
8,147002,Eros (2004),1
1,2851,Saturn 3 (1980),4
9,168358,Saturn 3 (1980),1
5,34048,War of the Worlds (2005),50
6,64997,War of the Worlds (2005),2


In [18]:
duplicated_ids = duplicated_df.drop_duplicates(subset ="title", 
                     keep = 'first', inplace = False)['movieId']

In [19]:
#deleting movies with duplicate ids in movies dataset
movies=movies.loc[~movies['movieId'].isin(duplicated_ids)]


In [20]:
#deleting movies with duplicate ids in ratings dataset
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

In [21]:
#creating list with unique genres
genres = list(set('|'.join(list(movies["genres"].unique())).split('|')))
genres.remove('(no genres listed)')

In [22]:
#Creating dummy columns for each genre
for genre in genres:
    movies[genre] = movies['genres'].map(lambda val: 1 if genre in val else 0)
#removing year from titles
movies['title']=movies['title'].apply(lambda x:x[:-7])
#dropping genres and timestamp
movies.drop('genres', axis=1,inplace= True)  
ratings.drop('timestamp', axis=1,inplace= True) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [23]:
movies_final=pd.merge(ratings, movies, on='movieId')

In [24]:
#Converting titles to lowercase
movies_final['title']=movies_final['title'].str.lower()


In [25]:
movies_final.to_csv('movies_final.csv',index=False)

In [32]:
movies[movies['title']=='Dark Knight']

Unnamed: 0,movieId,title,Drama,Animation,Western,Musical,Children,Comedy,IMAX,Mystery,...,Documentary,Action,Thriller,Romance,Fantasy,Film-Noir,Sci-Fi,Adventure,Horror,Crime


In [33]:
movies

Unnamed: 0,movieId,title,Drama,Animation,Western,Musical,Children,Comedy,IMAX,Mystery,...,Documentary,Action,Thriller,Romance,Fantasy,Film-Noir,Sci-Fi,Adventure,Horror,Crime
0,1,Toy Story,0,1,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0
1,2,Jumanji,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,Waiting to Exhale,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
9738,193583,No Game No Life: Zero,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
9739,193585,Flint,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
