In [109]:
import pandas as pd
import numpy as np
from ast import literal_eval

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import pymongo
import json
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from textblob import TextBlob, Word

In [110]:
# Load the four raw CSV inputs from ./input into DataFrames.
# low_memory=False asks pandas to parse the metadata file in one pass instead
# of chunk-wise, which avoids mixed-dtype inference on its columns.
movies_df=pd.read_csv('./input/movies_metadata.csv',low_memory=False)
credits_df=pd.read_csv('./input/credits.csv')
keywords_df=pd.read_csv("./input/keywords.csv")
ratings_df=pd.read_csv('./input/ratings.csv')

In [111]:
# Load the second, smaller movie/credits pair from ./imdb5000.
# movies_5000df later supplies the list of titles to keep (needed_movies).
movies_5000df=pd.read_csv('./imdb5000/5kmovies.csv')
credits_5000df=pd.read_csv('./imdb5000/5kcredits.csv')

In [112]:
# Inspect the column labels of the movie metadata frame.
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [113]:
# Inspect the column labels of the ratings frame.
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [114]:
# Inspect the column labels of the imdb5000 movie frame.
movies_5000df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [115]:
# (rows, columns) of the metadata frame before any de-duplication.
movies_df.shape

(45466, 24)

In [116]:
# De-duplicate movies_df in two passes: first keep one row per title, then one
# row per id. drop_duplicates keeps the first occurrence by default, so the
# order of the passes decides which rows survive.
movies_df = (
    movies_df
    .drop_duplicates(subset="title")
    .drop_duplicates(subset="id")
)

In [117]:
# Preview the first rows of the de-duplicated metadata frame.
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,30-10-1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,15-12-1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,22-12-1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,22-12-1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,10-02-1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [118]:
def namefun(x):
    """Look up the title(s) in ``movies_df`` whose ``id`` equals ``x['movieId']``.

    Reads the module-level ``movies_df`` at call time.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by ``'movieId'`` (for example a ratings row).

    Returns
    -------
    pandas.Series
        The ``title`` column restricted to the matching rows. This is a
        Series, not a scalar: it is empty when no id matches and holds one
        entry per matching row otherwise.
    """
    id_matches = movies_df['id'] == x['movieId']
    return movies_df.loc[id_matches, 'title']

In [119]:
# Remove the metadata columns that are not needed downstream.
# Re-running this cell on an already-trimmed frame raises KeyError, because
# the listed columns no longer exist.
columns_to_drop = [
    "adult",
    "genres",
    "belongs_to_collection",
    "homepage",
    "imdb_id",
    "original_language",
    "original_title",
    "overview",
    "release_date",
    "budget",
    "video",
    'spoken_languages',
    'production_companies',
    'production_countries',
    'revenue',
    'status',
    'tagline',
    'vote_average',
    'vote_count',
]
movies_df = movies_df.drop(columns=columns_to_drop)

In [120]:
# Second trimming pass: drop three more columns from movies_df.
columns_to_drop = [
    'poster_path',
    'runtime',
    'popularity',
]
movies_df = movies_df.drop(columns=columns_to_drop)

In [121]:
# The rating timestamp is not used; keep only userId, movieId and rating.
ratings_df = ratings_df.drop(columns=['timestamp'])

In [122]:
# Display the ratings frame (pandas truncates the repr to the first/last rows).
# NOTE(review): ratings_df.head() would give a cheaper, more focused preview.
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
...,...,...,...
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5
26024287,270896,64957,4.5


In [123]:
# Remove rows whose id cannot be parsed as an integer, then make id numeric.
def _is_int_like(value):
    """Return True when ``int(value)`` succeeds, False when it raises."""
    try:
        int(value)
    except (TypeError, ValueError, OverflowError):
        # Only the conversion failures int() can raise are treated as "not an
        # integer id"; anything else (e.g. KeyboardInterrupt) propagates.
        return False
    return True


# Build one boolean mask and filter once. The earlier row-by-row version
# assigned int(id) to the iterrows() copy (which never reached the frame) and
# called drop(inplace=True) per bad row while iterating.
# astype(bool) keeps the mask boolean even when movies_df is empty.
has_int_id = movies_df["id"].map(_is_int_like).astype(bool)
movies_df = movies_df[has_int_id].copy()

# Every remaining id parses as an integer, so this conversion cannot fail on
# the values filtered above.
movies_df["id"] = pd.to_numeric(movies_df["id"])

# NOTE(review): casting movieId to object is kept for compatibility with the
# original cell, but it looks unnecessary for the merge on a numeric id and
# costs memory on a frame this large; confirm before removing.
ratings_df["movieId"] = ratings_df["movieId"].astype(object)

In [150]:
# Raw array of user ids, one entry per rating row (used to count distinct users).
x=ratings_df['userId'].values

In [151]:
# Number of distinct user ids present in the ratings.
np.unique(x).size

270896

In [127]:
# Inner-join movies with their ratings: movies_df.id against ratings_df.movieId.
# NOTE(review): this assumes both columns use the same id space. If ratings.csv
# follows the MovieLens layout (movieId is a MovieLens id that maps to the
# TMDB id through a separate links file), this join attaches ratings to the
# wrong titles. Confirm against the dataset documentation.
result = movies_df.merge(ratings_df, left_on='id', right_on='movieId')

In [143]:
# Second, independent run of the same movies/ratings inner join as `result`.
result2 = movies_df.merge(ratings_df, left_on='id', right_on='movieId')

In [139]:
# Display the merged movies/ratings frame (repr is truncated by pandas).
result

Unnamed: 0,id,title,userId,movieId,rating
0,862,Toy Story,1923,862,3.0
1,862,Toy Story,2103,862,5.0
2,862,Toy Story,5380,862,1.0
3,862,Toy Story,6177,862,4.0
4,862,Toy Story,6525,862,4.0
...,...,...,...,...,...
10875842,111109,Century of Birthing,33940,111109,2.5
10875843,111109,Century of Birthing,172224,111109,3.0
10875844,111109,Century of Birthing,210792,111109,3.0
10875845,111109,Century of Birthing,225396,111109,3.5


In [140]:
# Titles from the imdb5000 frame; used below to restrict `result` to these movies.
needed_movies=movies_5000df['title'].values

In [142]:
def wanted(x):
    """Tell whether a row's title is one of the titles in ``needed_movies``.

    Acts as a side-effect-free predicate, e.g. for
    ``result[result.apply(wanted, axis=1)]``. The earlier body ended in a bare
    ``result.drop`` attribute lookup that was never called, so it did nothing
    and always returned None.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by ``'title'`` (for example a row of ``result``).

    Returns
    -------
    bool
        True when ``x['title']`` occurs in the module-level ``needed_movies``,
        False otherwise.
    """
    return bool(x['title'] in needed_movies)

(4803,)

In [145]:
# Keep only the merged rows whose title also appears in needed_movies.
df = result.loc[result['title'].isin(needed_movies)]

In [146]:
# Display the title-filtered movies/ratings frame (repr is truncated by pandas).
df

Unnamed: 0,id,title,userId,movieId,rating
0,862,Toy Story,1923,862,3.0
1,862,Toy Story,2103,862,5.0
2,862,Toy Story,5380,862,1.0
3,862,Toy Story,6177,862,4.0
4,862,Toy Story,6525,862,4.0
...,...,...,...,...,...
10744875,73981,Ayurveda: Art of Being,238736,73981,4.0
10744876,73981,Ayurveda: Art of Being,243942,73981,3.5
10744877,73981,Ayurveda: Art of Being,245739,73981,4.0
10744878,73981,Ayurveda: Art of Being,263872,73981,3.0


In [149]:
# Number of distinct movie titles that survived the needed_movies filter.
np.unique(df['title'].values).size

1339