# Importing Libraries

In [1]:
import pandas as pd #for reading the csv files
import numpy as np #for implementing matrix operations
import matplotlib.pyplot as plt #To visualize data graphically

# Defining File Locations

In [2]:
location1='D:\\Madhvik 05\\Industrial training files\\bollywood_2010-2019.csv'
location2='D:\\Madhvik 05\\Industrial training files\\bollywood_meta_2010-2019.csv'
location3='D:\\Madhvik 05\\Industrial training files\\bollywood_ratings_2010-2019.csv'
location4='D:\\Madhvik 05\\Industrial training files\\bollywood_text_2010-2019.csv'

# Reading CSV files

In [3]:
movie_title=pd.read_csv(location1)
movie_meta=pd.read_csv(location2)
movie_ratings=pd.read_csv(location3)
movie_text=pd.read_csv(location4)
movie_title.shape

(980, 4)

# Data preprocessing

In [4]:
#movie_dataframe=pd.DataFrame(movie_file)
#credit_dataframe=pd.DataFrame(credit_file)

In [5]:
movie_dataset=movie_title.merge(movie_ratings,on='imdb_id')
movie_dataset=movie_dataset.merge(movie_meta,on='imdb_id')
movie_dataset=movie_dataset.merge(movie_text,on='imdb_id')
movie_dataset.to_csv('D:\\Madhvik 05\\Industrial training files\\save_bollywood_movie_file.csv')
movie_dataset.shape

(934, 18)

In [6]:
movie_dataset.drop_duplicates(keep='first',inplace=True)
movie_dataset.drop(['title_y','original_title'],axis=1,inplace=True)
movie_dataset.rename(columns={'title_x':'title'},inplace=True)
movie_dataset.columns


Index(['title', 'imdb_id', 'poster_path', 'wiki_link', 'imdb_rating',
       'imdb_votes', 'is_adult', 'year_of_release', 'runtime', 'genres',
       'story', 'summary', 'tagline', 'actors', 'wins_nominations',
       'release_date'],
      dtype='object')

In [7]:
mean_votes= movie_dataset['imdb_votes'].mean()
print("mean votes=",mean_votes)
min_votes= movie_dataset['imdb_votes'].quantile(0.9)
#Selecting the movies that are having votes greater than 90th percentile
q_movies = movie_dataset.loc[movie_dataset['imdb_votes'] >= min_votes]
print("min votes= ",min_votes)
q_movies.shape



mean votes= 5959.035437430786
min votes=  16604.600000000006


(91, 16)

# Defining IMDB weighted average rating function

In [8]:
def weighted_rating(x, m=min_votes, C=mean_votes):
    v = x['imdb_votes']
    R = x['imdb_rating']
    return (v/(v+m) * R) + (m/(m+v) * C)
    
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
 
 
#Sorting movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Printing the top 15 movies
#q_movies[['original_title', 'vote_count', 'vote_average', 'score']].reset_index(drop=True).head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
q_movies.to_csv('D:\\Madhvik 05\\Industrial training files\\save_bollywood_movie_file_with_score.csv')

In [10]:
q_movies[['title', 'imdb_votes', 'imdb_rating', 'score']].reset_index(drop=True).head(15)

Unnamed: 0,title,imdb_votes,imdb_rating,score
0,Manjhi – The Mountain Man,16622.0,8.0,2981.959509
1,Goliyon Ki Raasleela Ram-Leela,17246.0,6.4,2926.322553
2,Jab Harry Met Sejal,17317.0,5.3,2919.649425
3,Sonu Ke Titu Ki Sweety,17378.0,7.1,2915.338545
4,Ugly (film),17483.0,8.1,2906.893185
5,Secret Superstar,17698.0,7.9,2888.621096
6,Parmanu: The Story of Pokhran,18292.0,7.7,2839.481446
7,Hindi Medium,18315.0,7.8,2837.668725
8,Pyaar Ka Punchnama,18369.0,7.7,2833.246824
9,Ae Dil Hai Mushkil,18998.0,5.8,2782.31332


# Content-Based Filtering (on story)

In [27]:
#Importing TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Defining a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replacing NaN with an empty string
movie_dataset['story'] =movie_dataset['story'].fillna('')

#Constructing the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movie_dataset['story'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(916, 10490)

In [28]:
# Importing linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Computing the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [29]:
#Constructing a reverse map of indices and movie titles
indices = pd.Series(movie_dataset.index, index=movie_dataset['title']).drop_duplicates()

pd.DataFrame(indices).head(13)

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Uri: The Surgical Strike,0
Battalion 609,1
The Accidental Prime Minister (film),2
Why Cheat India,3
Evening Shadows,4
Soni (film),5
Fraud Saiyaan,6
Bombairiya,7
Manikarnika: The Queen of Jhansi,8
Thackeray (film),9


In [30]:
def get_recomendation(title,cosine_sim =cosine_sim ):
    #Getting the index of the movie given its title
    idx=indices[title]
    #Getting the list of cosine similarity scores
    sim_scores=list(enumerate(cosine_sim[idx]))
    #sorting based on sim_score
    sim_scores=sorted(sim_scores,key=lambda x:x[1],reverse=True)
    # Getting the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    #Getting the  movies indices 
    sim_scores=[i[0] for i in sim_scores]
     
    return movie_dataset['title'].iloc[sim_scores]

In [44]:
user_title=input('Enter your favourite movie: ')
get_recomendation(user_title)

Enter your favourite movie: Sachin: A Billion Dreams


704              Ferrari Ki Sawaari
332    M.S. Dhoni: The Untold Story
21                         22 Yards
163                   Coffee with D
736                OMG – Oh My God!
577             Mumbai Delhi Mumbai
611           Bombay Talkies (film)
1                     Battalion 609
587                     Akaash Vani
204                     Shab (film)
Name: title, dtype: object

# Content-Based Filtering (actors)

In [19]:
#Importing TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Defining a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf2 = TfidfVectorizer(stop_words='english')

#Replacing NaN with an empty string
movie_dataset['actors'] =movie_dataset['actors'].fillna('')

#Constructing the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix2 = tfidf.fit_transform(movie_dataset['actors'])

#Output the shape of tfidf_matrix
tfidf_matrix2.shape

(916, 4302)

In [20]:
# Importing linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Computing the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix2, tfidf_matrix2)

In [21]:
#Constructing a reverse map of indices and movie titles
indices = pd.Series(movie_dataset.index, index=movie_dataset['title']).drop_duplicates()

pd.DataFrame(indices).head(13)

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Uri: The Surgical Strike,0
Battalion 609,1
The Accidental Prime Minister (film),2
Why Cheat India,3
Evening Shadows,4
Soni (film),5
Fraud Saiyaan,6
Bombairiya,7
Manikarnika: The Queen of Jhansi,8
Thackeray (film),9


In [22]:
def get_recomendation(title,cosine_sim =cosine_sim ):
    #Getting the index of the movie given its title
    idx=indices[title]
    #Getting the list of cosine similarity scores
    sim_scores=list(enumerate(cosine_sim[idx]))
    #sorting based on sim_score
    sim_scores=sorted(sim_scores,key=lambda x:x[1],reverse=True)
    # Getting the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    #Getting the  movies indices 
    sim_scores=[i[0] for i in sim_scores]
     
    return movie_dataset[['title','actors']].iloc[sim_scores]

In [24]:
user_title=input('Enter your favourite movie: ')
i=indices[user_title]
print('Actors of ',user_title)
print(movie_dataset.loc[i]['actors'])
print('Recommended movies: ')
get_recomendation(user_title)


Enter your favourite movie: Batti Gul Meter Chalu
Actors of  Batti Gul Meter Chalu
Shahid Kapoor|Shraddha Kapoor|Divyendu Sharma|Yami Gautam|Sushmita Mukherjee|Samir Soni|Ashrut Jain|
Recommended movies: 


Unnamed: 0,title,actors
694,Vicky Donor,Ayushmann Khurrana|Yami Gautam|Annu Kapoor|Dol...
119,Mulk (film),Taapsee Pannu|Rishi Kapoor|Prateik|Ashutosh Ra...
605,Aashiqui 2,Aditya Roy Kapoor|Shraddha Kapoor|Shaad Randhawa|
705,Teri Meri Kahaani (film),Shahid Kapoor|Priyanka Chopra|Prachi Desai|Neh...
229,Haseena Parkar,Shraddha Kapoor|Ankur Bhatia|Archana Gautam|Si...
668,Chashme Baddoor (2013 film),Ali Zafar|Siddharth|Divyendu Sharma|Taapsee Pa...
819,Mausam (2011 film),Shahid Kapoor|Sonam Kapoor|Anupam Kher|Supriya...
885,Milenge Milenge,Kareena Kapoor|Shahid Kapoor|Aarti Chhabria|De...
328,Days of Tafree,Yash Soni|
575,Action Jackson (2014 film),Prabhas|Shahid Kapoor|Ajay Devgn|Sonakshi Sinh...
