# Problem Statement :
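Build a movie recommendation system using collaborative filtering and machine learning techniques in Python. Note: the approach implemented in this notebook is content-based — each movie is described by text tags (overview, genres, keywords, cast, director) and movies are compared with cosine similarity — because the datasets used here contain no user-rating data.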

# Importing necessary Libraries

In [1]:
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import ast  # For literal evaluation of strings

# Loading datasets

In [2]:
# Read both TMDB 5000 CSV exports; the paths are relative to the notebook's working directory.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [3]:
# Unseeded random preview: the five rows shown differ on every run.
movies.sample(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
3278,0,"[{""id"": 18, ""name"": ""Drama""}]",,45791,"[{""id"": 494, ""name"": ""father son relationship""...",en,And When Did You Last See Your Father?,The story of a son's conflicting memories of h...,1.939054,"[{""name"": ""Intandem Films"", ""id"": 2731}, {""nam...","[{""iso_3166_1"": ""IE"", ""name"": ""Ireland""}, {""is...",2007-10-05,92,92.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Between every father and his son there is a st...,When Did You Last See Your Father?,6.9,13
2867,0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 35, ""nam...",,19419,[],en,Screwed,A chauffeur kidnaps his rich boss's dog to hol...,5.359381,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...",[],2000-05-12,0,81.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Screwed,4.9,23
3343,7000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...",http://www.mgm.com/view/movie/1130/Live-and-Le...,253,"[{""id"": 212, ""name"": ""london england""}, {""id"":...",en,Live and Let Die,James Bond must investigate a mysterious murde...,30.465138,"[{""name"": ""United Artists"", ""id"": 60}, {""name""...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",1973-07-05,161777836,121.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Roger Moore is James Bond.,Live and Let Die,6.4,533
173,130000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 35, ""...",http://happyfeettwo.warnerbros.com/index.html,65759,"[{""id"": 3028, ""name"": ""penguin""}, {""id"": 4344,...",en,Happy Feet Two,Mumble the penguin has a problem: his son Erik...,17.7735,"[{""name"": ""Village Roadshow Pictures"", ""id"": 7...","[{""iso_3166_1"": ""AU"", ""name"": ""Australia""}]",2011-11-17,150406466,100.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Every step counts.,Happy Feet Two,5.8,373
769,60000000,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 16, ""...",http://sinbad-themovie.com/main.html,14411,"[{""id"": 3071, ""name"": ""prince""}, {""id"": 7994, ...",en,Sinbad: Legend of the Seven Seas,The sailor of legend is framed by the goddess ...,18.815442,"[{""name"": ""DreamWorks Animation"", ""id"": 521}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2003-07-02,26288320,86.0,"[{""iso_639_1"": ""cn"", ""name"": ""\u5e7f\u5dde\u8b...",Released,,Sinbad: Legend of the Seven Seas,6.6,372


In [4]:
# Unseeded random preview of the credits table; 'cast' and 'crew' display as stringified lists of dicts.
credits.sample(5)

Unnamed: 0,movie_id,title,cast,crew
1091,10858,Nixon,"[{""cast_id"": 1, ""character"": ""Richard Nixon"", ...","[{""credit_id"": ""52fe43c59251416c7501d705"", ""de..."
3641,14474,The Oh in Ohio,"[{""cast_id"": 1, ""character"": ""Priscilla Chase""...","[{""credit_id"": ""545daf74c3a368536b0051d5"", ""de..."
1697,87,Indiana Jones and the Temple of Doom,"[{""cast_id"": 4, ""character"": ""Indiana Jones"", ...","[{""credit_id"": ""52fe4215c3a36847f8002c45"", ""de..."
1786,1551,Flatliners,"[{""cast_id"": 1, ""character"": ""Nelson"", ""credit...","[{""credit_id"": ""564888e09251413e7f006ed4"", ""de..."
3146,10306,Poltergeist III,"[{""cast_id"": 1, ""character"": ""Bruce Gardner"", ...","[{""credit_id"": ""52fe43569251416c7500c78d"", ""de..."


# Merging the movies and credits datasets into a single DataFrame

In [5]:
# Join on the TMDB identifier (movies.id <-> credits.movie_id) rather than on 'title':
# titles are not guaranteed to be unique, so a title join can pair one film's metadata
# with another film's credits and emit duplicate rows.
# NOTE(review): assumes credits.movie_id carries the same identifier as movies.id — confirm.
# credits' own 'title' column is dropped first so the result keeps a single 'title' column.
movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')

# Selecting relevant columns from merged dataset

In [6]:
# Keep only the columns used to build the recommender.
# .copy() makes this an independent frame, so the later row drops and column
# assignments do not operate on a slice of the merged table (SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [7]:
# Confirm that only the seven selected columns remain.
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# Function to parse a stringified list of dicts and extract each entry's 'name' into a Python list

In [8]:
def convert(text):
    """Return the 'name' of every entry in a stringified list of dicts.

    `text` is parsed with ast.literal_eval, so it must be a Python-literal
    string such as '[{"id": 28, "name": "Action"}]'. Names are returned in
    their original order; an empty list literal yields [].
    """
    parsed_entries = ast.literal_eval(text)
    return [entry['name'] for entry in parsed_entries]

# Dropping the rows containing NaN

In [9]:
# Drop every row that has a missing value, then renumber the index 0..n-1.
# The similarity matrix built later is addressed by row position, so index labels
# must equal row positions for label-based lookups to hit the right row.
# Reassigning instead of inplace=True avoids mutating a frame that may be a slice.
movies = movies.dropna().reset_index(drop=True)

# Applying convert function to 'genres' and 'keywords' columns

In [10]:
# Replace the stringified dict lists in both columns with plain lists of names.
movies = movies.assign(
    genres=movies['genres'].apply(convert),
    keywords=movies['keywords'].apply(convert),
)

In [11]:
# genres and keywords now display as lists of names; cast and crew are still raw strings.
movies.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


# Keeping only the first 3 listed actors from the 'cast' column

In [12]:
# Parse each cast string into a list of actor names, then keep at most the first three.
movies['cast'] = movies['cast'].apply(convert).apply(lambda names: names[:3])

# Function to fetch directors from 'crew' column

In [13]:
def fetch_director(text):
    """Return the names of all crew entries whose 'job' is exactly 'Director'.

    `text` is a stringified list of crew dicts, parsed with ast.literal_eval.
    The result is a list (possibly empty, possibly holding several names) in
    the order the entries appear.
    """
    crew_entries = ast.literal_eval(text)
    return [member['name'] for member in crew_entries if member['job'] == 'Director']

# Applying fetch_director function to 'crew' column

In [14]:
# Reduce each crew string to the list of its director names.
movies = movies.assign(crew=movies['crew'].apply(fetch_director))

# Function to remove the spaces inside every string of a list, so multi-word names become single tokens

In [15]:
def collapse(L):
    """Return a new list with every space character removed from each string in L.

    Only ' ' is removed (other whitespace is kept), so 'Sam Worthington'
    becomes 'SamWorthington'. L itself is not modified.
    """
    return [item.replace(" ", "") for item in L]

# Applying collapse function to 'cast', 'crew', 'genres', and 'keywords' columns

In [16]:
# Strip the spaces out of every name in the four list-valued columns.
movies = movies.assign(
    **{
        name: movies[name].apply(collapse)
        for name in ('cast', 'crew', 'genres', 'keywords')
    }
)


In [17]:
# All four list columns should now hold space-free tokens such as 'SamWorthington'.
movies.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]


# Splitting 'overview' column into list of words

In [18]:
# Tokenise each overview on whitespace so it becomes a list like the other feature columns.
movies['overview'] = movies['overview'].apply(str.split)

# Combining relevant columns into 'tags' column

In [19]:
# Adding the list-valued columns concatenates the lists row by row,
# giving one combined token list per movie.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)


In [20]:
# 'tags' should start with the overview tokens, followed by genres, keywords, cast and crew.
movies.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."


# Dropping unnecessary columns

In [21]:
# The individual feature columns are now folded into 'tags'; keep the rest in a new frame.
new = movies.drop(
    ['overview', 'genres', 'keywords', 'cast', 'crew'],
    axis='columns',
)


In [22]:
# Only movie_id, title and tags remain; tags is still a list of tokens here.
new.head(3)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."


# Joining elements in 'tags' column into a single string

In [23]:
# Turn each token list into one space-separated string.
new['tags'] = new['tags'].apply(" ".join)

# Using CountVectorizer to convert 'tags' column into vectorized format

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the tag strings: English stop words removed,
# vocabulary limited to at most 5000 terms.
cv = CountVectorizer(stop_words='english', max_features=5000)
vector = cv.fit_transform(new['tags'])
vector = vector.toarray()  # convert the fitted count matrix to a dense NumPy array


In [25]:
# tags is now a single space-joined string per movie.
new.head(3)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...


# Calculating cosine similarity matrix

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vector`:
# entry [i, j] compares the movie in row i with the movie in row j.
similarity = cosine_similarity(X=vector)

In [27]:
# Inspect the matrix: it is symmetric, with 1.0 on the diagonal in the rows shown.
similarity

array([[1.        , 0.08964215, 0.06071767, ..., 0.02519763, 0.0277885 ,
        0.        ],
       [0.08964215, 1.        , 0.06350006, ..., 0.02635231, 0.        ,
        0.        ],
       [0.06071767, 0.06350006, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.02519763, 0.02635231, 0.02677398, ..., 1.        , 0.07352146,
        0.04774099],
       [0.0277885 , 0.        , 0.        , ..., 0.07352146, 1.        ,
        0.05264981],
       [0.        , 0.        , 0.        , ..., 0.04774099, 0.05264981,
        1.        ]])

In [28]:
# Re-display the frame whose 'title' column the recommender looks titles up in.
new.head(3)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...


# Function to recommend movies based on similarity

In [29]:
def recommend(movie, top_n=5, *, catalog=None, scores=None):
    """Print the titles of the movies most similar to `movie`, best match first.

    Parameters
    ----------
    movie : str
        Exact title to look up (case-sensitive equality on the 'title' column).
        If several rows share the title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to print (non-negative).
    catalog : pandas.DataFrame, optional
        Frame with a 'title' column whose row order matches `scores`.
        Defaults to the notebook-level `new`.
    scores : 2-D array, optional
        Square similarity matrix; row i and column i belong to catalog row i.
        Defaults to the notebook-level `similarity`.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    catalog = new if catalog is None else catalog
    scores = similarity if scores is None else scores

    # The similarity matrix is addressed by row POSITION, so resolve the title to a
    # position rather than an index label; labels stop matching positions as soon as
    # rows have been dropped without resetting the index.
    hits = np.flatnonzero((catalog['title'] == movie).to_numpy())
    if hits.size == 0:
        raise IndexError(f"movie {movie!r} not found in the 'title' column")
    position = int(hits[0])

    row = np.asarray(scores[position])
    # Stable descending sort: equally similar movies keep their original row order.
    ranked = np.argsort(-row, kind='stable')
    # Exclude the queried movie explicitly instead of assuming it sorts first
    # (an identical duplicate could otherwise push it into the results).
    ranked = ranked[ranked != position][:top_n]

    for pos in ranked:
        print(catalog['title'].iloc[pos])

# Example of using recommend function

In [30]:
# Titles are matched exactly, so the argument must equal a value in new['title'].
recommend('Gandhi')

Gandhi, My Father
The Wind That Shakes the Barley
A Passage to India
Guiana 1838
Ramanujan
