# Import

In [2]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Data preprocessing

In [3]:
movies1 = pd.read_csv("Movies.csv")
credits = pd.read_csv("credits.csv")

movies1.head(1)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


## Merge Dataframe 

In [4]:
movies = movies1.merge(credits,on="title")
pd.set_option("display.max_columns", None)
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Create a new dataframe including only useful column

In [5]:
Movies = movies[['movie_id','title','overview','genres','keywords','cast',"crew"]]

In [6]:
Movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Remove Null Value

In [7]:
Movies.isnull().sum()
Movies.dropna(inplace=True)


In [8]:
Movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

## Create a help function for conver weird form of keyword, tagline and other to normal string

In [9]:
import ast 

def convert (text):
    l = []
    for i in ast.literal_eval(text):
        l.append(i['name'])

    return l

In [10]:

Movies["keywords"] = Movies["keywords"].apply(convert)
Movies["genres"] = Movies["genres"].apply(convert)



In [11]:
def for_cast(text):
    l = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 7:
            l.append(i["name"])
            counter +=1

    return l


In [12]:
Movies["cast"] = Movies["cast"].apply(for_cast)

In [13]:
def for_crew(text):
    l = []
    for i in ast.literal_eval(text):
        if i["job"] == "Director" or i["job"] == "Writer":
            l.append(i["name"])

    return l


In [14]:
Movies["crew"] = Movies["crew"].apply(for_crew)

In [15]:
Movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[James Cameron, James Cameron]"


## Remove Space from [genres,keywords,cast,crew]

In [16]:
def space(text):
    list = []
    for i in text:
        list.append(i.replace(" ",""))
    
    return list


In [17]:
colm = ['genres','keywords','cast','crew']

for col in colm:
    Movies[col] = Movies[col].apply(space)

In [18]:
Movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...","[JamesCameron, JamesCameron]"


## Convert List to string

In [19]:
for col in ['genres','keywords','cast','crew']:
    Movies[col] = Movies[col].apply(lambda x: " ".join(x))


## Merge all column and create one Tag 

In [20]:
Movies["tags"] = Movies["overview"]+Movies["genres"]+Movies["keywords"]+Movies["cast"]+Movies["crew"]

In [21]:
Movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy ScienceFiction,cultureclash future spacewar spacecolony socie...,SamWorthington ZoeSaldana SigourneyWeaver Step...,JamesCameron JamesCameron,"In the 22nd century, a paraplegic Marine is di..."


## Final Ready Dataset

In [22]:
df = Movies[['movie_id','title','tags']]

In [23]:
df.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."


# Vectorized

## import

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

## Apply stemmer

In [25]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [26]:
df["tags"] = df["tags"].apply(stem)

## Create vector object

In [27]:
vector = cv.fit_transform(df["tags"]).toarray()

In [28]:
np.set_printoptions(threshold=np.inf)
cv.get_feature_names_out()

array(['000', '007', '10', '100', '11', '12', '13', '14', '15', '16',
       '17', '17th', '18', '18th', '19', '1920', '1930', '1940', '1944',
       '1950', '1950s', '1960', '1960s', '1970', '1970s', '1971', '1974',
       '1976', '1980', '1985', '1990', '1999', '19th', '19thcenturi',
       '20', '200', '2003', '2009', '20th', '21st', '23', '24', '25',
       '30', '300', '3d', '40', '50', '500', '60', '70', '80', 'aaron',
       'aaroneckhart', 'aaronseltz', 'abandon', 'abbiecornish', 'abduct',
       'abigailbreslin', 'abil', 'abl', 'aboard', 'abov', 'abram', 'abus',
       'academ', 'academi', 'accept', 'access', 'accid', 'accident',
       'acclaim', 'accompani', 'accomplish', 'account', 'accus', 'ace',
       'achiev', 'acquaint', 'act', 'action', 'activ', 'activist',
       'activities', 'actor', 'actress', 'actual', 'adam', 'adambrodi',
       'adamscott', 'adapt', 'add', 'addict', 'adjust', 'admir', 'admit',
       'adolesc', 'adopt', 'ador', 'adrienbrodi', 'adult', 'adulteri

## Calculate csosine_similarity

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vector)



## Sort and assign id

In [30]:
sort = sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:11]


## Main function

In [31]:
def recommend(movie):
    movie_index = df[df["title"]==movie].index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(similarity[movie_index])),reverse=True,key=lambda x:x[1])[1:11]

    for i in movie_list:
        print(df.iloc[i[0]].title)

In [32]:
recommend("Avatar")

Falcon Rising
Aliens vs Predator: Requiem
Predators
Battle: Los Angeles
Independence Day
The Twilight Saga: Breaking Dawn - Part 2
Meet Dave
Lifeforce
Attack the Block
Edge of Tomorrow


# For UI

In [None]:
import joblib

joblib.dump(df,"model.pkl")

['model.pkl']