In [None]:
# importing needed libraries.
import pandas as pd
import sklearn  as sk
import numpy as np

In [13]:
#Loading data to datafram 'df' for data preprocessing.
data = pd.read_csv('/movie_recommendation/archive/movies.csv')

In [14]:
# Data Preprocessing
# Data Analysis

#checks the shape of Data
print("Data shape:", data.shape)

#Give column information like columns of data and respective data's dtype
print(data.info())

Data shape: (722938, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722938 entries, 0 to 722937
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    722938 non-null  int64  
 1   title                 722934 non-null  object 
 2   genres                511966 non-null  object 
 3   original_language     722938 non-null  object 
 4   overview              604283 non-null  object 
 5   popularity            722938 non-null  float64
 6   production_companies  337164 non-null  object 
 7   release_date          670278 non-null  object 
 8   budget                722938 non-null  float64
 9   revenue               722938 non-null  float64
 10  runtime               688450 non-null  float64
 11  status                722938 non-null  object 
 12  tagline               108065 non-null  object 
 13  vote_average          722938 non-null  float64
 14  vote_count            72293

In [15]:
# check null values in dataset
print(data.isnull().sum())
# checking duplicate values in data
print(data.duplicated().sum())

id                           0
title                        4
genres                  210972
original_language            0
overview                118655
popularity                   0
production_companies    385774
release_date             52660
budget                       0
revenue                      0
runtime                  34488
status                       0
tagline                 614873
vote_average                 0
vote_count                   0
credits                 225161
keywords                512699
poster_path             185385
backdrop_path           500512
recommendations         688242
dtype: int64
0


In [16]:
# Data Cleaning. As it has many missing values

# droppin unnecessary title
df = data.drop(["production_companies", "popularity", "budget", "revenue", "status", "recommendations", "runtime", "vote_average", "backdrop_path", "tagline"], axis=1)

In [17]:
#dropping duplicate values
df.drop_duplicates(inplace=True)

#checking duplicates in title
print("Duplicates in title:", df.title.duplicated().sum())
#check if duplicate titles have same release date
print("Duplicate title with same releasing date:", df[['title', 'release_date']].duplicated().sum())

#Now, removing duplicate title with same relase date
df.drop_duplicates(subset=['title', 'release_date'], inplace=True)

#Removes vote_count lower than 350
df = df[df.vote_count <= 350].reset_index()
df.isnull().sum()


Duplicates in title: 86982
Duplicate title with same releasing date: 2267


index                     0
id                        0
title                     2
genres               198452
original_language         0
overview             110858
release_date          49656
vote_count                0
credits              214140
keywords             470608
poster_path          175925
dtype: int64

In [18]:
# Replacing all null values from genre and overview with "Nothing"
df.fillna("", inplace=True)

# Deleting movies with no genre and overview
index = df[(df.genres == "") & (df.overview == "")].index
df.drop(index, inplace=True)

In [19]:
# Replacing genres, credit and keywords "-" with " "
df.genres = df.genres.apply(lambda x: " ".join(x.split("-")))
df.genres = df.credits.apply(lambda x: " ".join(x.split("-")))
df.genres = df.keywords.apply(lambda x: " ".join(x.replace(" ", "").split("-")[:5]))

In [20]:
# Making tags for prediction
df['tags'] = df.overview + " "+ df.genres + " "+ df.credits + " "+ df.keywords + " "+df.original_language

In [21]:
# Making new framework with important features
new_df = df[["id", "title", "tags", "poster_path"]]

# making all tags in lower case for better processing
new_df.tags = new_df.tags.apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.tags = new_df.tags.apply(lambda x: x.lower())


In [22]:
new_df.tags[0]

'while working underground to fix a water main brooklyn plumbers—and brothers—mario and luigi are transported down a mysterious pipe and wander into a magical new world. but when the brothers are separated mario embarks on an epic quest to find luigi. videogame plumber magicmushroom basedonvideogame aftercreditsstinger chris pratt-anya taylor-joy-charlie day-jack black-keegan-michael key-seth rogen-fred armisen-kevin michael richardson-sebastian maniscalco-khary payton-eric bauza-jessica dicicco-jeannie elias-juliet jelenic-rino romano-john dimaggio-scott menville-charles martinet-jason broad-carlos alazraqui-ashly burch-rachel butera-cathy cavadini-will collyer-django craig-willow geer-aaron hendry-andy hirsch-barbara lley-phil lamarr-jeremy maxwell-daniel mora-eric osmond-noreen reardon-lee shorten-cree summer-nisa ward-nora wyman video game-plumber-magic mushroom-based on video game-aftercreditsstinger-duringcreditsstinger-damsel in distress-brother brother relationship en'

In [23]:
# Vectorizer
# Stemming

import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [24]:
# Stem function to take text and give output

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [25]:
# applying stem funcion in tags
new_df["tags"] = new_df["tags"].apply(stem)

KeyboardInterrupt: 

In [None]:
# text vectorization
# importing count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
# setting for 5000 most repeated words, exclude stop words.
cv = CountVectorizer(stop_words="english", max_features=5000)

In [None]:
# fiting tags in count vector
vectors = cv.fit_transform(new_df["tags"]).toarray() # changes it intto array to use

In [None]:
cv.get_feature_names()[80:85]

# model building

In [None]:
#similartiy vector with cosine
from sklearn.metrics.pairwise import cosine_similarity
# calculation similarity of each movie with all movies
similarity = cosine_similarity(vector)

In [30]:
#similarity of each movie with all movies
similarity.shape

NameError: name 'similarity' is not defined

In [None]:
#model testing
#making funtion to find movie and give similar movies as return
def recommend(movies):
    movie_index = new_df[new_df.title == movies].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key = lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [None]:
#checking similar movies test 1
recommend("Batman")

In [None]:
#test 2
recommend("Black Adam")

In [None]:
# import pickle
import pickle

In [None]:
# making portable movie pickle file to trasport
pickle.dump(new_df, open("movies.pkl", "wb"))

In [None]:
#making portable movie pickle file to transport
pickle.dump(similarity, open("similarity.pkl", "wb"))

In [35]:

# first import streamlit and pickle 
import streamlit as st
import pickle

# extract the new_df dataframe from movies.pkl
movies_list = pickle.load(open("movies.pkl", "rb"))
# extract the titles of movies
movies_list_title = movies_list["title"].values

# extract the similarity which contain our cosine similarity values
similarity = pickle.load(open("similarity.pkl", "rb"))


# make a recommend function which will take movie title and return 5 similar movies with their posters
def recommend(movie):
    movie_index = movies_list[movies_list["title"] == movie].index[0]
    distances = similarity[movie_index]
    sorted_movie_list = sorted(list(enumerate(distances)), reverse=True,
                               key=lambda x:x[1])[1:6]

    recommended_movies = []
    recommended_posters = []
    for i in sorted_movie_list:
        poster_path = movies_list["poster_path"][i[0]]
        recommended_movies.append(movies_list.iloc[i[0]].title)
        recommended_posters.append("https://image.tmdb.org/t/p/original"+poster_path)

    return recommended_movies,  recommended_posters



# Create title for your stream lit page
st.title("Project Movie Recommender System")

# Create a input box for movies name 
selected_movie_name = st.selectbox(
    "What is the movie name?",
    movies_list_title
)

# create a recommend button with function of displaying recommended movies and movie posters
if st.button("Recommend"):
    recommendation, movie_posters = recommend(selected_movie_name)

    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        st.write(recommendation[0])
        st.image(movie_posters[0])
    with col2:
        st.write(recommendation[1])
        st.image(movie_posters[1])
    with col3:
        st.write(recommendation[2])
        st.image(movie_posters[2])
    with col4:
        st.write(recommendation[3])
        st.image(movie_posters[3])
    with col5:
        st.write(recommendation[4])
        st.image(movie_posters[4])



2023-04-25 19:35:02.502 
  command:

    streamlit run C:\Users\deven\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py [ARGUMENTS]
