In [4]:
import pandas as pd
import numpy as np
df=pd.read_csv('final_features.csv')


In [2]:
df.head()

Unnamed: 0,id,title,combined_features
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [6]:
import nltk
import sklearn
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
df.iloc[0]['combined_features']

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver StephenLang JamesCameron'

In [12]:
#this code block will convert each word to its root word
def stemmer(text):
    ps=PorterStemmer()
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [13]:
df["combined_features"]=df["combined_features"].apply(stemmer)

In [14]:
df.iloc[0]['combined_features']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav stephenlang jamescameron'

In [19]:
#Now let's implemeent count_vectorizer to convert text to embeddings
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')
vectors=cv.fit_transform(df['combined_features']).toarray()
print(vectors.shape,type(vectors))

(4806, 5000) <class 'numpy.ndarray'>


In [21]:
#Now let's perform cosine similarity on these vectors to see how similar they are to each other
from sklearn.metrics.pairwise import cosine_similarity
similar_scores=cosine_similarity(vectors)
print(type(similar_scores.shape))

<class 'tuple'>


In [22]:
print(similar_scores[0])

[1.         0.08574929 0.08838835 ... 0.04622502 0.         0.        ]


In [25]:
#now let's create a function to recommend movies based on the similiarity scores
df[df["title"]=="John Carter"].index[0] #this will give the index of the movie to serve as test case

np.int64(4)

In [36]:
def recommend(movie_name):
    #if you can't find the movies you will get an error so let's handle it
    if not movie_name in df["title"].values:
        return "Movie not found. Please check the movie name."
    movie_index=df[df["title"]==movie_name].index[0]
    distances=similar_scores[movie_index]
    output=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    title_list=[]
    for i in output:
        title_list.append(df.iloc[i[0]]["title"])
    return title_list

In [38]:
recommend('Avatar')

['Aliens vs Predator: Requiem',
 'Independence Day',
 'Falcon Rising',
 'Titan A.E.',
 'Battle: Los Angeles']

In [39]:
#NOW we are done with notebook experiment let's now write modular coding
import pickle
with open("similarity.pkl", "wb") as f:
    pickle.dump(similar_scores, f)

In [46]:
import pymongo
import os
from dotenv import load_dotenv
load_dotenv()
DB_NAME=os.getenv("DB_NAME")
COLLECTION_NAME1=os.getenv("COLLECTION_NAME1")
COLLECTION_NAME2=os.getenv("COLLECTION_NAME2")
MONGO_URI=os.getenv("MONGODB_STRING")
print(DB_NAME,COLLECTION_NAME1,COLLECTION_NAME2,MONGO_URI)

Recommendation_system movies_details credits_details mongodb+srv://Laiwola:Laiwola08@cluster0.5kpvy7d.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0


In [47]:
movies_to_dict=pd.read_csv("tmdb_5000_movies.csv").to_dict(orient="records")
credits_to_dict=pd.read_csv("tmdb_5000_credits.csv").to_dict(orient="records")

In [48]:
def Connect_mongo(MONGO_URI,DB_NAME,COLLECTION_NAME1,COLLECTION_NAME2,movies_to_dict,credits_to_dict):
    client=pymongo.MongoClient(MONGO_URI)
    db=client[DB_NAME]
    collection1=db[COLLECTION_NAME1]
    collection2=db[COLLECTION_NAME2]
    collection1.insert_many(movies_to_dict)
    collection2.insert_many(credits_to_dict)
    return collection1,collection2

In [49]:
Connect_mongo(MONGO_URI,DB_NAME,COLLECTION_NAME1,COLLECTION_NAME2,movies_to_dict,credits_to_dict)

(Collection(Database(MongoClient(host=['ac-dfv5ivj-shard-00-00.5kpvy7d.mongodb.net:27017', 'ac-dfv5ivj-shard-00-01.5kpvy7d.mongodb.net:27017', 'ac-dfv5ivj-shard-00-02.5kpvy7d.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='Cluster0', authsource='admin', replicaset='atlas-p7z59a-shard-0', tls=True), 'Recommendation_system'), 'movies_details'),
 Collection(Database(MongoClient(host=['ac-dfv5ivj-shard-00-00.5kpvy7d.mongodb.net:27017', 'ac-dfv5ivj-shard-00-01.5kpvy7d.mongodb.net:27017', 'ac-dfv5ivj-shard-00-02.5kpvy7d.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='Cluster0', authsource='admin', replicaset='atlas-p7z59a-shard-0', tls=True), 'Recommendation_system'), 'credits_details'))