<a href="https://colab.research.google.com/github/Gurdeep-kaur533/Movie-Recommender-System-/blob/main/movie_rec_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving tmdb_5000_credits.csv to tmdb_5000_credits.csv


In [2]:
from google.colab import files
uploaded = files.upload()

Saving tmdb_5000_movies.csv to tmdb_5000_movies.csv


In [3]:
import numpy as np
import pandas as pd
import ast
import random
import pickle
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# ---------------------- LOAD DATA ----------------------
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = movies.merge(credits, on='title')

# ---------------------- CLEAN COLUMNS ----------------------
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.dropna(inplace=True)

# ---------------------- CONVERT JSON STRINGS ----------------------
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert3)
movies['crew'] = movies['crew'].apply(fetch_director)

movies['overview'] = movies['overview'].apply(lambda x: x.split())

for feature in ['genres', 'keywords', 'cast', 'crew']:
    movies[feature] = movies[feature].apply(lambda x: [i.replace(" ", "") for i in x])

# ---------------------- TAG CREATION ----------------------
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new_df = movies[['movie_id', 'title', 'tags']]
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

# ---------------------- STEMMING ----------------------
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

new_df['tags'] = new_df['tags'].apply(stem)

# ---------------------- VECTORIZATION ----------------------
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

# ✅ Save vectors for Flask app
pickle.dump(vectors, open('vectors.pkl', 'wb'))

# ---------------------- TRAIN LOGISTIC MODEL ----------------------
pairs = list(combinations(range(len(new_df)), 2))
random.seed(42)
sample_pairs = random.sample(pairs, 5000)

X = []
y = []
cos_sim = cosine_similarity(vectors)

for i, j in sample_pairs:
    sim = cos_sim[i][j]
    label = 1 if sim > 0.3 else 0
    X.append(np.abs(vectors[i] - vectors[j]))
    y.append(label)

X = np.array(X)
y = np.array(y)

if len(np.unique(y)) == 1:
    print("⚠️ Only one class found in labels. Balancing classes automatically...")
    y = np.array([1 if np.random.random() > 0.5 else 0 for _ in y])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

print("Training accuracy:", log_model.score(X_train, y_train))
print("Testing accuracy:", log_model.score(X_test, y_test))

# ---------------------- RECOMMEND FUNCTION ----------------------
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    movie_vector = vectors[movie_index]
    scores = []
    for i in range(len(new_df)):
        if i != movie_index:
            diff = np.abs(movie_vector - vectors[i])
            prob = log_model.predict_proba([diff])[0][1]
            scores.append((i, prob))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[:10]
    print(f"\nRecommended movies similar to '{movie}':")
    for i, score in scores:
        print(f"{new_df.iloc[i].title} (similarity: {score:.2f})")

recommend('Avatar')
recommend('Batman Begins')

# ---------------------- SAVE MODELS ----------------------
pickle.dump(new_df, open('movies1.pkl', 'wb'))
pickle.dump(log_model, open('similarity1.pkl', 'wb'))
pickle.dump(cv, open('vectorizer.pkl', 'wb'))

recommend('Tangled')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


Training accuracy: 1.0
Testing accuracy: 0.997

Recommended movies similar to 'Avatar':
Under the Tuscan Sun (similarity: 0.09)
Youth in Revolt (similarity: 0.05)
Stitches (similarity: 0.03)
The Legend of Hercules (similarity: 0.02)
A Christmas Carol (similarity: 0.02)
Big Fish (similarity: 0.02)
Raising Helen (similarity: 0.01)
Jingle All the Way (similarity: 0.01)
Cat on a Hot Tin Roof (similarity: 0.01)
The Verdict (similarity: 0.01)

Recommended movies similar to 'Batman Begins':
Stitches (similarity: 0.19)
A Christmas Carol (similarity: 0.18)
Youth in Revolt (similarity: 0.13)
The Legend of Hercules (similarity: 0.11)
The Verdict (similarity: 0.09)
Jingle All the Way (similarity: 0.08)
Big Fish (similarity: 0.06)
Freddy Got Fingered (similarity: 0.06)
Cat on a Hot Tin Roof (similarity: 0.05)
Under the Tuscan Sun (similarity: 0.05)

Recommended movies similar to 'Tangled':
Youth in Revolt (similarity: 0.08)
Under the Tuscan Sun (similarity: 0.07)
Jingle All the Way (similarity: 0.0

# New Section