In [2]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [3]:
# Read the movie table from a relative path (resolved against the kernel's
# current working directory).
df = pd.read_csv(filepath_or_buffer="dataset.csv")
# Preview the first five rows (five is head()'s default) to check the parse.
df.head(n=5)


Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [5]:
# Blank out missing text first: a NaN on either side of the concatenation
# below would leave that row's combined text as NaN.
df[['overview', 'genre']] = df[['overview', 'genre']].fillna('')

# One text field per movie for content-based filtering: the overview,
# a single space, then the genre string.
df['combined_features'] = df['overview'].add(' ').add(df['genre'])
df[['combined_features']].head()


Unnamed: 0,combined_features
0,Framed in the 1940s for the double murder of h...
1,"Raj is a rich, carefree, happy-go-lucky second..."
2,"Spanning the years 1945 to 1955, a chronicle o..."
3,The true story of how businessman Oskar Schind...
4,In the continuing saga of the Corleone crime f...


In [6]:
# Learn the vocabulary and IDF weights from the combined text and encode
# every row as a TF-IDF vector in one pass; English stop words are dropped.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(raw_documents=df['combined_features'])

# Shape is (number of documents, number of vocabulary terms).
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (10000, 27965)


In [7]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# dense_output=True is the library default, spelled out here because the
# result is a full n x n dense array rather than a sparse matrix.
content_similarity = cosine_similarity(tfidf_matrix, dense_output=True)
print("Cosine similarity shape:", content_similarity.shape)


Cosine similarity shape: (10000, 10000)


In [8]:
# Persist the similarity matrix so it can be reloaded without recomputing it.
with open("content_similarity.pkl", mode="wb") as pickle_file:
    pickle.dump(content_similarity, pickle_file)

# Reached only if the dump above did not raise.
print("✅ content_similarity.pkl saved successfully!")


✅ content_similarity.pkl saved successfully!


In [9]:
# Round-trip check: read the pickle back and print the shape of what was loaded.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("content_similarity.pkl", mode="rb") as pickle_file:
    test_sim = pickle.load(pickle_file)

print(test_sim.shape)


(10000, 10000)
