In [1]:
# your code here
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
import zipfile

zip_file = "/content/drive/MyDrive/ML_633/ml-25m.zip"

# Create a directory for extraction (optional)
extract_path = "/content/files"
os.makedirs(extract_path, exist_ok=True)

# Extract ZIP file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List extracted files
print("Extracted files:", os.listdir(extract_path))


Extracted files: ['ml-25m']


In [5]:
import pandas as pd

movies = pd.read_csv('/content/files/ml-25m/movies.csv')
ratings = pd.read_csv('/content/files/ml-25m/ratings.csv')

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
#movies.isnull().sum()
import re
def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title.lower()

movies["clean_title"] = movies["title"].apply(clean_title)
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,waiting to exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii 1995


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [8]:
Movies = movies.sample(frac=0.8, random_state=42)
Ratings = ratings[ratings['movieId'].isin(Movies['movieId'])]
Ratings = Ratings.iloc[:, :-1]

print("Selected Movies:", Movies.shape)
print("Filtered Ratings:", Ratings.shape)

Selected Movies: (49938, 4)
Filtered Ratings: (19502219, 3)


In [9]:
Movies['genres'] = Movies['genres'].apply(lambda x: x.split("|"))

In [10]:
from collections import Counter
genres = Counter()
for genre_list in Movies['genres']:
    genres.update(genre_list)

print(genres)
print(len(genres))

Counter({'Drama': 20411, 'Comedy': 13514, 'Thriller': 6901, 'Romance': 6201, 'Action': 5899, 'Horror': 4743, 'Documentary': 4545, 'Crime': 4261, '(no genres listed)': 4029, 'Adventure': 3280, 'Sci-Fi': 2874, 'Animation': 2360, 'Children': 2329, 'Mystery': 2328, 'Fantasy': 2196, 'War': 1493, 'Western': 1145, 'Musical': 851, 'Film-Noir': 281, 'IMAX': 164})
20


**Content-Based filtering**
Depending on titles

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

vectorize = TfidfVectorizer(ngram_range=(1,2))
tf_idf = vectorize.fit_transform(Movies['clean_title'])

def content_filter(title):
  title = clean_title(title)
  query_vec = vectorize.transform([title])
  similarity = cosine_similarity(query_vec, tf_idf).flatten()
  indices = np.argsort(similarity)[::-1]
  input_index = Movies[Movies['title'] == title].index
  indices = [i for i in indices if i not in input_index]
  indices = indices[1:6]
  #Top 5 recommendations
  return Movies.iloc[indices]

# May also make user_ID specific

display(content_filter('Toy Story 1995'))


Unnamed: 0,movieId,title,genres,clean_title
14813,78499,Toy Story 3 (2010),"[Adventure, Animation, Children, Comedy, Fanta...",toy story 3 2010
3021,3114,Toy Story 2 (1999),"[Adventure, Animation, Children, Comedy, Fantasy]",toy story 2 1999
59767,201588,Toy Story 4 (2019),"[Adventure, Animation, Children, Comedy]",toy story 4 2019
20497,106022,Toy Story of Terror (2013),"[Animation, Children, Comedy]",toy story of terror 2013
2183,2274,Lilian's Story (1995),[Drama],lilians story 1995


**Collaborative Filtering** similar users

In [12]:
movie_Id = 6
print(Movies[Movies['movieId'] == movie_Id])

def collab_filter(movie_Id):
  similar_users = Ratings[(Ratings["movieId"] == movie_Id) & (Ratings["rating"] > 4)]["userId"].unique()
  similar_user_recs = Ratings[(Ratings["userId"].isin(similar_users)) & (Ratings["rating"] > 4)]["movieId"]
  similar_user_recs = Movies[Movies['movieId'].isin(similar_user_recs)]['title']
  return similar_user_recs

display(collab_filter(movie_Id))

   movieId        title                     genres clean_title
5        6  Heat (1995)  [Action, Crime, Thriller]   heat 1995


Unnamed: 0,title
4884,Jimmy Neutron: Boy Genius (2001)
3776,"Art of War, The (2000)"
4155,"Luzhin Defence, The (2000)"
10643,Tony Takitani (2004)
31638,Four Days in October (2010)
...,...
13879,Law Abiding Citizen (2009)
11600,Red Dust (1932)
1597,Hurricane Streets (1997)
47409,Don't Hug Me I'm Scared 5 (2015)


**MF**

In [2]:
# import pandas as pd
# import numpy as np

# data = pd.merge(Ratings, Movies, on="movieId")
# from sklearn.decomposition import TruncatedSVD

# ratings_matrix = data.pivot_table(index='userId', columns='clean_title', values='rating')
# ratings_matrix = ratings_matrix.fillna(0)

# # SVD
# svd = TruncatedSVD(n_components=50)
# user_factors = svd.fit_transform(ratings_matrix)
# movie_factors = svd.components_

# predicted_ratings = np.dot(user_factors, movie_factors)
# predicted_df = pd.DataFrame(predicted_ratings, index=ratings_matrix.index, columns=ratings_matrix.columns)

# def MF_recomm(user_id, num_recommendations=5):
#     user_predictions = predicted_df.loc[user_id]
#     already_rated = ratings_matrix.loc[user_id][ratings_matrix.loc[user_id] > 0].index
#     recommendations = user_predictions.drop(already_rated).sort_values(ascending=False).head(num_recommendations)
#     return recommendations

# user_id = 1
# recommended_movies = MF_recomm(user_id)
# print(f"Top recommended movies for User {user_id}:")
# print(recommended_movies)