**1) Data Ingestion and** **Preprocessing**

In [2]:
import pandas as pd

movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")
links = pd.read_csv("links.csv")

print("Movies Shape:", movies.shape)
print("Ratings Shape:", ratings.shape)
print("Tags Shape:", tags.shape)
print("Links Shape:", links.shape)

print("\nMovies Sample:\n", movies.head())
print("\nRatings Sample:\n", ratings.head())
print("\nTags Sample:\n", tags.head())
print("\nLinks Sample:\n", links.head())


Movies Shape: (9742, 3)
Ratings Shape: (100836, 4)
Tags Shape: (3683, 4)
Links Shape: (9742, 3)

Movies Sample:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Sample:
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Tags Sample:
    userId  movieId             

In [3]:

print("\nMissing Values:\n")
print("Movies:\n", movies.isnull().sum())
print("Ratings:\n", ratings.isnull().sum())
print("Tags:\n", tags.isnull().sum())
print("Links:\n", links.isnull().sum())



Missing Values:

Movies:
 movieId    0
title      0
genres     0
dtype: int64
Ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Tags:
 userId       0
movieId      0
tag          0
timestamp    0
dtype: int64
Links:
 movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [4]:

movies['genres'] = movies['genres'].fillna('(no genres listed)')

tags.dropna(inplace=True)

movies['movieId'] = movies['movieId'].astype(int)
ratings['movieId'] = ratings['movieId'].astype(int)
tags['movieId'] = tags['movieId'].astype(int)
links['movieId'] = links['movieId'].astype(int)

movies.drop_duplicates(inplace=True)
ratings.drop_duplicates(inplace=True)
tags.drop_duplicates(inplace=True)
links.drop_duplicates(inplace=True)


In [5]:

ratings_movies = pd.merge(ratings, movies, on='movieId', how='left')

ratings_movies_tags = pd.merge(ratings_movies, tags[['userId', 'movieId', 'tag']], on=['userId', 'movieId'], how='left')


ratings_movies_full = pd.merge(ratings_movies_tags, links, on='movieId', how='left')


In [6]:

print(ratings_movies_full.isnull().sum())

print(ratings_movies_full.head())


userId           0
movieId          0
rating           0
timestamp        0
title            0
genres           0
tag          99201
imdbId           0
tmdbId          13
dtype: int64
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  tag  imdbId   tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  NaN  114709    862.0  
1                               Comedy|Romance  NaN  113228  15602.0  
2                        Action|Crime|Thriller  NaN  113277    949.0  
3                             Mystery|Thriller  NaN  114369    807.0  
4                       Crime|Mystery|Th

In [11]:

cleaned_data = ratings_movies_full.drop(columns=['timestamp', 'tag', 'imdbId', 'tmdbId'])

cleaned_data.to_csv("cleaned_movies_data.csv", index=False)

print(cleaned_data.head())


   userId  movieId  rating                        title  \
0       1        1     4.0             Toy Story (1995)   
1       1        3     4.0      Grumpier Old Men (1995)   
2       1        6     4.0                  Heat (1995)   
3       1       47     5.0  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [12]:
import pandas as pd

ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")
tags = pd.read_csv("tags.csv")
links = pd.read_csv("links.csv")


merged_df = ratings.merge(movies, on='movieId', how='left')
merged_df = merged_df.merge(tags[['userId', 'movieId', 'tag']], on=['userId', 'movieId'], how='left')
merged_df = merged_df.merge(links, on='movieId', how='left')

cleaned_data = merged_df[['userId', 'movieId', 'rating', 'title', 'genres']].copy()

cleaned_data.dropna(subset=['userId', 'movieId', 'rating', 'title', 'genres'], inplace=True)

cleaned_data.head()


Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


**2) Content-Based Filtering** **Module**

In [15]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

# تجهيز البيانات
movies_df = cleaned_data.copy()
movies_df['genres'] = movies_df['genres'].str.replace('|', ' ', regex=False)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# نستخدم NearestNeighbors مع مقياس cosine
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

def recommend_movies(title, df=movies_df, tfidf_matrix=tfidf_matrix, nn=nn, top_n=10):
    if title not in df['title'].values:
        return f"❌ '{title}' not found in the dataset."

    idx = df.index[df['title'] == title][0]
    distances, indices = nn.kneighbors(tfidf_matrix[idx], n_neighbors=top_n+1)
    
    rec_indices = indices.flatten()[1:]  # استبعد الأول (هو نفس الفيلم)
    return df.iloc[rec_indices][['title', 'genres']]

print("🔍 Movie recommendations similar to 'Heat (1995)':\n")
print(recommend_movies("Heat (1995)"))


🔍 Movie recommendations similar to 'Heat (1995)':

                                    title                 genres
25958        Bourne Ultimatum, The (2007)  Action Crime Thriller
31997                       Batman (1989)  Action Crime Thriller
25979                Bank Job, The (2008)  Action Crime Thriller
70888               Equalizer, The (2014)  Action Crime Thriller
102594                   Furious 7 (2015)  Action Crime Thriller
19172   Die Hard: With a Vengeance (1995)  Action Crime Thriller
48978                  Dirty Harry (1971)  Action Crime Thriller
44965   Die Hard: With a Vengeance (1995)  Action Crime Thriller
50875               Dead Pool, The (1988)  Action Crime Thriller
36018   Die Hard: With a Vengeance (1995)  Action Crime Thriller
