## 10. Combining Multiple Datasets in the Movie Lens Dataset
    - Task: Combine and preprocess multiple related datasets from the Movie Lens dataset, such as ratings, user information, and movie metadata.
    - Dataset: Movie Lens Dataset


## Loading and Displaying datasets

In [36]:
import pandas as pd

In [37]:

# Load datasets
ratings = pd.read_csv('/kaggle/input/movie-lens-small-latest-dataset/ratings.csv')
tags = pd.read_csv('/kaggle/input/movie-lens-small-latest-dataset/tags.csv')
movies = pd.read_csv('/kaggle/input/movie-lens-small-latest-dataset/movies.csv')
links = pd.read_csv('/kaggle/input/movie-lens-small-latest-dataset/links.csv')

# Display the first few rows of each dataset
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [38]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [39]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


## Merge the Datasets

In [41]:
# Merge ratings with movies
ratings_movies = pd.merge(ratings, movies, on='movieId', how='inner')

# Display the combined dataframe
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


## Merge the Resulting DataFrame with Tags

In [42]:
# Merge the resulting dataframe with tags
ratings_movies_tags = pd.merge(ratings_movies, tags, on=['userId', 'movieId'], how='left')

# Display the combined dataframe
ratings_movies_tags.head()


Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,,
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,,


## Merge the Resulting DataFrame with Links

In [43]:
# Merge the resulting dataframe with links
final_df = pd.merge(ratings_movies_tags, links, on='movieId', how='left')

# Display the combined dataframe
final_df.head()


Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y,imdbId,tmdbId
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,114709,862.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,,,113228,15602.0
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,,113277,949.0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,,,114369,807.0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,,,114814,629.0


## Preprocess the Combined Dataset

In [44]:
# Check for missing values
print(final_df.isnull().sum())

userId             0
movieId            0
rating             0
timestamp_x        0
title              0
genres             0
tag            99201
timestamp_y    99201
imdbId             0
tmdbId            13
dtype: int64


In [45]:
# Drop or fill missing values
final_df = final_df.dropna(subset=['tag'])  

In [46]:
# Check for missing values
print(final_df.isnull().sum())

userId         0
movieId        0
rating         0
timestamp_x    0
title          0
genres         0
tag            0
timestamp_y    0
imdbId         0
tmdbId         0
dtype: int64


## Convert Data type

In [47]:
# Convert timestamp to datetime
final_df['timestamp_x'] = pd.to_datetime(final_df['timestamp_x'], unit='s')

# Display data types
final_df.dtypes


userId                  int64
movieId                 int64
rating                float64
timestamp_x    datetime64[ns]
title                  object
genres                 object
tag                    object
timestamp_y           float64
imdbId                  int64
tmdbId                float64
dtype: object

## Encode Categorical Variables


In [48]:
# One-hot encode genres
genres_dummies = final_df['genres'].str.get_dummies(sep='|')
final_df = pd.concat([final_df, genres_dummies], axis=1)


In [49]:
# Drop the original genres column
final_df = final_df.drop('genres', axis=1)

##  Feature Engineering

In [50]:
# Example: Average rating per user
final_df['user_avg_rating'] = final_df.groupby('userId')['rating'].transform('mean')

# Example: Average rating per movie
final_df['movie_avg_rating'] = final_df.groupby('movieId')['rating'].transform('mean')


## Summary and Save the Combined Dataset

In [51]:
# Save the combined dataset
final_df.to_csv('final_movie_lens_preprocessed.csv', index=False)

# Display the first few rows of the final dataframe
final_df.head()


Unnamed: 0,userId,movieId,rating,timestamp_x,title,tag,timestamp_y,imdbId,tmdbId,(no genres listed),...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,user_avg_rating,movie_avg_rating
241,2,60756,5.0,2015-10-24 19:29:40,Step Brothers (2008),funny,1445715000.0,838283,12133.0,0,...,0,0,0,0,0,0,0,0,5.0,4.1875
242,2,60756,5.0,2015-10-24 19:29:40,Step Brothers (2008),Highly quotable,1445715000.0,838283,12133.0,0,...,0,0,0,0,0,0,0,0,5.0,4.1875
243,2,60756,5.0,2015-10-24 19:29:40,Step Brothers (2008),will ferrell,1445715000.0,838283,12133.0,0,...,0,0,0,0,0,0,0,0,5.0,4.1875
252,2,89774,5.0,2015-10-24 19:33:09,Warrior (2011),Boxing story,1445715000.0,1291584,59440.0,0,...,0,0,0,0,0,0,0,0,5.0,5.0
253,2,89774,5.0,2015-10-24 19:33:09,Warrior (2011),MMA,1445715000.0,1291584,59440.0,0,...,0,0,0,0,0,0,0,0,5.0,5.0
