# Data Preprocessing - MovieLens Dataset

A Basic Analysis of MovieLens Dataset[Small]

In [23]:
import numpy as np
import pandas as pd

## Movie Data

In [24]:
movies = pd.read_csv("ml-latest-small/movies.csv")
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [25]:
movies["movieId"].unique()

array([     1,      2,      3, ..., 193585, 193587, 193609])

### Genres Extraction

In [26]:
movies["genres"]

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

Replace *(no genres listed)* with *Unknown*

In [27]:
movies["genres"] = movies["genres"].apply(lambda x: "Unknown" if x == "(no genres listed)" else x)

In [28]:
genres = set()

def add_to_genre_set(genreString: str):
    genreList = genreString.split("|")
    genres.update(set(genreList))
    return True

movies["genres"].apply(add_to_genre_set)

0       True
1       True
2       True
3       True
4       True
        ... 
9737    True
9738    True
9739    True
9740    True
9741    True
Name: genres, Length: 9742, dtype: bool

In [29]:
genres = list(genres)
print(genres, "\nLength:", len(genres))

['War', 'Thriller', 'Sci-Fi', 'Documentary', 'Romance', 'Horror', 'Western', 'Film-Noir', 'Adventure', 'Action', 'Musical', 'Mystery', 'Crime', 'Fantasy', 'Unknown', 'IMAX', 'Comedy', 'Animation', 'Children', 'Drama'] 
Length: 20


In [30]:
genres_file_format = [genre + "\n" for genre in genres]

with open("../training_data/genres.txt", "w") as file:
    file.writelines(genres_file_format)

## Ratings Data

In [31]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [32]:
user_ids = ratings["userId"].unique()

### User Genre Ratings Mapping

We'll be calculating the average of ratings given by each user in each genre.

In [33]:
ugr_dim = (len(user_ids), len(genres))
ratings_arr = np.zeros(ugr_dim)

In [34]:
def fetch_movie_row(movie, user_ratings, ratings_row, ratings_count):
    movie_id = movie["movieId"]
    movie_genres = movie["genres"].split("|")
    rating_row = user_ratings.loc[user_ratings["movieId"] == movie_id]
    curr_rating = rating_row["rating"]
    for genre in movie_genres:
        idx = genres.index(genre)
        ratings_row[idx] += curr_rating
        ratings_count[idx] += 1

def calculate_user_rows(uid, ratings_arr):
    user_ratings = ratings.loc[ratings["userId"] == uid]
    rated_movies = movies.loc[movies["movieId"].isin(user_ratings["movieId"])]
    ratings_arr_row = ratings_arr[uid-1]
    ratings_count_arr = np.zeros(ugr_dim[1])
    rated_movies.apply(lambda x: fetch_movie_row(x, user_ratings, ratings_arr_row, ratings_count_arr), axis=1)
    # print("Ratings Row:", ratings_arr_row)
    # print("Ratings Count:", ratings_count_arr)
    result = np.divide(ratings_arr_row, ratings_count_arr, out=np.zeros(ugr_dim[1]), where=ratings_count_arr!=0)
    # print("Calculation:", result)
    ratings_arr[uid-1] = result

In [35]:
for uid in user_ids:
    calculate_user_rows(uid, ratings_arr)

  ratings_row[idx] += curr_rating


In [36]:
ratings_arr = np.round(ratings_arr, 2)

In [37]:
ugr_data = pd.DataFrame(ratings_arr, columns=genres)
ugr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 610 entries, 0 to 609
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   War          610 non-null    float64
 1   Thriller     610 non-null    float64
 2   Sci-Fi       610 non-null    float64
 3   Documentary  610 non-null    float64
 4   Romance      610 non-null    float64
 5   Horror       610 non-null    float64
 6   Western      610 non-null    float64
 7   Film-Noir    610 non-null    float64
 8   Adventure    610 non-null    float64
 9   Action       610 non-null    float64
 10  Musical      610 non-null    float64
 11  Mystery      610 non-null    float64
 12  Crime        610 non-null    float64
 13  Fantasy      610 non-null    float64
 14  Unknown      610 non-null    float64
 15  IMAX         610 non-null    float64
 16  Comedy       610 non-null    float64
 17  Animation    610 non-null    float64
 18  Children     610 non-null    float64
 19  Drama   

In [38]:
ugr_data.describe()

Unnamed: 0,War,Thriller,Sci-Fi,Documentary,Romance,Horror,Western,Film-Noir,Adventure,Action,Musical,Mystery,Crime,Fantasy,Unknown,IMAX,Comedy,Animation,Children,Drama
count,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0
mean,3.49623,3.618311,3.503443,1.381787,3.614951,3.030459,2.507934,1.508262,3.593754,3.54477,2.765885,3.574508,3.714738,3.412508,0.154098,2.857672,3.562049,3.144607,3.198016,3.756738
std,1.330698,0.625678,0.751212,1.893903,0.691478,1.38489,1.814156,1.943701,0.661736,0.631665,1.66369,1.06505,0.744412,1.003291,0.756035,1.772597,0.564129,1.450404,1.239336,0.531123
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75
25%,3.315,3.31,3.2,0.0,3.25,2.67,0.0,0.0,3.28,3.24,2.0,3.265,3.415,3.0925,0.0,0.5,3.25,3.0,3.0,3.46
50%,3.895,3.67,3.59,0.0,3.685,3.43,3.25,0.0,3.65,3.6,3.4,3.75,3.8,3.555,0.0,3.535,3.59,3.615,3.5,3.82
75%,4.27,4.0,4.0,3.5,4.05,4.0,4.0,3.75,4.0,3.95,4.0,4.1975,4.17,4.0,0.0,4.0,3.92,4.0,4.0,4.1075
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


## Movie Data Encoding

In [39]:
md_cols = ["movieID", "year", "average_rating"]
md_cols.extend(genres)

md_dim = (len(movies), len(md_cols))

In [40]:
def extract_year_from_title(title: str):
    title = title.strip()
    if "(" not in title:
        return -1
    return int(title[-5:-1])

def process_average_ratings(movie_id: int):
    movie_ratings = ratings.loc[ratings["movieId"] == movie_id]
    avg_rating = movie_ratings["rating"].mean()
    return np.round(np.float64(0), 2) if np.isnan(avg_rating) else np.round(avg_rating, 2)

def encode_genres(movie_row, md_arr):
    movie_genres = movie_row["genres"].split("|")
    for genre in movie_genres:
        idx = md_cols.index(genre)
        md_arr[idx] = 1

# extract_year_from_title(movies["title"][0])
# extract_year_from_title(movies[movies["movieId"] == 40697]["title"].values[0])
# process_average_ratings(1076)
# process_average_ratings(2)

In [41]:
movie_data_arr = np.zeros(md_dim)

for idx, movie_row in movies.iterrows():
    md_arr = movie_data_arr[idx]
    movie_id = movie_row["movieId"]
    md_arr[0] = movie_id
    md_arr[1] = extract_year_from_title(movie_row["title"])
    md_arr[2] = process_average_ratings(movie_id)
    encode_genres(movie_row, md_arr)

In [42]:
movies_data_df = pd.DataFrame(movie_data_arr, columns=md_cols)
movies_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   movieID         9742 non-null   float64
 1   year            9742 non-null   float64
 2   average_rating  9742 non-null   float64
 3   War             9742 non-null   float64
 4   Thriller        9742 non-null   float64
 5   Sci-Fi          9742 non-null   float64
 6   Documentary     9742 non-null   float64
 7   Romance         9742 non-null   float64
 8   Horror          9742 non-null   float64
 9   Western         9742 non-null   float64
 10  Film-Noir       9742 non-null   float64
 11  Adventure       9742 non-null   float64
 12  Action          9742 non-null   float64
 13  Musical         9742 non-null   float64
 14  Mystery         9742 non-null   float64
 15  Crime           9742 non-null   float64
 16  Fantasy         9742 non-null   float64
 17  Unknown         9742 non-null   f

In [43]:
movies_data_df.to_csv("../training_data/movies_encoded_data.csv", index=False)

## Process Train Data

In [44]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [45]:
x_users_dim = (len(ratings), len(genres))
x_movies_dim = (len(ratings), len(md_cols))

train_user_data = np.zeros(x_users_dim)
train_movie_data = np.zeros(x_movies_dim)
train_ratings = []

In [46]:
def process_training_data(idx, ratings_row):
    # print(idx, ugr_data.loc[ratings_row["userId"]-1])
    ## Add User Data
    user_row = ugr_data.loc[ratings_row["userId"]-1]
    train_user_data[idx] = user_row

    ## Add Movie Data
    mr_row = movies_data_df.loc[movies_data_df["movieID"] == ratings_row["movieId"]]
    train_movie_data[idx] = mr_row

    ## Add Y data [Ratings]
    train_ratings.append(ratings_row["rating"])

# process_training_data(ratings.loc[0])

In [47]:
for idx, ratings_row in ratings.iterrows():
    process_training_data(idx, ratings_row)

In [48]:
print("Training Data:\n\tUsers:", len(train_user_data), "\n\tMovies:", len(train_movie_data), "\n\tTarget(Ratings):", len(train_ratings))

Training Data:
	Users: 100836 
	Movies: 100836 
	Target(Ratings): 100836


In [49]:
x_users_df = pd.DataFrame(train_user_data, columns=genres)
x_movies_df = pd.DataFrame(train_movie_data, columns=md_cols)
y_series = pd.Series(train_ratings)

In [50]:
x_movies_df.loc[:5]

Unnamed: 0,movieID,year,average_rating,War,Thriller,Sci-Fi,Documentary,Romance,Horror,Western,...,Musical,Mystery,Crime,Fantasy,Unknown,IMAX,Comedy,Animation,Children,Drama
0,1.0,1995.0,3.92,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
1,3.0,1995.0,3.26,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,6.0,1995.0,3.95,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,47.0,1995.0,3.98,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50.0,1995.0,4.24,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,70.0,1996.0,3.51,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [51]:
y_series[:5]

0    4.0
1    4.0
2    4.0
3    5.0
4    5.0
dtype: float64

In [52]:
x_movies_df.to_csv("../training_data/x_movies_data.csv", index=False)
x_users_df.to_csv("../training_data/x_users_data.csv", index=False)
y_series.to_csv("../training_data/y_data.csv", index=False, header=["Ratings"])