In [70]:
import pandas as pd
import numpy as np
import re, os, pickle

In [71]:
movies_df = pd.read_csv('../datasets/movies.csv')
# I used only 2 million items from ratings due to the lack of computer power
ratings_df = pd.read_csv('../datasets/ratings.csv').iloc[:2000000, :]  # Change the "2000000" to your desired size
tags_df = pd.read_csv('../datasets/tags.csv')

In [72]:
# Moving the data out of ratings_df and tags_df for the last movie the user liked to be used as the label:
# Rating of 4+ = liked
# Creating directories:
if os.path.exists('data/') != True:
    os.mkdir('data/')
    
# if os.path.exists('data/last_liked_tags/') != True: 
#     os.mkdir('data/last_liked_tags/')
    
# Getting the last movie liked from ratings_df:
ratings_df_copy = ratings_df.copy()
tags_df_copy = tags_df.copy()

users_list = list(set(ratings_df_copy.userId)) # List of all users in the dataset

ratings_index_list = [] # These empty lists will be used to remove the last liked movies from the ratings_df and tags_df_mod copies
tags_index_list = []

last_ratings_df = pd.DataFrame() # Want to save all the last liked movies rated into a single CSV file

counter = 0

for user in users_list:
    try: # Some users did not rate a movie highly enough and will be removed from the dataset
        temp_df = ratings_df_copy[ratings_df_copy.userId == user].copy()
        temp_df = temp_df[temp_df.rating >= 4] # = Liked Movie

        last_time = max(temp_df.timestamp) # If the user did not have a "liked" movie, this will return an error

        temp_df = temp_df[temp_df.timestamp == last_time] # Isolating the last liked movie rated for each user
        
        if len(temp_df) > 1: # Some of the movies were rated at the same timestamp; only the last one on spliced DF will be removed
            temp_df = temp_df.iloc[[len(temp_df)-1]]
            
        ratings_index_list.append(temp_df.index.values[0]) # Appending the index of the last movies watched

        if counter == 0:
            last_ratings_df = temp_df
            counter = 1

        else:
            last_ratings_df = pd.concat([last_ratings_df, temp_df], ignore_index= True)
        
    except Exception:
        ratings_index_list.append(ratings_df_copy[ratings_df_copy.userId == user].index.values[0]) # Adding the index of the users whom did not highly rate a movie
    
    try:  # Some users have not created tags
        temp_df = tags_df_copy[tags_df_copy.userId == user].copy()
        temp_df = temp_df[temp_df.rating >= 4]
        last_movie = temp_df.movieId.values[0]
        temp_df = temp_df[temp_df.movieId == last_movie]
        
        
        if len(temp_df) == 0: # most users did not create tag(s) for the last movie liked!
            continue
            
        else:
            temp_df.to_csv('data/last_liked_tags/' + str(user) + '.csv', index = False)  #!! these tags will not be used and is stored for examination purposes;
            # these must be removed for "proper" datasets when training to exclude data related to the label from being used in the training data
            tags_index_list.extend(list(temp_df.index.values))  # This is a .extend since there are most likely more than one timestamp per movie
    
    except Exception:
        pass
    
last_ratings_df.to_csv('data/last_liked_ratings.csv', index = False)

# Removing the last movies from ratings_df_copy and tags_df_copy:
ratings_df_removed = ratings_df_copy.drop(ratings_index_list)

tags_df_removed = tags_df_copy.drop(tags_index_list)

ratings_df_removed.to_csv('data/ratings_df_last_liked_movie_removed.csv', index = False)
tags_df_removed.to_csv('data/tags_df_last_liked_movie_removed.csv', index = False)

In [73]:
# Deleting 'timestamp' column since it is not informative (not examining viewer's social behaviors)
# Droping all NaN values, which is only seen in the tag column--this is to eliminate considering NaNs when looping below
tags_df_removed = pd.read_csv('data/tags_df_last_liked_movie_removed.csv')

tags_df_mod = tags_df_removed.copy().drop('timestamp', axis=1).dropna()
tags_df_mod['tag'] = tags_df_mod['tag'].str.lower()# Making all tags lowercased for uniform format


for index, row in tags_df_mod.iterrows():
    tag = row.tag#.split()  # splitting words for spell check (if need)
    
    correct_tag = re.sub(r' \([^)]*\)', '', tag)  # Removing all parenthesis and its contents, including the whitespace before
        
    # First if:
    if 'based' in correct_tag: # This is necessary because it is a common tag and avoids the other if statements downstream
        tags_df_mod.loc[index, 'tag'] = correct_tag
        continue
        
    # Second if:    
    if '-' in correct_tag: # This is to keep "sci-fi" from being removed in the next if statement
        tags_df_mod.loc[index, 'tag'] = correct_tag
        continue
        
    # Third if:    
    if re.findall(r'\b\w{2}\b', correct_tag):
        tags_df_mod.loc[index, 'tag'] = np.NaN # Replacing two-letter words; Need to maintain index ordering, will delete NaNs later
        
    elif re.findall(r'\b\w{1}\b', correct_tag):
        tags_df_mod.loc[index, 'tag'] = np.NaN # Replacing one-letter words
        
    elif tag == correct_tag: # This is for better performance since replacing significantly slows the process
        continue
        
    else:
        tags_df_mod.loc[index, 'tag'] = correct_tag # Saves the corrected tag
        pass
        
tags_df_mod = tags_df_mod.dropna() # Dropping all tags with words that are lower than two letters or less

tags_df_mod.to_csv('data/tags_df_mod.csv', index = False)

In [74]:
# Creating a new DF that contains the most common tags for each movie ("movieId"):

# This will create a new DF for each movie and will store this file since there is no easy storage method for this task
# Storage will be in the "data" folder under the "movie_tags" subfolder:
    
if os.path.exists('data/movie_tags/') != True: # Creating movie_tags subfolder
    os.mkdir('data/movie_tags/')
    
# Creating a copy of tags_df_mod and dropping userID:
tags_df_mod = pd.read_csv('data/tags_df_mod.csv')

tags_df_no_user = tags_df_mod.copy().drop('userId', axis= 1)

# Obtaining a list of all movieId with tags:
# !! The set() function does not put the list in perfect order. Some of the IDs are out-of-place.
movieId_list = list(set(tags_df_no_user.movieId))  

for movieId in movieId_list:
    df_select = tags_df_no_user[tags_df_no_user.movieId == movieId].copy().drop('movieId', axis= 1)
    
    df_select['COUNT'] = 1
    
    df_select_group = df_select.groupby(['tag']).count()
    
    df_select_group = df_select_group.sort_values(by=['COUNT'], ascending= False).reset_index()
    
    df_select_group.to_csv('data/movie_tags/' + str(movieId) + '.csv', index = False)

In [75]:
# Creating a new DF that contains the most common tags for each user ("userId"):
# This DF is similar to the movieId DF that is previously created except this ties the tags in with each user
# This can be used in conjunction with the most common genres watched by the user to help determine which movies they like to watch
if os.path.exists('data/user_tags/') != True: # Creating movie_tags subfolder
    os.mkdir('data/user_tags/')
    
# Creating a copy of tags_df_mod and dropping userID:
tags_df_mod = pd.read_csv('data/tags_df_mod.csv')

tags_df_user = tags_df_mod.copy().drop('movieId', axis= 1)

# Obtaining a list of all movieId with tags:
userId_list = list(set(tags_df_user.userId))

for userId in userId_list:
    df_select = tags_df_user[tags_df_user.userId == userId].copy().drop('userId', axis= 1)
    
    df_select['COUNT'] = 1
    
    df_select_group = df_select.groupby(['tag']).count()
    
    df_select_group = df_select_group.sort_values(by=['COUNT'], ascending= False).reset_index()
    
    df_select_group.to_csv('data/user_tags/' + str(userId) + '.csv', index = False)

In [76]:
# Creating another DF that contains the most common tags created by users:
# Common = the tag was used 35 times or more
tags_df_mod = pd.read_csv('data/tags_df_mod.csv')

common_tags_df = tags_df_mod.groupby(['tag']).count().sort_values('userId', ascending= False).copy().drop('movieId', axis= 1).reset_index()

common_tags_df = common_tags_df[common_tags_df.userId >= 35]

common_tags_df.to_csv('data/common_tags.csv', index = False)

In [77]:
ratings_df_removed = pd.read_csv('data/ratings_df_last_liked_movie_removed.csv')
movies_df_mod = movies_df.copy()

movies_df_mod['YEAR'] = 0
movies_df_mod['UPPER_STD'] = 0
movies_df_mod['LOWER_STD'] = 0
movies_df_mod['AVG_RATING'] = 0
movies_df_mod['VIEW_COUNT'] = 0

# Making the genres into columns:
# First, need to obtain a list of all the genres in the dataset.
# !! Note: "IMAX" is not listed in the readme but is present in the dataset.
genres_list = []
for index, row in movies_df.iterrows():
    try:
        genres = row.genres.split('|')
        genres_list.extend(genres)
    except:
        genres_list.append(row.genres)
        
genres_list = list(set(genres_list))
genres_list.remove('IMAX')
genres_list.remove('(no genres listed)') # Replace with 'None'
genres_list.append('None')

for genre in genres_list: # Creating new columns with names as genres
    movies_df_mod[genre] = 0  # 0 = movie is not considered in that genre


for index, row in movies_df_mod.iterrows():
    movieId = row.movieId
    title = row.title
    
    try:
        genres = row.genres.split('|') # Multiple genres for the movie is separated by '|' in the one string; converts to list
    except Exception:
        genres = list(row.genres) # In the case that there is only one genre for the movie
        
    
    # Extracting the year from the title:
    try: # Some titles do not have the year--these will be removed downstream to remove the need to access the IMDB API (http://www.omdbapi.com/)
        matcher = re.compile('\(\d{4}\)')  # Need to extract '(year)' from the title in case there is a year in the title
        parenthesis_year = matcher.search(title).group(0)
        matcher = re.compile('\d{4}') # Matching the year from the already matched '(year)'
        year = matcher.search(parenthesis_year).group(0)

        movies_df_mod.loc[index, 'YEAR'] = int(year)
    
    except Exception:
        pass
    
    # Merging info from ratings_df into movies_df
    try:
        ratings_df_select = ratings_df_removed[ratings_df_removed.movieId == movieId]  # Gathering the reviews for the movies
        std = np.std(ratings_df_select.rating)
        average_rating = np.mean(ratings_df_select.rating)

        upper_std = average_rating + std

        if upper_std > 5:   # This is to prevent the upper range from passing the max rating value
            upper_std = 5

        lower_std = average_rating - std

        if lower_std < 0.5:
            lower_std = 0.5

        view_count = len(ratings_df_select)

        movies_df_mod.loc[index, 'UPPER_STD'] = upper_std
        movies_df_mod.loc[index, 'LOWER_STD'] = lower_std
        movies_df_mod.loc[index, 'AVG_RATING'] = average_rating
        movies_df_mod.loc[index, 'VIEW_COUNT'] = view_count
        
    except Exception:
        pass

    
    # Changing all columns that are labelled as genres to 1 if the movie is in that genre:
    if 'IMAX' in genres:
        genres.remove('IMAX')
        
    if '(no genres listed)' in genres:
        genres.remove('(no genres listed)')
        genres.append('None')
        
    for genre in genres:
        movies_df_mod.loc[index, genre] = 1
        
movies_df_mod = movies_df_mod[movies_df_mod.YEAR != 0] # Removing all movies without years in the title
movies_df_mod = movies_df_mod[movies_df_mod.VIEW_COUNT != 0] # Removing all movies than have not be rated

movies_df_mod.to_csv('data/movies_mod.csv', index = False)

In [78]:
# Combining ratings_df and movies_df_mod together:
movies_df_mod = pd.read_csv('data/movies_mod.csv')

ratings_df_removed = pd.read_csv('data/ratings_df_last_liked_movie_removed.csv')

ratings_movies_df = ratings_df_removed.merge(movies_df_mod, how= 'left', on= 'movieId').dropna()  # Some of the movies were removed when creating movies_df_mod, which will result in nan values for some rows

# Getting a count of all the liked and dislike genres and transforming it into a percentage (liked genre counts / all liked genres counts)
# If the user rated the movie 4+, then they liked it. If lower than 4, then they disliked it.
users_list = list(set(ratings_movies_df.userId))

total_user_like_df = pd.DataFrame()
total_user_dislike_df = pd.DataFrame()

progress_counter_1 = 0
progress_counter_2 = .10

for user in users_list:
    temp_df = ratings_movies_df[ratings_movies_df.userId == user]
    like_df = temp_df[temp_df.rating >= 4].iloc[:, 14:] # Only selecting the genres
    dislike_df = temp_df[temp_df.rating < 4].iloc[:, 14:]
    
    liked_total_counts = 0
    liked_dict = {'userId': user,'War': 0, 'Animation': 0, 'Horror': 0, 'Sci-Fi': 0, 'Fantasy': 0, 'Thriller': 0, 'Crime': 0, 'Mystery': 0, 
                  'Documentary': 0, 'Children': 0, 'Action': 0, 'Adventure': 0, 'Musical': 0,'Film-Noir': 0, 'Drama': 0, 
                  'Romance': 0, 'Comedy': 0, 'Western': 0, 'None': 0}
    
    disliked_total_counts = 0
    disliked_dict = {'userId': user,'War': 0, 'Animation': 0, 'Horror': 0, 'Sci-Fi': 0, 'Fantasy': 0, 'Thriller': 0, 'Crime': 0, 'Mystery': 0, 
                  'Documentary': 0, 'Children': 0, 'Action': 0, 'Adventure': 0, 'Musical': 0,'Film-Noir': 0, 'Drama': 0, 
                  'Romance': 0, 'Comedy': 0, 'Western': 0, 'None': 0}   
    
    progress_counter_1 += 1
    if progress_counter_1 / len(users_list) >= progress_counter_2:
        print(progress_counter_1 / len(users_list) * 100, '%')
        progress_counter_2 += .10
    
    for genre in list(like_df.columns): # Getting all the genre counts for liked and disliked, separately
        if len(like_df) == 0: # If the user has not given a movie a rating of 4 or higher
            pass
        
        else:
            liked_total_counts += sum(like_df[genre])
        
        
        if len(dislike_df) == 0: # If the user has not given a movie a rating of 3.5 or lower
            pass
        
        else:
            disliked_total_counts += sum(dislike_df[genre])
        
        
    for genre in list(like_df.columns):
        if liked_total_counts == 0: 
            pass
        
        else:
            liked_genre_total_counts = sum(like_df[genre])
            liked_dict[genre] = liked_genre_total_counts/liked_total_counts
            
            
        if disliked_total_counts == 0:
            pass
        
        else:
            disliked_genre_total_counts = sum(dislike_df[genre])
            disliked_dict[genre] = disliked_genre_total_counts/disliked_total_counts
        
    
    user_like_df = pd.DataFrame(liked_dict, index=[0]) # Even though some users have not rated a movie higher or lower than 4, the zero counts will still be added for complete-ness
    user_dislike_df = pd.DataFrame(disliked_dict, index=[0])
    
    # Concatenating the user total counts 
    if len(total_user_like_df) == 0:
        total_user_like_df = user_like_df
    
    else:
        total_user_like_df = pd.concat([total_user_like_df, user_like_df], ignore_index= True)
        
    if len(total_user_dislike_df) == 0:
        total_user_dislike_df = user_dislike_df
        
    else:
        total_user_dislike_df = pd.concat([total_user_dislike_df, user_dislike_df], ignore_index= True)
        
total_user_like_df.to_csv('data/total_user_like_df.csv', index = False)
total_user_dislike_df.to_csv('data/total_user_dislike_df.csv', index = False)
        
#####################
# The reason why the counts are in percentage is so that the counts/genres are scaled against each other rather than a raw count
# This is more important for the models since someone who rated a lot of movies vs someone who rated a few movies would have higher counts
# but the higher counts is not meaningful and will most likely skew the model weights 

10.001492760113448 %
20.002985520226897 %
30.00447828034035 %
40.00099517340897 %
50.002487933522424 %
60.00398069363586 %
70.00049758670448 %
80.00199034681793 %
90.00348310693138 %
100.0 %


In [79]:
# ! This cell will most likely take over 2 days or more on a personal system or laptop
# ! A premade CSV and pickle files are already included in the GitHub file (like_dislike_tags.csv) if not wanting to wait and the original dataset is the "MovieLens 25M Dataset".
# ! Else, remove the '#' before running this cell.

# Creating a dictionary of vectorized tags:
if os.path.exists('data/final/') != True:
    os.mkdir('data/final/')


# common_tags = pd.read_csv('data/common_tags.csv', index_col= False)

# tags = list(set(common_tags.tag))

# vector_counter = 0
# vectorized_dict = {}

# for tag in tags:
#     vectorized_dict[tag] = vector_counter
#     vector_counter += 1

# ratings_df_removed = pd.read_csv('data/ratings_df_last_liked_movie_removed.csv')

# user_list = list(set(ratings_df_removed.userId))

# like_dislike_tags = pd.DataFrame()
# index_counter = 0

# progress_counter_1 = 0
# progress_counter_2 = 5
# start_time = datetime.datetime.now()
# print('Start Time:', start_time)

# for user in user_list:
#     progress_counter_1 += 1

#     temp_ratings_df = ratings_df_removed[ratings_df_removed.userId == user]
#     like_tags_df = pd.DataFrame()
#     dislike_tags_df = pd.DataFrame()
        
#     for index, row in temp_ratings_df.iterrows():  # Creating tags for each user
#         try: # This is to check if the movie tags exist
#             if row.rating >= 4: # Like
#                 temp_movie_df = pd.read_csv('data/movie_tags/{}.csv'.format(str(int(row.movieId)))) # This oddly turns the movieId into a float, most likely to match the other data types in the selected series

#                 if len(like_tags_df) == 0:
#                     like_tags_df = temp_movie_df

#                 else:
#                     like_tags_df = pd.concat([like_tags_df, temp_movie_df], ignore_index= True)

#             else:
#                 temp_movie_df = pd.read_csv('data/movie_tags/{}.csv'.format(str(int(row.movieId))))

#                 if len(like_tags_df) == 0:
#                     dislike_tags_df = temp_movie_df

#                 else:
#                     dislike_tags_df = pd.concat([dislike_tags_df, temp_movie_df], ignore_index= True)
#         except Exception:
#             pass
                
#     # Counting all tags
#     try:  # This is to check if the user has movies they've liked or disliked. Users who only have liked movies will be skipped (example: userId 173)
#         like_tags_list = list(like_tags_df.tag)
#         dislike_tags_list = list(dislike_tags_df.tag)
#     except Exception:
#         continue
    
#     like_dict = {}
#     dislike_dict = {}
    
#     for tag in like_tags_list:
#         like_dict[tag] = like_tags_list.count(tag) * -1  # This is multiple by -1 to convert it to a negative numerical count for the sorting that will be done next
    
#     for tag in dislike_tags_list:
#         dislike_dict[tag] = dislike_tags_list.count(tag) * -1
        
#     # Sorting the dictionary by the tag counts (smallest to largest is by default and simplest; in this case, the multiplication by -1 makes the tags with the largest counts the first in the sorted list)
#     like_tags_counted = sorted(like_dict, key= lambda tag: like_dict[tag])  # Returns a list of the tags
#     dislike_tags_counted = sorted(dislike_dict, key= lambda tag: dislike_dict[tag])
    
#     # Converting the tags to vectorized tags but only for the first 50 tags from the like and dislike tags counted lists
#     like_tags_vectorized = []
#     dislike_tags_vectorized = []
    
#     if len(like_tags_counted) < 50:  # Checking to make sure there is 50 tags in the counted lists
#         num_like_tags = len(like_tags_counted)
#     else:
#         num_like_tags = 50
        
#     if len(dislike_tags_counted) < 50: 
#         num_dislike_tags = len(like_tags_counted)
#     else:
#         num_dislike_tags = 50
    
#     for tag in like_tags_counted[:num_like_tags]:
#         try:  # The tag might not exist in the vectorized dictionary
#             tag_vector = vectorized_dict[tag]
#             like_tags_vectorized.append(tag_vector)
#         except Exception:
#             pass
        
#     for tag in dislike_tags_counted[:num_dislike_tags]:
#         try:
#             tag_vector = vectorized_dict[tag]
#             dislike_tags_vectorized.append(tag_vector)
#         except Exception:
#             pass
        
#     if len(like_tags_vectorized) < 20 or len(dislike_tags_vectorized) < 20:
#         continue  # If any of the two are not 20 tags in length, then the user will be skipped
    
#     # Obtaining the most liked and disliked tags, 20 tags each, and adding it to like_dislike_tags:
#     like_dislike_dict = {}
    
#     like_dislike_dict['userId'] = user
    
#     for x in range(20):
#         like_dislike_dict['LIKE_' + str(x)] = like_tags_vectorized[x]
#         like_dislike_dict['DISLIKE_' + str(x)] = dislike_tags_vectorized[x]
    
#     concat_df = pd.DataFrame(like_dislike_dict, index=[0])
    
#     if len(like_dislike_tags) == 0:
#         like_dislike_tags = concat_df
    
#     else:
#         like_dislike_tags = pd.concat([like_dislike_tags, concat_df], ignore_index= True)
    
#     if (progress_counter_1 / len(user_list)) * 100 >= progress_counter_2:
#         print((progress_counter_1 / len(user_list)) * 100, '% completed')
#         print('Processing Time:', datetime.datetime.now() - start_time)
#         print('Current Time:', datetime.datetime.now())
#         progress_counter_2 += 5

# like_dislike_tags = like_dislike_tags.astype('int64')
# like_dislike_tags.to_csv('data/final/like_dislike_tags.csv', index = False)
# with open('data/vectorized_dict.pkl', 'wb') as writer:
#     # Saving the vectorized tag dictionary as a pickle file; this is the reference to know which vector is associated to the tag!! (string)
#     pickle.dump(vectorized_dict, writer)

In [80]:
# Creating a movie tags profile to complement the user tags:
if os.path.exists('data/final/') != True:
    os.mkdir('data/final/')
    
movies_df_mod = pd.read_csv('data/movies_mod.csv')
movieId_list = list(movies_df_mod.movieId)
del movies_df_mod

movie_tags_df = pd.DataFrame()
index_counter = 0

progress_counter_1 = 0
progress_counter_2 = 5

with open('data/vectorized_dict.pkl', 'rb') as reader:
    vectorized_dict = pickle.load(reader)

for movie in movieId_list:
    progress_counter_1 += 1

    try:
        temp_df = pd.read_csv('data/movie_tags/{}.csv'.format(movie))  # The tags are already in order of most counts and then alphabetically

        if len(temp_df) < 5: # Skipping movies with less than 5 tags
            continue 

        vectorized_tag = []
        movie_tags = list(temp_df.tag)

        for tag in movie_tags:
            try:
                tag_vector = vectorized_dict[tag]
                vectorized_tag.append(tag_vector)
            except Exception:
                pass

        if len(vectorized_tag) < 5: # Skipping movies with less than 5 common tags; The first similar if statement is not needed but is placed for performance purposes
            continue 

        movie_tags_df.loc[index_counter, 'movieId'] = movie

        for x in range(5):
            movie_tags_df.loc[index_counter, 'TAG_' + str(x)] = vectorized_tag[x]
            
        index_counter += 1
            
    except Exception:
        pass
    
    if (progress_counter_1 / len(movieId_list)) * 100 >= progress_counter_2:
        print((progress_counter_1 / len(movieId_list)) * 100, '% completed')
        progress_counter_2 += 5

movie_tags_df.to_csv('data/final/movie_tags_df.csv', index = False)



5.000771724031487 % completed
10.001543448062973 % completed
15.00231517209446 % completed
20.003086896125946 % completed
25.01929310078716 % completed
30.000771724031488 % completed
35.005402068220405 % completed
40.00617379225189 % completed
45.00308689612594 % completed
50.0 % completed
55.00077172403148 % completed
60.0169779286927 % completed
65.00231517209446 % completed
70.00308689612595 % completed
75.0 % completed
80.00077172403148 % completed
85.00154344806298 % completed
90.00231517209446 % completed
95.00694551628338 % completed
100.0 % completed


In [81]:
import shutil

# delete files, which is not usefull for us in the subsequent code
shutil.rmtree('data/movie_tags/', ignore_errors=True)
shutil.rmtree('data/user_tags/', ignore_errors=True)
os.remove('data/common_tags.csv')