# **1. Import the required libraries.**

In [148]:
'''Necessary libraries.'''
import pandas as pd                 # Pandas for tabular data manipulation.
import json                         # Module for working with JSON.
import ast                          # Module for evaluating Python literal expressions.
import re                           # Module for working with regular expressions.
from textblob import TextBlob       # I import TextBlob from the textblob library.
import nltk                         # Natural Language Toolkit.
import csv                          # I import the CSV module into Python.

'''Enable auto-reload of modules before executing a cell'''
%load_ext autoreload
%autoreload 2

'''Import the warning module and set it to ignore all warnings'''
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# **2. Auxiliary Functions**

In [149]:
def check_data_type(df):
    
    '''A dictionary (my_dict) is created with five keys: "field_name", "data_type", "non_null_%", "null_%" and "nulls". 
    These keys will be used to store information about each column of the DataFrame.'''
    
    my_dict = {"field_name" : [], "data_type" : [], "non_null_%" : [], "null_%" : [], "nulls" : []}
    
    
    '''A loop is performed over all the columns of the DataFrame df'''
    for column in df.columns:
        percentage_non_nulls = (df[column].count() / len(df) * 100)     # The percentage of non-null values in the current column is calculated 
        my_dict['field_name'].append(column)                            # The current column is added to the list under the key 'field_name'        
        my_dict['data_type'].append(df[column].apply(type).unique())    # Obtain the unique data type in the current column and add it to the list under the key 'data type'.
        my_dict['non_null_%'].append(round(percentage_non_nulls, 2))    # Add the percentage of non-null values to the list under the key 'non_null_%'.
        my_dict['null_%'].append(round(100 - percentage_non_nulls, 2))  # Add the percentage of null values to the list under the key 'null_%'.
        my_dict['nulls'].append(df[column].isnull().sum())              # Add the number of null values in the current column to the list under the key 'nulls'.
        
    '''The dictionari my_dict is used to create a new DataFrame called df_info.'''
    df_info = pd.DataFrame(my_dict)
        
    '''The function returns the DataFrame df_info containing information about each column,
    incluiding the column name, data type, percentage of non-null values, percentage of null values, and the number of null values.'''
    return df_info

In [150]:
def check_duplicates_by_columns(df, column):
    
    '''Duplicate rows are filtered'''
    duplicated_rows = df[df.duplicated(subset=column, keep=False)]
    if duplicated_rows.empty:
        return 'There are no duplicates'
    
    '''The duplicate rows are sorted for comparison'''
    duplicated_rows_sorted = duplicated_rows.sort_values(by=column)
    return duplicated_rows_sorted

# **Australian_user_reviews_clean Preparation.**

In [151]:
with open(r'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\clean_dataset\\Australian_user_reviews_clean1.csv', 'r') as file:
    csv_file = csv.DictReader(file)
    df_reviews = pd.DataFrame(csv_file)

In [152]:
check_data_type(df_reviews)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,user_id,[<class 'str'>],100.0,0.0,0
1,user_url,[<class 'str'>],100.0,0.0,0
2,reviews_item_id,[<class 'str'>],100.0,0.0,0
3,reviews_helpful,[<class 'str'>],100.0,0.0,0
4,reviews_recommend,[<class 'str'>],100.0,0.0,0
5,reviews_date,[<class 'str'>],100.0,0.0,0
6,sentiment_analysis,[<class 'str'>],100.0,0.0,0


In [153]:
df_reviews.sample(5, random_state=5)

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_helpful,reviews_recommend,reviews_date,sentiment_analysis
10713,goodjobyoufoundmyaccount,http://steamcommunity.com/id/goodjobyoufoundmy...,107410,No ratings yet,True,14-06-22,0
23420,76561198085743341,http://steamcommunity.com/profiles/76561198085...,239030,1 of 21 people (5%) found this review helpful,False,13-12-07,1
39019,robsonrrs,http://steamcommunity.com/id/robsonrrs,4000,1 of 1 people (100%) found this review helpful,True,13-08-12,2
2802,DjinnieContact,http://steamcommunity.com/id/DjinnieContact,281990,3 of 6 people (50%) found this review helpful,True,Invalid format,2
3781,5554Grovesy,http://steamcommunity.com/id/5554Grovesy,24200,No ratings yet,True,13-12-26,1


In [154]:
'''Column reviews_item_id is renamed to item_id'''
new_name = 'item_id'
df_reviews = df_reviews.rename(columns={'reviews_item_id': new_name})

In [155]:
'''Count how many invalid formats there are in column reviews_date'''

count_data_to_search = (df_reviews['reviews_date'] == 'Invalid format').sum()

print(f'There are {count_data_to_search} invalid data.')

There are 9768 invalid data.


In [156]:
'''The Invalid format is changed to the date 2000-01-01'''
df_reviews['reviews_date'].replace('Invalid format', '2000-01-01', inplace=True)

In [157]:
'''Count how many invalid formats there are in column reviews_date'''

count_data_to_search = (df_reviews['reviews_date'] == 'Invalid format').sum()

print(f'There are {count_data_to_search} invalid data.')

There are 0 invalid data.


# **Output_steam_games_clean Preparation.**

In [158]:
with open(r'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\clean_dataset\\Output_steam_games_clean.csv', 'r', encoding='utf-8') as file:
    csv_file = csv.DictReader(file)
    df_games = pd.DataFrame(csv_file)

In [159]:
check_data_type(df_games)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,genres,[<class 'str'>],100.0,0.0,0
1,price,[<class 'str'>],100.0,0.0,0
2,early_access,[<class 'str'>],100.0,0.0,0
3,id,[<class 'str'>],100.0,0.0,0
4,release_year,[<class 'str'>],100.0,0.0,0
5,publisher,[<class 'str'>],100.0,0.0,0
6,app_name,[<class 'str'>],100.0,0.0,0
7,title,[<class 'str'>],100.0,0.0,0
8,developer,[<class 'str'>],100.0,0.0,0


In [160]:
df_games.sample(5, random_state=5)

Unnamed: 0,genres,price,early_access,id,release_year,publisher,app_name,title,developer
9793,Simulation,4.99,False,368230,2015,Raw Fury,Kingdom: Classic,Kingdom: Classic,"Noio,Licorice"
29858,Simulation,2.99,False,637802,2017,No data available,Rocksmith® 2014 Edition – Remastered – Alice i...,Rocksmith® 2014 Edition – Remastered – Alice i...,Ubisoft - San Francisco
41055,RPG,2.99,False,608200,2017,No data available,Fantasy Grounds - Mini-Dungeon #025: The Choke...,Fantasy Grounds - Mini-Dungeon #025: The Choke...,"SmiteWorks USA, LLC"
42178,Action,4.99,False,416530,2017,Cherry Pie Games,Tacopocalypse,Tacopocalypse,Cherry Pie Games
3952,Racing,4.99,False,279520,2014,Plug In Digital,Rage Runner,Rage Runner,Hypercane Studios


In [161]:
'''Column id is renamed to item_id'''
new_name = 'item_id'
df_games = df_games.rename(columns={'id': new_name})

In [162]:
df_games.columns

Index(['genres', 'price', 'early_access', 'item_id', 'release_year',
       'publisher', 'app_name', 'title', 'developer'],
      dtype='object')

# **Australian_users_items_clean Preparation.**

In [163]:
with open(r'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\clean_dataset\\Australian_users_items_clean.csv', 'r', encoding='utf-8') as file:
    csv_file = csv.DictReader(file)
    df_items = pd.DataFrame(csv_file)

In [164]:
check_data_type(df_items)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,item_name,[<class 'str'>],100.0,0.0,0
2,playtime_forever,[<class 'str'>],100.0,0.0,0
3,steam_id,[<class 'str'>],100.0,0.0,0
4,items_count,[<class 'str'>],100.0,0.0,0
5,user_id,[<class 'str'>],100.0,0.0,0
6,user_url,[<class 'str'>],100.0,0.0,0


In [165]:
df_items.sample(5, random_state=5)

Unnamed: 0,item_id,item_name,playtime_forever,steam_id,items_count,user_id,user_url
712918,233720,Surgeon Simulator,355,76561198069744476,101,bl00dlock,http://steamcommunity.com/id/bl00dlock
4648151,205790,Dota 2 Test,0,76561198067869898,93,76561198067869898,http://steamcommunity.com/profiles/76561198067...
2757740,285580,ACE - Arena: Cyber Evolution,0,76561198050861063,187,76561198050861063,http://steamcommunity.com/profiles/76561198050...
713376,32420,STAR WARS™: The Clone Wars - Republic Heroes™,0,76561198083573232,256,BlackTheKing,http://steamcommunity.com/id/BlackTheKing
3733342,250380,Knock-knock,0,76561197973612806,661,gbl_scarface,http://steamcommunity.com/id/gbl_scarface


# **DataFrames**

In [166]:
df_games.head(1)

Unnamed: 0,genres,price,early_access,item_id,release_year,publisher,app_name,title,developer
0,Action,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [167]:
df_items.head(1)

Unnamed: 0,item_id,item_name,playtime_forever,steam_id,items_count,user_id,user_url
0,10,Counter-Strike,6,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...


In [168]:
df_reviews.head(1)

Unnamed: 0,user_id,user_url,item_id,reviews_helpful,reviews_recommend,reviews_date,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,11-11-05,1


# **Endpoint # 1**

A DataFrame is created for endpoint 1 called 'df_endpoint_1'

def developer(developer: str): Number of items and percentage of free content per year, according to development company.
Return example:

| year | number of items | percentage free |
|---|---|---|
| 2023 | 50 | 27% |
| 2022 | 45 | 25% |
| xxxx | xx | xx% |

In [199]:
df1_aux1 = df_games[['developer', 'item_id', 'price', 'release_year']]

In [200]:
df1_aux1.sample(10, random_state=15)

Unnamed: 0,developer,item_id,price,release_year
52600,Minddesk Software GmbH,456231,9.99,2016
2941,Exor Studios,220825,1.99,2013
52864,Choice of Games,459330,0.99,2016
27143,Ubisoft Montreal,662352,6.99,2017
32008,Wurd Industries,488480,4.99,2017
4336,SCS Software,304213,0.99,2014
6197,Battenberg Software,341090,3.99,2015
33379,Dave Gumble,717190,0.0,2017
51234,Handelabra Games Inc.,467516,4.99,2016
68286,Stoic,234023,9.99,2013


In [201]:
'''Count the number of occurrences of "Data not available" in the "release_year" column.'''
count_data_search = (df1_aux1['release_year'] == 'Data not available').sum()

print(f'Number of times it appears: {count_data_search}')

Number of times it appears: 0


The columns "total_items" and "free_items" are added.

In [202]:
df = pd.DataFrame(df1_aux1)

'''Convert "price" column to string to compare with '0.00'''
df['price'] = df['price'].astype(str)


'''Group by developer and year, and count the total number of items and the number of free items.'''
df1_aux2 = df.groupby(['developer', 'release_year']).agg(
    total_items=('item_id', 'count'),
    free_items=('price', lambda x: (x == '0.00').sum())
).reset_index()

df1_aux2


Unnamed: 0,developer,release_year,total_items,free_items
0,+7 Software,2016,4,0
1,"+Mpact Games, LLC.",2017,3,0
2,.M.Y.W.,2016,2,0
3,.ez Games,2017,5,0
4,07th Expansion,2015,2,0
...,...,...,...,...
15032,萌石游戏,2017,4,0
15033,高考恋爱委员会,2015,4,0
15034,"高考恋爱委员会,Days",2015,4,0
15035,"高考恋爱委员会,橘子班",2015,4,0


percentage_free column is added by year

In [203]:
df1_aux3 = pd.DataFrame(df1_aux2)

'''Calculate the percentage of free items in the total and add the "%" sign'''
df1_aux3['percentage_free'] = (df1_aux3['free_items'] / df1_aux3['total_items']) * 100
df1_aux3['percentage_free'] = df1_aux3['percentage_free'].map('{:.2f}%'.format)

df1_aux3.reset_index(drop=True)

Unnamed: 0,developer,release_year,total_items,free_items,percentage_free
0,+7 Software,2016,4,0,0.00%
1,"+Mpact Games, LLC.",2017,3,0,0.00%
2,.M.Y.W.,2016,2,0,0.00%
3,.ez Games,2017,5,0,0.00%
4,07th Expansion,2015,2,0,0.00%
...,...,...,...,...,...
15032,萌石游戏,2017,4,0,0.00%
15033,高考恋爱委员会,2015,4,0,0.00%
15034,"高考恋爱委员会,Days",2015,4,0,0.00%
15035,"高考恋爱委员会,橘子班",2015,4,0,0.00%


The dataset is saved to be consumed by the endpoint 1finction.

In [204]:
dataset_endpoint_1 = df1_aux3

In [205]:


saved_address = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_1.csv'

dataset_endpoint_1.to_csv(saved_address, index=False)


In [206]:
dataset_endpoint_1.head()

Unnamed: 0,developer,release_year,total_items,free_items,percentage_free
0,+7 Software,2016,4,0,0.00%
1,"+Mpact Games, LLC.",2017,3,0,0.00%
2,.M.Y.W.,2016,2,0,0.00%
3,.ez Games,2017,5,0,0.00%
4,07th Expansion,2015,2,0,0.00%


# **Endpoint #2**

def userdata(user_id : str): The output should incluide: the total amount spent by the user, the recommendation percentage based on reviews.recommends, and the number of items.

Example return: {'user x':user22, 'money spent':200 USD, 'recommendation percentage':20%, 'number of items':5}

An auxiliary DtaFrame is generated.

In [207]:
'''The columns are selected to create a "df2_aux1" DataFrame.'''
selected_columns = ['item_id', 'price']
df2_aux1 = df_games[selected_columns]
df2_aux1.sample(5, random_state=5).reset_index(drop=True)


Unnamed: 0,item_id,price
0,368230,4.99
1,637802,2.99
2,608200,2.99
3,416530,4.99
4,279520,4.99


In [208]:
'''The "price" column is converted to a float data type.'''
df2_aux1['price'] = df2_aux1['price'].astype(float)

In [209]:
'''The columns are selected to create a "df2_aux2" DataFrame.'''
selected_columns = ['item_id', 'items_count', 'user_id']
df2_aux2 = df_items[selected_columns]
df2_aux2 = df2_aux2.sample(frac=0.05, random_state=5).reset_index(drop=True)

In [180]:
'''The "items_count" column is converted to a float data type.'''
df2_aux2['items_count'] = df2_aux2['items_count'].astype(float)

A merge is performed between datasets df2_aux1 and df2_aux2

In [181]:
'''The merge is perfomed based on the item_id'''
df2_aux3 = pd.merge(df1_aux1, df2_aux2, on='item_id', how='left')

'''Delete rows with null values'''
df2_aux3 = df2_aux3.dropna()

In [182]:
df2_aux3.sample(5, random_state=5)

Unnamed: 0,developer,item_id,price,release_year,items_count,user_id
103011,Cryptic Studios,109600,0.0,2013,132.0,76561198055414504
336511,Croteam,257510,39.99,2014,303.0,AFatDwarf
33567,Electronic Arts,47790,19.99,2010,323.0,2spooky
193223,Ludeon Studios,294100,29.99,2016,187.0,76561198033248600
406614,Bohemia Interactive,107410,39.99,2013,48.0,76561198062558066


In [183]:
df2_aux3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 493531 entries, 79 to 549670
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   developer     493531 non-null  object 
 1   item_id       493531 non-null  object 
 2   price         493531 non-null  object 
 3   release_year  493531 non-null  object 
 4   items_count   493531 non-null  float64
 5   user_id       493531 non-null  object 
dtypes: float64(1), object(5)
memory usage: 26.4+ MB


Create the "total_amount_spent" column to determine how much each user spent.

In [184]:
df2_aux3['price'] = df2_aux3['price'].astype(float)

In [185]:
df2_aux3['total_amount_spent'] = df2_aux3['price'] * df2_aux3['items_count']

In [186]:
df2_aux3.sample(5, random_state=5).reset_index(drop=True)

Unnamed: 0,developer,item_id,price,release_year,items_count,user_id,total_amount_spent
0,Cryptic Studios,109600,0.0,2013,132.0,76561198055414504,0.0
1,Croteam,257510,39.99,2014,303.0,AFatDwarf,12116.97
2,Electronic Arts,47790,19.99,2010,323.0,2spooky,6456.77
3,Ludeon Studios,294100,29.99,2016,187.0,76561198033248600,5608.13
4,Bohemia Interactive,107410,39.99,2013,48.0,76561198062558066,1919.52


We'll start by working with the "reviews_recommend" column to determine the percentage of users recommending games.

In [187]:
'''The columns are selected to create a "df2_aux4" DataFrame.'''
selected_columns = ['user_id', 'reviews_recommend']
df2_aux4 = df_reviews[selected_columns]
df2_aux4.sample(5, random_state=5).reset_index(drop=True)


Unnamed: 0,user_id,reviews_recommend
0,goodjobyoufoundmyaccount,True
1,76561198085743341,False
2,robsonrrs,True
3,DjinnieContact,True
4,5554Grovesy,True


We'll check the "reviews_recommend" column to see what type of values it contains.

In [188]:
unique_values = df2_aux4['reviews_recommend'].unique()

unique_values

array(['True', 'False'], dtype=object)

In [189]:
df2_aux4['reviews_recommend'] = df2_aux4['reviews_recommend'].replace({'True': True, 'False': False, '': False}).astype(bool)

Let's convert the values to boolean.

In [190]:
df2_aux4['reviews_recommend'] = df2_aux4['reviews_recommend'].astype(bool)


In [191]:
unique_values = df2_aux4['reviews_recommend'].unique()

unique_values

array([ True, False])

In [192]:

unique_values.dtype

dtype('bool')

In [193]:
df2_aux4.sample(10, random_state=5).reset_index(drop=True)

Unnamed: 0,user_id,reviews_recommend
0,goodjobyoufoundmyaccount,True
1,76561198085743341,False
2,robsonrrs,True
3,DjinnieContact,True
4,5554Grovesy,True
5,76561198105239601,True
6,76561198094438867,True
7,76561197963459411,True
8,duckonthetruck,True
9,76561198068777697,False


Let's work on the dataframe to calculate the percentage of 'True' values for each users.

In [194]:
df2_aux5 = pd.DataFrame(df2_aux4)

'''Convert the "reviews_recommend" column to boolean type'''
df2_aux5['reviews_recommend'] = df2_aux5['reviews_recommend'].astype(bool)

'''Create columns "total_true" and "total_false"'''
df2_aux5['total_true'] = df2_aux5['reviews_recommend'].astype(int)
df2_aux5['total_false'] = (~df2_aux5['reviews_recommend']).astype(int)

'''Group by "user_id" and sum the corresponding columns.'''
df2_aux6 = df2_aux5.groupby('user_id').agg({'total_true': 'sum', 'total_false': 'sum'})

'''Create the column "total_recommend.'''
df2_aux6['total_recommend'] = df2_aux6['total_true'] + df2_aux6['total_false']

'''Create the column "percentage_true" and add the percentage sing'''
df2_aux6['percentage_true'] = (df2_aux6['total_true'] / df2_aux6['total_recommend']) * 100
df2_aux6['percentage_true'] = df2_aux6['percentage_true'].map('{:.2f}%'.format)

df2_aux6.sample(10,random_state=3)

Unnamed: 0_level_0,total_true,total_false,total_recommend,percentage_true
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
recognise,1,0,1,100.00%
subordermanagement,6,1,7,85.71%
76561198041199233,3,0,3,100.00%
captainmehico,1,0,1,100.00%
rekkler,2,0,2,100.00%
tomanymemesrly,2,0,2,100.00%
76561198326180109,1,0,1,100.00%
aafaqvirk,1,0,1,100.00%
76561198078072214,1,0,1,100.00%
76561198078573640,2,0,2,100.00%


DataFrames 'df2_aux3' and 'df2_aux6' are joined and the final DataFrame is created.


In [195]:
'''Perfom the merge'''
df2_aux7 = pd.merge(df2_aux3, df2_aux6, on='user_id', how='inner')


df2_aux7.sample(5,random_state=5)

Unnamed: 0,developer,item_id,price,release_year,items_count,user_id,total_amount_spent,total_true,total_false,total_recommend,percentage_true
81344,"Psyonix, Inc.",252950,19.99,2015,118.0,76561197964772156,2358.82,3,0,3,100.00%
246754,Hothead Games,18000,7.99,2008,234.0,murdadar,1869.66,1,0,1,100.00%
237398,Ubisoft Montreal,33230,19.99,2010,154.0,76561198065514797,3078.46,3,0,3,100.00%
1007,Outerlight Ltd.,2420,9.99,2006,82.0,SLAIAX_SLYAXE,819.18,2,0,2,100.00%
85691,Hi-Rez Studios,386360,0.0,2015,103.0,76561198090478432,0.0,1,1,2,50.00%


Columns that are not useful are eliminated.  
"total_true", total_false", "total_recommend", "price" and item_count"

In [196]:

columns_to_drop = ['price', 'items_count', 'total_true', 'total_false', 'total_recommend']      # List of columns to delete.


df2_aux7 = df2_aux7.drop(columns=columns_to_drop, axis=1)                                               # Delete the columns.

df2_aux7.sample(5,random_state=5)

Unnamed: 0,developer,item_id,release_year,user_id,total_amount_spent,percentage_true
81344,"Psyonix, Inc.",252950,2015,76561197964772156,2358.82,100.00%
246754,Hothead Games,18000,2008,murdadar,1869.66,100.00%
237398,Ubisoft Montreal,33230,2010,76561198065514797,3078.46,100.00%
1007,Outerlight Ltd.,2420,2006,SLAIAX_SLYAXE,819.18,100.00%
85691,Hi-Rez Studios,386360,2015,76561198090478432,0.0,50.00%


The dataset is saved to be consumed by the endpoint2 fuction.

In [211]:
dataset_endpoint_2 = df2_aux7


saved_address = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_2.csv'

dataset_endpoint_2.to_csv(saved_address, index=False)


In [212]:
dataset_endpoint_2.head()

Unnamed: 0,developer,item_id,release_year,user_id,total_amount_spent,percentage_true
0,Stainless Games Ltd,282010,1997,larrythelawnmower,2407.59,100.00%
1,Stainless Games Ltd,282010,1997,76561198018834500,8641.35,100.00%
2,Stainless Games Ltd,282010,1997,larrythelawnmower,2407.59,100.00%
3,Stainless Games Ltd,282010,1997,76561198018834500,8641.35,100.00%
4,Stainless Games Ltd,282010,1997,larrythelawnmower,2407.59,100.00%


# **Endpoint #3**

def UserForGenre(genre : str ): Should return the user who accumulates the most hours played for the given genre and a list of the accumulation of hours played by year of release.  

Return example: {'user with the most hours played for genre x':user22, 'hours played':[[year 2013, hours:203]]}

df_games auxiliaries

In [214]:
df_games.head(1)

Unnamed: 0,genres,price,early_access,item_id,release_year,publisher,app_name,title,developer
0,Action,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [215]:
'''The columns are selected to create a "df3_aux1" DataFrame.'''
selected_columns = ['genres', 'item_id', 'release_year']
df3_aux1 = df_games[selected_columns]
df3_aux1.sample(5, random_state=5).reset_index(drop=True)


Unnamed: 0,genres,item_id,release_year
0,Simulation,368230,2015
1,Simulation,637802,2017
2,RPG,608200,2017
3,Action,416530,2017
4,Racing,279520,2014


df_items auxiliaries

In [216]:
df_items.head(1)

Unnamed: 0,item_id,item_name,playtime_forever,steam_id,items_count,user_id,user_url
0,10,Counter-Strike,6,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...


In [218]:
'''The columns are selected to create a "df3_aux2" DataFrame.'''
selected_columns = ['item_id', 'playtime_forever', 'user_id']
df3_aux2 = df_items[selected_columns]
df3_aux2.sample(5, random_state=5).reset_index(drop=True)


Unnamed: 0,item_id,playtime_forever,user_id
0,233720,355,bl00dlock
1,205790,0,76561198067869898
2,285580,0,76561198050861063
3,32420,0,BlackTheKing
4,250380,0,gbl_scarface


For practical purposes for the project, I use only 0.5% of the df_items data.

In [219]:
df3_aux3 = df3_aux2.sample(frac=0.05,random_state=5).reset_index(drop=True)


The two dataframes df3_aux1 and df3_aux3 are joined with the merge function through item_id

In [221]:
df3_aux4 = pd.merge(df3_aux3, df3_aux1[['item_id', 'genres']], on='item_id', how='left')


df3_aux4.sample(5,random_state=5)



Unnamed: 0,item_id,playtime_forever,user_id,genres
351635,105600,1631,Ruinedunation,Indie
309474,304050,1,76561198035011178,Casual
243813,219640,1305,76561198064631460,Action
170708,104900,85,platitudinal,Adventure
325739,247730,474,76561198081651404,RPG


Data type are checked

In [222]:
check_data_type(df3_aux4)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,playtime_forever,[<class 'str'>],100.0,0.0,0
2,user_id,[<class 'str'>],100.0,0.0,0
3,genres,"[<class 'str'>, <class 'float'>]",91.34,8.66,46817


The genres column is searched for null values and removed.

In [223]:
df3_aux4 = df3_aux4.dropna(subset=['genres'])

In [224]:
check_data_type(df3_aux4)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,playtime_forever,[<class 'str'>],100.0,0.0,0
2,user_id,[<class 'str'>],100.0,0.0,0
3,genres,[<class 'str'>],100.0,0.0,0


In [225]:
df3_aux4.sample(5,random_state=5)


Unnamed: 0,item_id,playtime_forever,user_id,genres
110901,730,294,76561198018575249,Action
311516,349700,0,76561198090881058,Massively Multiplayer
36458,252410,0,DM420,Indie
202045,202750,0,76561197970723780,Adventure
385646,376570,87,Uve-Seen_Nothing,Free to Play


We add the release_year column to the dataframe.

In [227]:
'''Combine both DataFrames based on the "item_id" column.'''
df3_aux5 = pd.merge(df3_aux4, df3_aux1[['item_id', 'release_year']], on='item_id', how='left')


For practical purposes only 30% of the data is used.

In [229]:
'''The resulting DataFrame is displayed.'''
df3_aux6 = df3_aux5.sample(frac=0.3,random_state=5)


In [230]:
check_data_type(df3_aux6)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,playtime_forever,[<class 'str'>],100.0,0.0,0
2,user_id,[<class 'str'>],100.0,0.0,0
3,genres,[<class 'str'>],100.0,0.0,0
4,release_year,[<class 'str'>],100.0,0.0,0


In [234]:
df3_aux6.head(2)

Unnamed: 0,item_id,playtime_forever,user_id,genres,release_year
596198,242050,3151,ryanirons2,Adventure,2013
1415822,214830,224,76561198029989793,RPG,2012


The 'playtime_forever' column is searched for only numbers.

In [237]:
'''Count how many times "Data not available" appears in the 'playtime_forever' column.'''
count_data_not_available = (df3_aux6['playtime_forever'] == 'Data not available').sum()


print(f"'Data not available' found {count_data_not_available} times in column 'playtime_forever'")


'Data not available' found 0 times in column 'playtime_forever'


The 'playtime_forever' column is passed to integer data type.

In [239]:
df3_aux6['playtime_forever'] = df3_aux6['playtime_forever'].astype(int)

check_data_type(df3_aux6)


Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,playtime_forever,[<class 'int'>],100.0,0.0,0
2,user_id,[<class 'str'>],100.0,0.0,0
3,genres,[<class 'str'>],100.0,0.0,0
4,release_year,[<class 'str'>],100.0,0.0,0


The dataset is saved to be consumed by the endpoint3 fuction.

In [240]:
dataset_endpoint_3 = df3_aux6


saved_address = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_3.csv'

dataset_endpoint_3.to_csv(saved_address, index=False)


In [241]:
dataset_endpoint_3.head()

Unnamed: 0,item_id,playtime_forever,user_id,genres,release_year
596198,242050,3151,ryanirons2,Adventure,2013
1415822,214830,224,76561198029989793,RPG,2012
1490193,336510,0,76561198078986302,Free to Play,2013
1027389,420,0,76561198106609214,Action,2007
121755,8980,5603,76561198003528524,Action,2009


# **Endpoint #4**

def best_developer_year(year : int): Return the top 3 developer by most recommended games per user for the given year.  
Return example: [('position1':X),('position2':Y), ('position3':Z)]

df_reviews auxiliaries

In [242]:
df_reviews.head(1)

Unnamed: 0,user_id,user_url,item_id,reviews_helpful,reviews_recommend,reviews_date,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,11-11-05,1


In [243]:
'''The columns are selected to create a "df4_aux1" DataFrame.'''
selected_columns = ['item_id', 'reviews_recommend', 'sentiment_analysis']
df4_aux1 = df_reviews[selected_columns]
df4_aux1.sample(5, random_state=5).reset_index(drop=True)


Unnamed: 0,item_id,reviews_recommend,sentiment_analysis
0,107410,True,0
1,239030,False,1
2,4000,True,2
3,281990,True,2
4,24200,True,1


df_games auxiliaries

In [244]:
df_games.head(1)

Unnamed: 0,genres,price,early_access,item_id,release_year,publisher,app_name,title,developer
0,Action,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [245]:
'''The columns are selected to create a "df4_aux2" DataFrame.'''
selected_columns = ['item_id', 'release_year', 'developer']
df4_aux2 = df_games[selected_columns]
df4_aux2.sample(5, random_state=5).reset_index(drop=True)


Unnamed: 0,item_id,release_year,developer
0,368230,2015,"Noio,Licorice"
1,637802,2017,Ubisoft - San Francisco
2,608200,2017,"SmiteWorks USA, LLC"
3,416530,2017,Cherry Pie Games
4,279520,2014,Hypercane Studios


In [246]:
'''Merge is performed and rows with null values are removed.'''
df4_aux3 = pd.merge(df4_aux1, df4_aux2, on='item_id', how='inner').dropna()


In [247]:
'''The index is reset'''
df4_aux3.reset_index(drop=True, inplace=True)

df4_aux3.sample(5,random_state=5)

Unnamed: 0,item_id,reviews_recommend,sentiment_analysis,release_year,developer
60657,223470,True,2,2003,Running With Scissors
120227,204300,True,0,2012,Ronimo Games
56921,304930,True,2,2017,Smartly Dressed Games
93879,311210,False,1,2015,Treyarch
88094,4000,True,1,2006,Facepunch Studios


In [248]:
check_data_type(df4_aux3)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,reviews_recommend,[<class 'str'>],100.0,0.0,0
2,sentiment_analysis,[<class 'str'>],100.0,0.0,0
3,release_year,[<class 'str'>],100.0,0.0,0
4,developer,[<class 'str'>],100.0,0.0,0


The word 'Unavailable data' is searched in the 'release_year' column and those rows are deleted.

In [255]:
'''Count rows containing  "Unavailable data" in "release_year" column.'''
df_look = df4_aux3[df4_aux3['release_year'] == 'Unavailable data'].shape[0]


print(f"Number of rows of 'Unavailable data' in 'release_year': {df_look}")

Number of rows of 'Unavailable data' in 'release_year': 640


In [256]:
'''Replace "Unavailable data" with Nan and then delete rows with Nan in "release_year" column.'''
df4_aux3['release_year'] = df4_aux3['release_year'].replace('Unavailable data', pd.NA)
df4_aux4 = df4_aux3.dropna(subset=['release_year'])


df4_aux4.sample(5,random_state=5)

     

Unnamed: 0,item_id,reviews_recommend,sentiment_analysis,release_year,developer
45739,232090,True,1,2016,Tripwire Interactive
65715,274190,False,1,2015,Free Lives
47472,440,True,2,2007,Valve
28942,242760,True,0,2014,Endnight Games Ltd
12704,412880,True,1,2015,JDM4iK


In [257]:
check_data_type(df4_aux4)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,reviews_recommend,[<class 'str'>],100.0,0.0,0
2,sentiment_analysis,[<class 'str'>],100.0,0.0,0
3,release_year,[<class 'str'>],100.0,0.0,0
4,developer,[<class 'str'>],100.0,0.0,0


In [258]:

'''Convert "release_year" column to "int" data type.'''
df4_aux4['release_year'] = df4_aux4['release_year'].astype(int)

Only the columns "release_year", "sentiment_analysis" and "developer" are left.

In [259]:
df4_aux5 = df4_aux4[['release_year', 'sentiment_analysis', 'developer']]


In [260]:
df4_aux5.sample(5, random_state=5)

Unnamed: 0,release_year,sentiment_analysis,developer
45739,2016,1,Tripwire Interactive
65715,2015,1,Free Lives
47472,2007,2,Valve
28942,2014,0,Endnight Games Ltd
12704,2015,1,JDM4iK


In [261]:
check_data_type(df4_aux5)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,release_year,[<class 'int'>],100.0,0.0,0
1,sentiment_analysis,[<class 'str'>],100.0,0.0,0
2,developer,[<class 'str'>],100.0,0.0,0


In [262]:
'''Change the data type of the column "sentimental_analysis" to "int".'''
df4_aux5['sentiment_analysis'] = df4_aux5['sentiment_analysis'].astype(int)

In [263]:
check_data_type(df4_aux5)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,release_year,[<class 'int'>],100.0,0.0,0
1,sentiment_analysis,[<class 'int'>],100.0,0.0,0
2,developer,[<class 'str'>],100.0,0.0,0


In [264]:
df4_aux5.sample(5, random_state=5)

Unnamed: 0,release_year,sentiment_analysis,developer
45739,2016,1,Tripwire Interactive
65715,2015,1,Free Lives
47472,2007,2,Valve
28942,2014,0,Endnight Games Ltd
12704,2015,1,JDM4iK


The dataset is saved to be consumed by the endpoint4 fuction.

In [265]:
dataset_endpoint_4 = df4_aux5


saved_address = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_4.csv'

dataset_endpoint_4.to_csv(saved_address, index=False)


In [266]:
dataset_endpoint_4.head()

Unnamed: 0,release_year,sentiment_analysis,developer
0,2009,1,Tripwire Interactive
1,2013,2,"Hopoo Games, LLC"
2,2013,2,"Hopoo Games, LLC"
3,2013,2,"Hopoo Games, LLC"
4,2013,2,Telltale Games


# **Endpoint #5**

def developer_reviews_analysis(developer : str): Depending on the developer, a dictionary is returned with the developer's name as a key and a list with the total number of user review records that are categorized with a sentiment analysis as positive or negative value.  

Return example: {'Valve':[ Negative = 182, Positive = 278]}

df_reviews auxiliaries.

In [267]:
df_reviews.head(1)

Unnamed: 0,user_id,user_url,item_id,reviews_helpful,reviews_recommend,reviews_date,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,11-11-05,1


In [268]:
'''The columns are selected to create a "df5_aux1" DataFrame.'''
selected_columns = ['item_id',  'sentiment_analysis']
df5_aux1 = df_reviews[selected_columns]
df5_aux1.sample(5, random_state=5).reset_index(drop=True)


Unnamed: 0,item_id,sentiment_analysis
0,107410,0
1,239030,1
2,4000,2
3,281990,2
4,24200,1


df_games auxiliaries.

In [271]:
df_games.head(1)

Unnamed: 0,genres,price,early_access,item_id,release_year,publisher,app_name,title,developer
0,Action,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [272]:
'''The columns are selected to create a "df5_aux2" DataFrame.'''
selected_columns = ['item_id',  'developer']
df5_aux2 = df_games[selected_columns]
df5_aux2.sample(5, random_state=5).reset_index(drop=True)

Unnamed: 0,item_id,developer
0,368230,"Noio,Licorice"
1,637802,Ubisoft - San Francisco
2,608200,"SmiteWorks USA, LLC"
3,416530,Cherry Pie Games
4,279520,Hypercane Studios


In [273]:
'''Perform the merge'''
df5_aux3 = pd.merge(df5_aux1, df5_aux2, on='item_id')

'''Reset the index'''
df5_aux3.reset_index(drop=True, inplace=True)

'''Delete rows with null or empty values.'''
df5_aux3.dropna(inplace=True)

df5_aux3.sample(5)

Unnamed: 0,item_id,sentiment_analysis,developer
104983,203160,1,"Crystal Dynamics,Feral Interactive (Mac),Feral..."
99592,375510,2,Mint Age Studios
15259,242860,2,"M2H,Blackmill Games"
59651,440,1,Valve
2412,204300,2,Ronimo Games


In [274]:
check_data_type(df5_aux3)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,sentiment_analysis,[<class 'str'>],100.0,0.0,0
2,developer,[<class 'str'>],100.0,0.0,0


In [275]:
'''Convert "sentiment_analysis" column to int type.'''
df5_aux3['sentiment_analysis'] = df5_aux3['sentiment_analysis'].astype(int)


In [276]:
check_data_type(df5_aux3)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,sentiment_analysis,[<class 'int'>],100.0,0.0,0
2,developer,[<class 'str'>],100.0,0.0,0


In [277]:
'''As only the values ​​0 (negative) and 2 (positive) are needed. rows with values ​​of 1 (neutral) are deleted'''

df5_aux4 = df5_aux3[df5_aux3['sentiment_analysis'] != 1].reset_index(drop=True)


df5_aux4.sample(5)

Unnamed: 0,item_id,sentiment_analysis,developer
44409,304930,2,Smartly Dressed Games
14430,222730,0,Abbey Games
40394,339800,2,HuniePot
40994,323180,2,Valve
11598,417750,2,Volumetric Games


In [278]:
'''Only columns "sentiment_analysis" and "developer" are left. '''
df5_aux5 = df5_aux4[['sentiment_analysis', 'developer']]


In [279]:
check_data_type(df5_aux5)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,sentiment_analysis,[<class 'int'>],100.0,0.0,0
1,developer,[<class 'str'>],100.0,0.0,0


In [280]:
df5_aux5.head(5)

Unnamed: 0,sentiment_analysis,developer
0,2,"Hopoo Games, LLC"
1,2,"Hopoo Games, LLC"
2,2,"Hopoo Games, LLC"
3,2,Telltale Games
4,0,Traveller's Tales


The dataset is saved to be consumed by the endpoint4 fuction.

In [281]:
dataset_endpoint_5 = df5_aux5


saved_address = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_5.csv'

dataset_endpoint_5.to_csv(saved_address, index=False)


In [282]:
dataset_endpoint_5.head()

Unnamed: 0,sentiment_analysis,developer
0,2,"Hopoo Games, LLC"
1,2,"Hopoo Games, LLC"
2,2,"Hopoo Games, LLC"
3,2,Telltale Games
4,0,Traveller's Tales


# **Endpoint #6**

**Convert Everuthing to Parquet Format**

In [285]:
'''Load the CSV file into a DataFrame.'''
df = pd.read_csv('C:\\\\\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_1.csv')

'''Parquet file path.'''
parquet_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\Parquet_datasets\\dataset_endpoint_1.parquet'

'''save the dataframe in parquet format'''
df.to_parquet(parquet_path, index=False)

print(f'The parquet file was successfully saved in: {parquet_path}')

The parquet file was successfully saved in: C:\Users\migue\Optimizing Recommender Systems with an Advanced MLOps Pipeline\Datasets\Parquet_datasets\dataset_endpoint_1.parquet


In [286]:
'''Load the CSV file into a DataFrame.'''
df = pd.read_csv('C:\\\\\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_2.csv')

'''Parquet file path.'''
parquet_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\Parquet_datasets\\dataset_endpoint_2.parquet'

'''save the dataframe in parquet format'''
df.to_parquet(parquet_path, index=False)

print(f'The parquet file was successfully saved in: {parquet_path}')

The parquet file was successfully saved in: C:\Users\migue\Optimizing Recommender Systems with an Advanced MLOps Pipeline\Datasets\Parquet_datasets\dataset_endpoint_2.parquet


In [287]:
'''Load the CSV file into a DataFrame.'''
df = pd.read_csv('C:\\\\\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_3.csv')

'''Parquet file path.'''
parquet_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\Parquet_datasets\\dataset_endpoint_3.parquet'

'''save the dataframe in parquet format'''
df.to_parquet(parquet_path, index=False)

print(f'The parquet file was successfully saved in: {parquet_path}')

The parquet file was successfully saved in: C:\Users\migue\Optimizing Recommender Systems with an Advanced MLOps Pipeline\Datasets\Parquet_datasets\dataset_endpoint_3.parquet


In [288]:
'''Load the CSV file into a DataFrame.'''
df = pd.read_csv('C:\\\\\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_4.csv')

'''Parquet file path.'''
parquet_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\Parquet_datasets\\dataset_endpoint_4.parquet'

'''save the dataframe in parquet format'''
df.to_parquet(parquet_path, index=False)

print(f'The parquet file was successfully saved in: {parquet_path}')

The parquet file was successfully saved in: C:\Users\migue\Optimizing Recommender Systems with an Advanced MLOps Pipeline\Datasets\Parquet_datasets\dataset_endpoint_4.parquet


In [289]:
'''Load the CSV file into a DataFrame.'''
df = pd.read_csv('C:\\\\\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_5.csv')

'''Parquet file path.'''
parquet_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\Parquet_datasets\\dataset_endpoint_5.parquet'

'''save the dataframe in parquet format'''
df.to_parquet(parquet_path, index=False)

print(f'The parquet file was successfully saved in: {parquet_path}')

The parquet file was successfully saved in: C:\Users\migue\Optimizing Recommender Systems with an Advanced MLOps Pipeline\Datasets\Parquet_datasets\dataset_endpoint_5.parquet
