**1. Import the required libraries.**

In [28]:
'''Necessary libraries.'''
import pandas as pd                 # Pandas for tabular data manipulation.
import json                         # Module for working with JSON.
import ast                          # Module for evaluating Python literal expressions.
import re                           # Module for working with regular expressions.
from textblob import TextBlob       # I import TextBlob from the textblob library.
import nltk                         # Natural Language Toolkit.
import csv                          # I import the CSV module into Python.

'''Enable auto-reload of modules before executing a cell'''
%load_ext autoreload
%autoreload 2

'''Import the warning module and set it to ignore all warnings'''
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**2. Auxiliary Functions**

In [29]:
def check_data_type(df):
    '''Build a per-column summary of a DataFrame.

    For every column of ``df`` the summary records the column label, the
    distinct Python types found among its values, the percentage of non-null
    and null entries (each rounded to two decimals) and the number of nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns "field_name",
        "data_type", "non_null_%", "null_%" and "nulls".
    '''
    labels = []
    value_types = []
    filled_share = []
    missing_share = []
    missing_total = []

    for label in df.columns:
        values = df[label]

        # count() ignores nulls, so this is the share of populated entries.
        pct_filled = values.count() / len(df) * 100

        labels.append(label)
        # Inspect the type of each individual value: an object column may mix several types.
        value_types.append(values.apply(type).unique())
        filled_share.append(round(pct_filled, 2))
        missing_share.append(round(100 - pct_filled, 2))
        missing_total.append(values.isnull().sum())

    return pd.DataFrame({
        "field_name": labels,
        "data_type": value_types,
        "non_null_%": filled_share,
        "null_%": missing_share,
        "nulls": missing_total,
    })

In [30]:
def check_duplicates_by_columns(df, column):
    '''Return the rows of ``df`` whose value(s) in ``column`` occur more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect.
    column : label or list of labels
        Column(s) used both to detect duplicates and to order the result.

    Returns
    -------
    pandas.DataFrame or str
        Every occurrence of a duplicated value, sorted by ``column`` so that
        matching rows sit next to each other; the string
        'There are no duplicates' when nothing is duplicated.
    '''
    # keep=False flags every member of a duplicate group, not only the repeats.
    is_repeated = df.duplicated(subset=column, keep=False)
    repeated = df[is_repeated]

    if repeated.empty:
        return 'There are no duplicates'

    return repeated.sort_values(by=column)

# **Australian_user_reviews_clean Preparation.**

In [31]:
# Load the cleaned reviews CSV. csv.DictReader yields one dict of strings per row,
# so every column of df_reviews is read as str.
# newline='' is what the csv module asks of file objects it reads (quoted fields with
# embedded newlines are otherwise mis-parsed), and the explicit encoding keeps the read
# independent of the OS locale, matching the other CSV loads in this notebook.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with open(r'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\clean_dataset\\Australian_user_reviews_clean1.csv', 'r', encoding='utf-8', newline='') as file:
    csv_file = csv.DictReader(file)
    df_reviews = pd.DataFrame(csv_file)

In [32]:
# Summarise value types and null statistics for every column of the reviews frame.
check_data_type(df_reviews)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,user_id,[<class 'str'>],100.0,0.0,0
1,user_url,[<class 'str'>],100.0,0.0,0
2,reviews_item_id,[<class 'str'>],100.0,0.0,0
3,reviews_helpful,[<class 'str'>],100.0,0.0,0
4,reviews_recommend,[<class 'str'>],100.0,0.0,0
5,reviews_date,[<class 'str'>],100.0,0.0,0
6,sentiment_analysis,[<class 'str'>],100.0,0.0,0


In [33]:
# Show five randomly drawn review rows; the fixed random_state makes the draw repeatable.
df_reviews.sample(5, random_state=5)

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_helpful,reviews_recommend,reviews_date,sentiment_analysis
10713,goodjobyoufoundmyaccount,http://steamcommunity.com/id/goodjobyoufoundmy...,107410,No ratings yet,True,14-06-22,0
23420,76561198085743341,http://steamcommunity.com/profiles/76561198085...,239030,1 of 21 people (5%) found this review helpful,False,13-12-07,1
39019,robsonrrs,http://steamcommunity.com/id/robsonrrs,4000,1 of 1 people (100%) found this review helpful,True,13-08-12,2
2802,DjinnieContact,http://steamcommunity.com/id/DjinnieContact,281990,3 of 6 people (50%) found this review helpful,True,Invalid format,2
3781,5554Grovesy,http://steamcommunity.com/id/5554Grovesy,24200,No ratings yet,True,13-12-26,1


In [34]:
# Rename the reviews_item_id column to item_id.
new_name = 'item_id'
df_reviews = df_reviews.rename({'reviews_item_id': new_name}, axis='columns')

In [35]:
# Count the rows whose reviews_date holds the 'Invalid format' marker.
count_data_to_search = df_reviews['reviews_date'].eq('Invalid format').sum()

print(f'There are {count_data_to_search} invalid data.')

There are 9768 invalid data.


In [36]:
# Replace the 'Invalid format' marker in reviews_date with the placeholder date 2000-01-01.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on df_reviews['reviews_date']: that chained in-place form acts on a temporary object
# under pandas Copy-on-Write and would leave df_reviews unchanged.
# NOTE(review): the sampled rows display other dates like '14-06-22'; confirm the
# placeholder's format is what downstream date parsing expects.
df_reviews['reviews_date'] = df_reviews['reviews_date'].replace('Invalid format', '2000-01-01')

In [37]:
# Re-count the 'Invalid format' markers in reviews_date to confirm none remain.
count_data_to_search = df_reviews['reviews_date'].eq('Invalid format').sum()

print(f'There are {count_data_to_search} invalid data.')

There are 0 invalid data.


# **Output_steam_games_clean Preparation.**

In [38]:
# Load the cleaned Steam games CSV. csv.DictReader yields one dict of strings per row,
# so every column of df_games is read as str.
# newline='' is what the csv module asks of file objects it reads; without it, quoted
# fields containing embedded newlines are mis-parsed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with open(r'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\clean_dataset\\Output_steam_games_clean.csv', 'r', encoding='utf-8', newline='') as file:
    csv_file = csv.DictReader(file)
    df_games = pd.DataFrame(csv_file)

In [39]:
# Summarise value types and null statistics for every column of the games frame.
check_data_type(df_games)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,genres,[<class 'str'>],100.0,0.0,0
1,price,[<class 'str'>],100.0,0.0,0
2,early_access,[<class 'str'>],100.0,0.0,0
3,id,[<class 'str'>],100.0,0.0,0
4,release_year,[<class 'str'>],100.0,0.0,0
5,publisher,[<class 'str'>],100.0,0.0,0
6,app_name,[<class 'str'>],100.0,0.0,0
7,title,[<class 'str'>],100.0,0.0,0
8,developer,[<class 'str'>],100.0,0.0,0


In [40]:
# Show five randomly drawn game rows; the fixed random_state makes the draw repeatable.
df_games.sample(5, random_state=5)

Unnamed: 0,genres,price,early_access,id,release_year,publisher,app_name,title,developer
9793,Simulation,4.99,False,368230,2015,Raw Fury,Kingdom: Classic,Kingdom: Classic,"Noio,Licorice"
29858,Simulation,2.99,False,637802,2017,No data available,Rocksmith® 2014 Edition – Remastered – Alice i...,Rocksmith® 2014 Edition – Remastered – Alice i...,Ubisoft - San Francisco
41055,RPG,2.99,False,608200,2017,No data available,Fantasy Grounds - Mini-Dungeon #025: The Choke...,Fantasy Grounds - Mini-Dungeon #025: The Choke...,"SmiteWorks USA, LLC"
42178,Action,4.99,False,416530,2017,Cherry Pie Games,Tacopocalypse,Tacopocalypse,Cherry Pie Games
3952,Racing,4.99,False,279520,2014,Plug In Digital,Rage Runner,Rage Runner,Hypercane Studios


In [41]:
# Rename the id column to item_id.
new_name = 'item_id'
df_games = df_games.rename({'id': new_name}, axis='columns')

In [42]:
# List the column labels of the games frame to check that the rename took effect.
df_games.columns

Index(['genres', 'price', 'early_access', 'item_id', 'release_year',
       'publisher', 'app_name', 'title', 'developer'],
      dtype='object')

# **Australian_users_items_clean Preparation.**

In [44]:
# Load the cleaned users/items CSV. csv.DictReader yields one dict of strings per row,
# so every column of df_items is read as str.
# newline='' is what the csv module asks of file objects it reads; without it, quoted
# fields containing embedded newlines are mis-parsed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with open(r'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\clean_dataset\\Australian_users_items_clean.csv', 'r', encoding='utf-8', newline='') as file:
    csv_file = csv.DictReader(file)
    df_items = pd.DataFrame(csv_file)

In [45]:
# Summarise value types and null statistics for every column of the users/items frame.
check_data_type(df_items)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'str'>],100.0,0.0,0
1,item_name,[<class 'str'>],100.0,0.0,0
2,playtime_forever,[<class 'str'>],100.0,0.0,0
3,steam_id,[<class 'str'>],100.0,0.0,0
4,items_count,[<class 'str'>],100.0,0.0,0
5,user_id,[<class 'str'>],100.0,0.0,0
6,user_url,[<class 'str'>],100.0,0.0,0


In [46]:
# Show five randomly drawn users/items rows; the fixed random_state makes the draw repeatable.
df_items.sample(5, random_state=5)

Unnamed: 0,item_id,item_name,playtime_forever,steam_id,items_count,user_id,user_url
712918,233720,Surgeon Simulator,355,76561198069744476,101,bl00dlock,http://steamcommunity.com/id/bl00dlock
4648151,205790,Dota 2 Test,0,76561198067869898,93,76561198067869898,http://steamcommunity.com/profiles/76561198067...
2757740,285580,ACE - Arena: Cyber Evolution,0,76561198050861063,187,76561198050861063,http://steamcommunity.com/profiles/76561198050...
713376,32420,STAR WARS™: The Clone Wars - Republic Heroes™,0,76561198083573232,256,BlackTheKing,http://steamcommunity.com/id/BlackTheKing
3733342,250380,Knock-knock,0,76561197973612806,661,gbl_scarface,http://steamcommunity.com/id/gbl_scarface
