# **1. Import the required libraries.**

In [3]:
'''Necessary libraries.'''
import pandas as pd                 # Pandas for tabular data manipulation.
import json                         # Module for working with JSON.
import ast                          # Module for evaluating Python literal expressions.
import re                           # Module for working with regular expressions.
from textblob import TextBlob       # I import TextBlob from the textblob library.
import nltk                         # Natural Language Toolkit.
import csv                          # I import the CSV module into Python.

'''Enable auto-reload of modules before executing a cell'''
%load_ext autoreload
%autoreload 2

'''Import the warning module and set it to ignore all warnings'''
import warnings
warnings.filterwarnings("ignore")

# **2. Auxiliary Functions**

In [4]:
def check_data_type(df):
    """Summarise the contents of every column of ``df``.

    For each column the summary records its name, the distinct Python types
    found among its values, the percentage of non-null and null entries
    (both rounded to two decimals) and the absolute number of nulls.

    Args:
        df: DataFrame to inspect.

    Returns:
        A new DataFrame with one row per column of ``df`` and the columns
        ``field_name``, ``data_type``, ``non_null_%``, ``null_%`` and
        ``nulls``.
    """
    column_names = list(df.columns)
    row_count = len(df)

    # Share of non-null entries per column, kept unrounded so the null share
    # below is derived from the exact value rather than from a rounded one.
    filled_share = [df[name].count() / row_count * 100 for name in column_names]

    summary = {
        "field_name": column_names,
        # Types are taken per value, so a mixed column reports several types.
        "data_type": [df[name].apply(type).unique() for name in column_names],
        "non_null_%": [round(share, 2) for share in filled_share],
        "null_%": [round(100 - share, 2) for share in filled_share],
        "nulls": [df[name].isnull().sum() for name in column_names],
    }

    return pd.DataFrame(summary)

# **3. Functions.**

**1.  def developer( developer : str):**

In [23]:
'''csv_file_path'''
# NOTE(review): absolute, machine-specific Windows path; this cell only runs
# on a machine where this exact directory exists.
csv_file_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_1.csv'

'''Read the CSV file and convert it to a DataFrame'''
# Loaded with pandas' default CSV parsing options.
df_endpoint_1 = pd.read_csv(csv_file_path)

# Preview two rows; the fixed random_state makes the sample reproducible.
df_endpoint_1.sample(2, random_state=5)

Unnamed: 0,developer,release_year,total_items,free_items,percentage_free
4741,Flying Wild Hog,2016,8,0,0.00%
7149,L. Stotch,2016,12,0,0.00%


In [24]:
    
def developer(dataframe, developer):
    """Return per-year item statistics for a single developer.

    The rows of ``dataframe`` whose ``developer`` column equals the given
    name are grouped by ``release_year``. For every year the item totals are
    summed and the ``percentage_free`` strings (e.g. ``"12.50%"``) are parsed
    and averaged.

    Args:
        dataframe: DataFrame with the columns ``developer``,
            ``release_year``, ``total_items``, ``free_items`` and
            ``percentage_free``.
        developer: Developer name to filter on (exact match).

    Returns:
        A DataFrame with the columns ``Año`` (release year), ``Artículos``
        (summed ``total_items``) and ``% Gratis`` (mean free percentage,
        rounded to two decimals).
    """

    def _mean_percentage(values):
        # Keep only the text before the '%' sign; unparsable entries become
        # NaN and are ignored by the mean.
        numeric = pd.to_numeric(values.str.split('%').str[0], errors='coerce')
        return numeric.mean()

    rows = dataframe[dataframe['developer'] == developer]

    aggregations = {
        'free_items': 'sum',
        'total_items': 'sum',
        'percentage_free': _mean_percentage,
    }
    yearly = rows.groupby('release_year').agg(aggregations).reset_index()

    yearly['percentage_free'] = yearly['percentage_free'].round(2)

    yearly = yearly.rename(columns={
        'release_year': 'Año',
        'total_items': 'Artículos',
        'percentage_free': '% Gratis',
    })

    return yearly[['Año', 'Artículos', '% Gratis']]




In [25]:
developer(df_endpoint_1,'Valve')


Unnamed: 0,Año,Artículos,% Gratis
0,1998,1,0.0
1,1999,1,0.0
2,2000,2,0.0
3,2001,1,0.0
4,2003,1,0.0
5,2004,5,0.0
6,2005,1,0.0
7,2006,2,0.0
8,2007,4,0.0
9,2008,1,0.0


**2.  def userdata( user_id : str):**

In [26]:
'''csv_file_path'''
# NOTE(review): absolute, machine-specific Windows path; this cell only runs
# on a machine where this exact directory exists.
csv_file_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_2.csv'

'''Read the CSV file and convert it to a DataFrame'''
# Loaded with pandas' default CSV parsing options.
df_endpoint_2 = pd.read_csv(csv_file_path)

# Preview two rows; the fixed random_state makes the sample reproducible.
df_endpoint_2.sample(2, random_state=5)

Unnamed: 0,developer,item_id,release_year,user_id,total_amount_spent,percentage_true
81344,"Psyonix, Inc.",252950,2015,76561197964772156,2358.82,100.00%
246754,Hothead Games,18000,2008,murdadar,1869.66,100.00%


In [29]:

def userdata(df, user_id):
    """Summarise spending, recommendation rate and item count for one user.

    Args:
        df: DataFrame with the columns ``user_id``, ``item_id``,
            ``total_amount_spent`` and ``percentage_true`` (a string ending
            in ``%``).
        user_id: Identifier to look up (exact match on ``user_id``).

    Returns:
        A dict with the keys ``user``, ``Spent money``,
        ``% de recommendation`` and ``Number of items`` when at least one row
        matches; otherwise a message string stating that the user does not
        exist.
    """
    user_rows = df.loc[df['user_id'] == user_id]

    if not user_rows.empty:
        # Spending and recommendation rate are read from the first matching
        # row only; the item count spans every matching row.
        amount = user_rows['total_amount_spent'].iloc[0]
        recommend_pct = float(user_rows['percentage_true'].iloc[0].rstrip('%'))
        distinct_items = user_rows['item_id'].nunique()

        return {
            "user": user_id,
            "Spent money": f"${amount:.2f} USD",
            "% de recommendation": f"{recommend_pct:.2f}%",
            "Number of items": distinct_items,
        }

    return f"The user {user_id} does not exist in the DataFrame."


In [30]:
userdata(df_endpoint_2,'GamekungX')

{'user': 'GamekungX',
 'Spent money': '$2099.16 USD',
 '% de recommendation': '100.00%',
 'Number of items': 2}

**3.  def UserForGenre( genre : str):**

In [31]:
'''csv_file_path'''
# NOTE(review): absolute, machine-specific Windows path; this cell only runs
# on a machine where this exact directory exists.
csv_file_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_3.csv'

'''Read the CSV file and convert it to a DataFrame'''
# Loaded with pandas' default CSV parsing options.
df_endpoint_3 = pd.read_csv(csv_file_path)

# Preview two rows; the fixed random_state makes the sample reproducible.
df_endpoint_3.sample(2, random_state=5)

Unnamed: 0,item_id,playtime_forever,user_id,genres,release_year
87970,105600,2639,Foddzy,RPG,2011
443420,206500,1236,Stalker669,Strategy,2012


In [32]:
check_data_type(df_endpoint_3)

Unnamed: 0,field_name,data_type,non_null_%,null_%,nulls
0,item_id,[<class 'int'>],100.0,0.0,0
1,playtime_forever,[<class 'int'>],100.0,0.0,0
2,user_id,[<class 'str'>],100.0,0.0,0
3,genres,[<class 'str'>],100.0,0.0,0
4,release_year,[<class 'str'>],100.0,0.0,0


In [33]:
def UserForGenre(genre: str, df):
    """Report the top-playtime user and the yearly hours played for a genre.

    The input DataFrame is never modified: all conversions are applied to a
    private copy of the rows that match ``genre``.

    Args:
        genre: Genre to filter on (exact match on the ``genres`` column).
        df: DataFrame with the columns ``genres``, ``user_id``,
            ``release_year`` and ``playtime_forever``.

    Returns:
        A dict with two entries:

        * ``"User with the most hours played for gender <genre>"`` - the
          ``user_id`` of the single row with the largest converted playtime
          (first one on ties), or ``None`` when no row matches the genre.
        * ``"Hours played"`` - a list of ``{'Year': int, 'Hours': int}``
          dicts, one per release year in ascending order (empty when no row
          matches the genre).
    """
    user_key = "User with the most hours played for gender " + genre

    # Explicit copy: avoids writing into the caller's DataFrame and avoids
    # assigning into a filtered view (chained assignment).
    genre_df = df.loc[
        df['genres'] == genre,
        ['user_id', 'release_year', 'playtime_forever'],
    ].copy()

    # An unknown genre previously crashed inside idxmax(); return the same
    # result shape with no user and no yearly entries instead.
    if genre_df.empty:
        return {user_key: None, "Hours played": []}

    # Years may arrive as strings; unparsable values become NaN and are
    # dropped by the groupby below.
    genre_df['release_year'] = pd.to_numeric(
        genre_df['release_year'], errors='coerce', downcast='integer'
    )

    # NOTE(review): dividing by 60 twice assumes playtime_forever is stored in
    # seconds - confirm the unit against the dataset. Each row is truncated to
    # whole hours before summing, so fractional hours are discarded per row.
    genre_df['playtime_forever'] = (genre_df['playtime_forever'] / 60 / 60).astype(int)

    # Positional lookup of the first maximum; unlike a label-based lookup it
    # still yields a scalar when the index contains duplicate labels.
    top_position = genre_df['playtime_forever'].to_numpy().argmax()
    max_playtime_user = genre_df['user_id'].iloc[top_position]

    yearly_playtime = genre_df.groupby('release_year')['playtime_forever'].sum()

    playtime_list = [
        {'Year': int(year), 'Hours': int(hours)}
        for year, hours in yearly_playtime.items()
    ]

    return {user_key: max_playtime_user, "Hours played": playtime_list}

In [34]:
UserForGenre('Indie', df_endpoint_3)


{'User with the most hours played for gender Indie': 'thiefofrosesinlalaland',
 'Hours played': [{'Year': 1988, 'Hours': 0},
  {'Year': 1995, 'Hours': 0},
  {'Year': 1996, 'Hours': 0},
  {'Year': 1997, 'Hours': 0},
  {'Year': 1998, 'Hours': 0},
  {'Year': 1999, 'Hours': 0},
  {'Year': 2000, 'Hours': 0},
  {'Year': 2001, 'Hours': 0},
  {'Year': 2002, 'Hours': 0},
  {'Year': 2003, 'Hours': 1},
  {'Year': 2004, 'Hours': 0},
  {'Year': 2005, 'Hours': 0},
  {'Year': 2006, 'Hours': 3243},
  {'Year': 2007, 'Hours': 5},
  {'Year': 2008, 'Hours': 23},
  {'Year': 2009, 'Hours': 7},
  {'Year': 2010, 'Hours': 6},
  {'Year': 2011, 'Hours': 3280},
  {'Year': 2012, 'Hours': 860},
  {'Year': 2013, 'Hours': 1908},
  {'Year': 2014, 'Hours': 272},
  {'Year': 2015, 'Hours': 1526},
  {'Year': 2016, 'Hours': 665},
  {'Year': 2017, 'Hours': 1634},
  {'Year': 2018, 'Hours': 0}]}

**4.  def best_developer_year( year : str):**

In [35]:
'''csv_file_path'''
# NOTE(review): absolute, machine-specific Windows path; this cell only runs
# on a machine where this exact directory exists.
csv_file_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_4.csv'

'''Read the CSV file and convert it to a DataFrame'''
# Loaded with pandas' default CSV parsing options.
df_endpoint_4 = pd.read_csv(csv_file_path)

# Preview two rows; the fixed random_state makes the sample reproducible.
df_endpoint_4.sample(2, random_state=5)

Unnamed: 0,release_year,sentiment_analysis,developer
45540,2016,1,Tripwire Interactive
65401,2015,1,Free Lives


In [36]:
def best_developer_year(dataframe, year):
    """Rank the three developers with the highest positive score in a year.

    Only rows whose ``release_year`` equals ``year`` and whose
    ``sentiment_analysis`` equals 2 are considered. The score of a developer
    is the sum of ``sentiment_analysis`` over those rows.

    Args:
        dataframe: DataFrame with the columns ``release_year``,
            ``sentiment_analysis`` and ``developer``.
        year: Release year to filter on.

    Returns:
        A list of at most three single-entry dicts of the form
        ``{"Position <rank>: <developer>": <score>}``, ordered from the
        highest score to the lowest. The list is empty when no row matches.
    """
    is_selected = (dataframe['release_year'] == year) & (dataframe['sentiment_analysis'] == 2)
    selected = dataframe[is_selected]

    # Sum per developer, then keep the three largest sums.
    ranking = (
        selected.groupby('developer')['sentiment_analysis']
        .sum()
        .reset_index()
        .sort_values(by='sentiment_analysis', ascending=False)
        .head(3)
    )

    names = ranking['developer'].tolist()
    scores = ranking['sentiment_analysis'].tolist()

    return [
        {f"Position {rank}: {name}": score}
        for rank, (name, score) in enumerate(zip(names, scores), start=1)
    ]

In [37]:
best_developer_year(df_endpoint_4, 2017)


[{'Position 1: Smartly Dressed Games': 2850},
 {'Position 2: Freejam': 1176},
 {'Position 3: Studio Wildcard,Instinct Games,Efecto Studios,Virtual Basement LLC': 644}]

**5.  def developer_reviews_analysis( developer : str):**

In [38]:
'''csv_file_path'''
# NOTE(review): absolute, machine-specific Windows path; this cell only runs
# on a machine where this exact directory exists.
csv_file_path = 'C:\\Users\\migue\\Optimizing Recommender Systems with an Advanced MLOps Pipeline\\Datasets\\dataset_endpoint_5.csv'

'''Read the CSV file and convert it to a DataFrame'''
# Loaded with pandas' default CSV parsing options.
df_endpoint_5 = pd.read_csv(csv_file_path)

# Preview two rows; the fixed random_state makes the sample reproducible.
df_endpoint_5.sample(2, random_state=5)

Unnamed: 0,sentiment_analysis,developer
19375,2,Valve
27713,0,"Expansive Worlds,Avalanche Studios"


In [39]:

def developer_reviews_analysis(df, developer):
    """Count the negative and positive reviews recorded for a developer.

    A review counts as negative when ``sentiment_analysis`` equals 0 and as
    positive when it equals 2; any other value is ignored.

    Args:
        df: DataFrame with the columns ``developer`` and
            ``sentiment_analysis``.
        developer: Developer name to filter on (exact match).

    Returns:
        A dict ``{developer: ["Negative = <n>", "Positive = <m>"]}``. Both
        counts are 0 when the developer has no rows.
    """
    sentiments = df.loc[df['developer'] == developer, 'sentiment_analysis']

    # Vectorized comparisons replace a per-row Python loop; summing a boolean
    # Series counts its True entries.
    negative_count = int((sentiments == 0).sum())
    positive_count = int((sentiments == 2).sum())

    return {developer: [f"Negative = {negative_count}", f"Positive = {positive_count}"]}

In [40]:
developer_reviews_analysis(df_endpoint_5,'Valve')


{'Valve': ['Negative = 969', 'Positive = 4817']}