In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from datetime import datetime
from PIL import features
import numpy as np

import sys
import os


current_dir = os.getcwd()
sys.path.append(os.path.join(current_dir, '..'))

import ETL.utils as ut

In [3]:
# Load datasets

steam_games_df = pd.read_csv('../data/generated/steam_games.csv',dtype={'id': str})
items_df = pd.read_csv('../data/generated/items.csv', dtype={'item_id': str})
reviews_df = pd.read_csv('../data/generated/reviews_sentiment.csv', dtype={'item_id': str})


In [4]:
columnas_object = steam_games_df.select_dtypes(include=['object']).columns
steam_games_df[columnas_object] = steam_games_df[columnas_object].astype(str)

columnas_object = items_df.select_dtypes(include=['object']).columns
items_df[columnas_object] = items_df[columnas_object].astype(str)

columnas_object = reviews_df.select_dtypes(include=['object']).columns
reviews_df[columnas_object] = reviews_df[columnas_object].astype(str)

### Creation of specific dataframes for the queries."

#### developer

In [26]:
# Merge dataframes using "id" and "item_id" as keys
merged_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Filter free games
free_games_df = merged_df[merged_df['price'] == 0]

# Group by developer and calculate item quantity and free content percentage
developer_df = free_games_df.groupby('developer').agg({
    'release_date': 'first',  # Assuming 'release_date' is the column for the release year
    'item_id': 'count',        # Item quantity
    'price': lambda x: f"{(sum(x == 0) / len(x)) * 100:.0f}%"  # Calculate and format the percentage
}).reset_index()

# Rename columns
developer_df.columns = ['developer', 'year', 'items_quantity', 'free_content_percentage']


In [27]:
path = r'../simplified-data/'
developer_df.to_csv(path + 'developer_df.csv', index=False)

#### userdata (New)

def userdata( User_id : str ): Debe devolver cantidad de dinero gastado por el usuario, el porcentaje de recomendación en base a reviews.recommend y cantidad de items.

In [5]:
# Merge the games and items dataframes
merged_items_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Merge the merged_items_df with the reviews dataframe
final_df = pd.merge(merged_items_df, reviews_df, on='item_id', how='inner')

# Group by user_id and calculate the sum of prices, number of unique items, and mean of recommendations
userdata_df = final_df.groupby('user_id').agg({
    'price': 'sum',
    'item_id': 'count',
    'recommend': lambda x: f"{round(x.mean() * 100)}%" if x.any() else 'no recommendation' 
}).reset_index()

# Rename columns
userdata_df.rename(columns={'price': 'spent_money', 'item_id': 'items_quantity', 'recommend': 'recommendation_percentage'}, inplace=True)

In [6]:
path = r'../simplified-data/'
userdata_df.to_csv(path + 'userdata_df.csv', index=False)

#### UserFoGenre

In [7]:
def parse_date(date_str):
    try:
        # Try to parse the date with the full format
        return datetime.strptime(date_str, 'Posted %B %d, %Y.')
    except ValueError:
        try:
            # Try parsing without the year
            return datetime.strptime(date_str, 'Posted %B %d.').replace(year=datetime.now().year)
        except ValueError:
            # In case of error, return a date with day 1
            return datetime(datetime.now().year, 1, 1)

In [8]:
def process_genres(row):
    if isinstance(row['genres'], list):
        return row['genres']
    elif isinstance(row['genres'], str):
        return [genre.strip(" '[]") for genre in row['genres'].split(',')]
    else:
        return [row['genres']]

In [9]:
# Step 1: Merge DataFrames based on the "id" and "item_id" keys
merged_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Step 2: Explore genre lists and expand them into rows
merged_df = merged_df.explode('genres')

merged_df['release_date'] = merged_df['release_date'].apply(parse_date)

# Step 3: Drop rows with null values in relevant columns
merged_df = merged_df.dropna(subset=['playtime_forever', 'genres', 'user_id', 'release_date'])

# Step 4: Filter rows with positive playtime
merged_df = merged_df[merged_df['playtime_forever'] > 0]

# Apply the function to each row
merged_df['genres'] = merged_df.apply(process_genres, axis=1)

# Apply the function to each row and create a new column 'processed_genres'
merged_df['processed_genres'] = merged_df.apply(process_genres, axis=1)

# Explode the 'processed_genres' column to create separate rows for each genre
merged_df = merged_df.explode('processed_genres')

# Step 5: Group by processed_genres, user, year, and calculate the sum of playtime
user_for_genre_df = merged_df.groupby(['processed_genres', 'user_id', merged_df['release_date'].dt.year.astype(int)])['playtime_forever'].sum().reset_index()

# Step 6: Find the user with the most playtime for each genre and year
user_for_genre_df = user_for_genre_df.loc[user_for_genre_df.groupby(['processed_genres', 'release_date'])['playtime_forever'].idxmax()]

# Rename columns as per your requirements
user_for_genre_df.columns = ['genres', 'user', 'year', 'playtime_forever']
user_for_genre_df["playtime_forever"] = user_for_genre_df["playtime_forever"].astype(int)

In [10]:
ut.data_overview(user_for_genre_df)


Total rows:  16

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,genres,[<class 'str'>],100.0,16,0.0,0
1,user,[<class 'str'>],100.0,16,0.0,0
2,year,[<class 'int'>],100.0,16,0.0,0
3,playtime_forever,[<class 'int'>],100.0,16,0.0,0


In [11]:
user_for_genre_df['genres']

26558                      action
33135                   adventure
36020    animation &amp; modeling
36143                      casual
37082                early access
37264                free to play
42026                       indie
61220       massively multiplayer
61864               not specified
62025                      racing
62714                         rpg
68374                  simulation
84858                      sports
85976                    strategy
87654                   utilities
87656            video production
Name: genres, dtype: object

In [12]:
user_for_genre_df.iloc[0]

genres                       action
user                crustyoldkiller
year                           2024
playtime_forever             388621
Name: 26558, dtype: object

In [13]:
path = r'../simplified-data/'
user_for_genre_df.to_csv(path + 'user_for_genre.csv', index=False)

#### best_developer_year (New)

def best_developer_year( año : int ): Devuelve el top 3 de desarrolladores con juegos MÁS recomendados por usuarios para el año dado. (reviews.recommend = True y comentarios positivos)

In [14]:
# Filtrar revisiones recomendadas con análisis de sentimientos igual a 2
recommended_reviews_df = reviews_df[(reviews_df['recommend'] == True) & (reviews_df['sentiment_analysis'] == 2)]

# Unir dataframes usando la columna 'item_id'
merged_df_recommended = pd.merge(recommended_reviews_df, items_df, on='item_id')

# Agregar la columna 'developer' de steam_games_df
merged_df_recommended = pd.merge(merged_df_recommended, steam_games_df[['id', 'developer']], left_on='item_id', right_on='id', how='left')

# Convertir la columna 'posted' a datetime y extraer el año
merged_df_recommended['posted'] = pd.to_datetime(merged_df_recommended['posted'], errors='coerce')
merged_df_recommended['year'] = merged_df_recommended['posted'].dt.year.where(merged_df_recommended['posted'].notnull(), 'Year not specified')

# Asegurarse de que el año sea un entero
merged_df_recommended['year'] = merged_df_recommended['year'].astype('Int64')

# Obtener las tres principales desarrolladoras con más juegos recomendados por año
top3_most_recommended_by_year = merged_df_recommended.groupby(['year', 'developer', 'item_id']).size().groupby(['year', 'developer'], group_keys=False).nlargest(3).reset_index(name='most_recommended_count')


In [15]:
ut.data_overview(top3_most_recommended_by_year)


Total rows:  976

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,year,[<class 'int'>],100.0,976,0.0,0
1,developer,[<class 'str'>],100.0,976,0.0,0
2,item_id,[<class 'str'>],100.0,976,0.0,0
3,most_recommended_count,[<class 'int'>],100.0,976,0.0,0


In [16]:
top3_most_recommended_by_year

Unnamed: 0,year,developer,item_id,most_recommended_count
0,2010,bioware,17450,13
1,2010,obsidian entertainment,22380,66
2,2011,aspyr studios,32500,1
3,2011,avalanche studios,8190,151
4,2011,bethesda game studios,22370,45
...,...,...,...,...
971,2015,x-legend,268420,35
972,2015,xlgames,304030,3
973,2015,yager development,50300,36
974,2015,young horses,224480,3


In [17]:
path = r'../simplified-data/'
top3_most_recommended_by_year.to_csv(path + 'best_developer_year.csv', index=False)

#### developer_reviews_analysis

In [18]:
# Combinar DataFrames
merged_df = pd.merge(steam_games_df, reviews_df, left_on='id', right_on='item_id')

# Crear columnas para cada categoría de sentimiento
sentiment_columns = ['Negative', 'Neutral', 'Positive']
for sentiment in sentiment_columns:
    merged_df[sentiment] = (merged_df['sentiment_analysis'] == sentiment_columns.index(sentiment)).astype(int)

# Agrupar por desarrollador y sumar los conteos
sentiment_analysis_df = merged_df.groupby('developer')[sentiment_columns].sum().reset_index()

In [19]:
ut.data_overview(sentiment_analysis_df)


Total rows:  1172

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,developer,[<class 'str'>],100.0,1172,0.0,0
1,Negative,[<class 'int'>],100.0,1172,0.0,0
2,Neutral,[<class 'int'>],100.0,1172,0.0,0
3,Positive,[<class 'int'>],100.0,1172,0.0,0


In [20]:
path = r'../simplified-data/'
sentiment_analysis_df.to_csv(path + 'sentiment_analysis.csv', index=False)

#### recomendacion_juego

In [21]:
desired_columns = ["genres", "tags", "specs","id","app_name"]
game_recomendation_df = steam_games_df[desired_columns].copy()

In [22]:
path = r'../simplified-data/'
game_recomendation_df.to_csv(path + 'game_recomendation.csv', index=False)