In [145]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from datetime import datetime
from PIL import features
import numpy as np
from ast import literal_eval

import sys
import os


# Put the parent of the current working directory on the import path so the
# `ETL.utils` import that follows can be resolved.
current_dir = os.getcwd()
parent_dir = os.path.join(current_dir, '..')
sys.path.append(parent_dir)

import ETL.utils as ut

In [7]:
# Read the three generated CSVs. The identifier columns are forced to `str`
# so that they are not parsed as numbers.
steam_games_df = pd.read_csv(
    '../data/generated/steam_games.csv',
    dtype={'id': str},
)
items_df = pd.read_csv(
    '../data/generated/items.csv',
    dtype={'item_id': str},
)
reviews_df = pd.read_csv(
    '../data/generated/reviews_sentiment.csv',
    dtype={'item_id': str},
)


In [8]:
# Cast every object-dtype column of the three frames to `str`, in place.
# NOTE(review): `astype(str)` also stringifies missing values (NaN becomes
# the text 'nan') — confirm downstream cells expect that.
for frame in (steam_games_df, items_df, reviews_df):
    columnas_object = frame.select_dtypes(include=['object']).columns
    frame[columnas_object] = frame[columnas_object].astype(str)

### Creation of specific dataframes for the queries

#### developer

In [24]:
# Parse 'release_date'; anything that is not a date (e.g. "Not specified")
# becomes NaT and the corresponding rows are dropped.
release_dates = pd.to_datetime(steam_games_df['release_date'], errors='coerce')
steam_games_df = (
    steam_games_df
    .assign(release_date=release_dates)
    .dropna(subset=['release_date'])
)

# Derive the release year from the parsed date.
steam_games_df = steam_games_df.assign(
    release_year=steam_games_df['release_date'].dt.year
)

# Pair every game with the item rows that reference it ("id" == "item_id").
merged_df = steam_games_df.merge(
    items_df, left_on='id', right_on='item_id', how='inner'
)

# One row per (developer, release year): number of merged item rows and the
# share of those rows whose price equals 0.
developer_df = (
    merged_df
    .groupby(['developer', 'release_year'])
    .agg(
        items_quantity=('item_id', 'count'),
        free_content_percentage=('price', lambda prices: (prices == 0).mean()),
    )
    .reset_index()
    .rename(columns={'release_year': 'year'})
)

# Render the share as text with two decimals and a percent sign.
developer_df['free_content_percentage'] = (
    developer_df['free_content_percentage'].map(lambda share: f"{share:.2%}")
)

In [25]:
# Write the developer aggregation to CSV, without the index column.
path = r'../simplified-data/'
developer_csv_path = path + 'developer_df.csv'
developer_df.to_csv(developer_csv_path, index=False)

#### userdata (New)

`def userdata(User_id: str)`: Must return the amount of money spent by the user, the recommendation percentage based on `reviews.recommend`, and the number of items.

In [5]:
# Join owned items with the reviews of the same item and with the game
# catalogue (which supplies 'price').
# NOTE(review): the first merge is keyed on 'item_id' only, so each item row is
# repeated once per review of that item; this inflates the summed price.
# It also assumes reviews_df has no 'user_id' column of its own (otherwise the
# merge would produce suffixed columns and the groupby below would fail).
# Confirm both against the actual schemas.
merged_df = pd.merge(items_df, reviews_df, on='item_id')
merged_df = pd.merge(merged_df, steam_games_df, left_on='item_id', right_on='id')

# Per user: mean of 'recommend' and total of 'price'.
userdata_df = (
    merged_df
    .groupby('user_id')
    .agg(
        recommendation_percentage=('recommend', 'mean'),
        spent_money=('price', 'sum'),
    )
    .reset_index()
)

# Number of items per user, looked up by user_id.
# The previous version assigned a freshly re-indexed Series, which pandas
# aligns by row position; counts ended up on the wrong users whenever
# userdata_df and items_df did not contain exactly the same users.
items_per_user = items_df.groupby('user_id')['item_id'].count()
userdata_df['items quantity'] = userdata_df['user_id'].map(items_per_user)

: 

In [None]:
# Write the per-user summary to CSV, without the index column.
path = r'../simplified-data/'
userdata_csv_path = path + 'userdata_df.csv'
userdata_df.to_csv(userdata_csv_path, index=False)

#### UserForGenre

In [148]:
def _parse_genres(value):
    """Turn a stringified list such as "['Action', 'Indie']" into a real list.

    Values that are already lists/tuples are returned as lists. Anything that
    is not a list literal (a plain genre name, 'nan', a missing value) is
    returned unchanged, so `explode` keeps it as a single row.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    try:
        parsed = literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        return value
    return list(parsed) if isinstance(parsed, (list, tuple)) else value


# 'genres' comes from a CSV, so it holds text rather than Python lists, and
# `explode` leaves text untouched. Parse it first, then produce one row per
# genre, renumber the rows and rename the column to the singular 'genre'.
expanded_df = (
    steam_games_df
    .assign(genres=steam_games_df['genres'].map(_parse_genres))
    .explode('genres')
    .reset_index(drop=True)
    .rename(columns={'genres': 'genre'})
)

expanded_df.head(3)


Unnamed: 0,publisher,genre,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,release_year
0,Kotoshiro,Action,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",http://steamcommunity.com/app/761140/reviews/?...,['Single-player'],4.99,False,761140,Kotoshiro,2018
1,Kotoshiro,Casual,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",http://steamcommunity.com/app/761140/reviews/?...,['Single-player'],4.99,False,761140,Kotoshiro,2018
2,Kotoshiro,Indie,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",http://steamcommunity.com/app/761140/reviews/?...,['Single-player'],4.99,False,761140,Kotoshiro,2018


In [156]:
# Columns kept in the final frame.
selected_columns = ['user_id', 'genre', 'id', 'release_date', 'playtime_forever']

# Match each per-genre game row ("id") with the item rows that reference it
# ("item_id"); pandas' default inner join is used.
merged_df = expanded_df.merge(items_df, left_on='id', right_on='item_id')

# Keep only the selected columns.
user_for_genre_df = merged_df.loc[:, selected_columns]



In [51]:
# Summarise the frame with the project helper `ut.data_overview`; judging by
# the rendered output it reports row totals plus per-column dtype and null counts.
ut.data_overview(user_for_genre_df)


Total rows:  189

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,genres,[<class 'str'>],100.0,189,0.0,0
1,user_id,[<class 'str'>],100.0,189,0.0,0
2,release_date,[<class 'int'>],100.0,189,0.0,0
3,playtime_forever,[<class 'float'>],100.0,189,0.0,0


In [44]:
# Inspect the genre column. The frame is built with the column named 'genre'
# (singular), so indexing 'genres' raises KeyError on a fresh run.
user_for_genre_df['genre']

5311               Action
8419               Action
11800              Action
16247              Action
6400               Action
               ...       
86713            Strategy
86766            Strategy
87520           Utilities
87519           Utilities
87522    Video Production
Name: genres, Length: 189, dtype: object

In [45]:
# Display the first row (by position) as a sanity check.
user_for_genre_df.iloc[0]

genres                         Action
user_id             76561198041356854
release_date                     1990
playtime_forever               1424.0
Name: 5311, dtype: object

In [157]:
# Write the user/genre frame to CSV, without the index column.
path = r'../simplified-data/'
user_for_genre_csv_path = path + 'user_for_genre.csv'
user_for_genre_df.to_csv(user_for_genre_csv_path, index=False)

#### best_developer_year (New)

`def best_developer_year(año: int)`: Returns the top 3 developers with the MOST user-recommended games for the given year (`reviews.recommend = True` and positive comments).

In [52]:
# Keep only reviews that recommend the game and whose sentiment score is 2.
recommended_reviews_df = reviews_df[(reviews_df['recommend'] == True) & (reviews_df['sentiment_analysis'] == 2)]

# Join with the items frame on 'item_id'.
# NOTE(review): items_df appears to hold one row per (user, item), so this join
# repeats every review once per owner of the item and scales the counts below
# accordingly — confirm whether that weighting is intended.
merged_df_recommended = pd.merge(recommended_reviews_df, items_df, on='item_id')

# Bring in the 'developer' column from the game catalogue.
merged_df_recommended = pd.merge(merged_df_recommended, steam_games_df[['id', 'developer']], left_on='item_id', right_on='id', how='left')

# Parse 'posted' and extract the year as a nullable integer.
# Unparseable dates become NaT and their year becomes <NA>. The previous
# version filled those with the text 'Year not specified' and then cast to
# Int64, which raises as soon as a single date fails to parse. Rows with a
# missing year are left out by groupby's default handling of NA keys.
merged_df_recommended['posted'] = pd.to_datetime(merged_df_recommended['posted'], errors='coerce')
merged_df_recommended['year'] = merged_df_recommended['posted'].dt.year.astype('Int64')

# Count rows per (year, developer, item) and keep the three largest counts
# within each (year, developer) pair.
# NOTE(review): this yields up to three items per developer per year, not the
# three top developers per year described in the heading — confirm what the
# consumer of best_developer_year.csv expects before changing the shape.
top3_most_recommended_by_year = merged_df_recommended.groupby(['year', 'developer', 'item_id']).size().groupby(['year', 'developer'], group_keys=False).nlargest(3).reset_index(name='most_recommended_count')


In [53]:
# Summarise the result with the project helper `ut.data_overview`; judging by
# the rendered output it reports row totals plus per-column dtype and null counts.
ut.data_overview(top3_most_recommended_by_year)


Total rows:  221

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,year,[<class 'int'>],100.0,221,0.0,0
1,developer,[<class 'str'>],100.0,221,0.0,0
2,item_id,[<class 'str'>],100.0,221,0.0,0
3,least_recommended_count,[<class 'int'>],100.0,221,0.0,0


In [54]:
# Display the aggregated frame for a visual check.
top3_most_recommended_by_year

Unnamed: 0,year,developer,item_id,least_recommended_count
0,2011,ubisoft montpellier,33460,3
1,2013,avalanche studios,8190,151
2,2013,bohemia interactive,221100,16
3,2013,chucklefish,211820,11
4,2013,creative assembly,214950,18
...,...,...,...,...
216,2015,valve,620,438
217,2015,valve,20,756
218,2015,valve,730,17887
219,2015,wild shadow studios,200210,213


In [None]:
# Write the per-year recommendation counts to CSV, without the index column.
path = r'../simplified-data/'
best_developer_csv_path = path + 'best_developer_year.csv'
top3_most_recommended_by_year.to_csv(best_developer_csv_path, index=False)

#### developer_reviews_analysis

In [55]:
# Attach every review to its game ("id" == "item_id").
merged_df = steam_games_df.merge(reviews_df, left_on='id', right_on='item_id')

# One 0/1 indicator column per sentiment; the position of a label in this list
# is the 'sentiment_analysis' score it stands for (0, 1, 2).
sentiment_columns = ['Negative', 'Neutral', 'Positive']
for score, label in enumerate(sentiment_columns):
    is_label = merged_df['sentiment_analysis'] == score
    merged_df[label] = is_label.astype(int)

# Total the indicators per developer.
per_developer = merged_df.groupby('developer')[sentiment_columns].sum()
sentiment_analysis_df = per_developer.reset_index()

In [56]:
# Summarise the result with the project helper `ut.data_overview`; judging by
# the rendered output it reports row totals plus per-column dtype and null counts.
ut.data_overview(sentiment_analysis_df)


Total rows:  1172

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,developer,[<class 'str'>],100.0,1172,0.0,0
1,Negative,[<class 'int'>],100.0,1172,0.0,0
2,Neutral,[<class 'int'>],100.0,1172,0.0,0
3,Positive,[<class 'int'>],100.0,1172,0.0,0


In [57]:
# Write the per-developer sentiment counts to CSV, without the index column.
path = r'../simplified-data/'
sentiment_csv_path = path + 'sentiment_analysis.csv'
sentiment_analysis_df.to_csv(sentiment_csv_path, index=False)

#### recomendacion_juego

In [58]:
# Independent copy of the catalogue restricted to the columns listed here.
desired_columns = ["genres", "tags", "specs","id","app_name"]
game_recomendation_df = steam_games_df.loc[:, desired_columns].copy()

In [59]:
# Write the recommendation subset to CSV, without the index column.
path = r'../simplified-data/'
game_recomendation_csv_path = path + 'game_recomendation.csv'
game_recomendation_df.to_csv(game_recomendation_csv_path, index=False)