In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from datetime import datetime
from PIL import features
import numpy as np

import sys
import os


current_dir = os.getcwd()
sys.path.append(os.path.join(current_dir, '..'))

import ETL.utils as ut

In [2]:
# Load datasets

steam_games_df = pd.read_csv('../data/generated/steam_games.csv',dtype={'id': str})
items_df = pd.read_csv('../data/generated/items.csv', dtype={'item_id': str})
reviews_df = pd.read_csv('../data/generated/reviews_sentiment.csv', dtype={'item_id': str})


In [3]:
columnas_object = steam_games_df.select_dtypes(include=['object']).columns
steam_games_df[columnas_object] = steam_games_df[columnas_object].astype(str)

columnas_object = items_df.select_dtypes(include=['object']).columns
items_df[columnas_object] = items_df[columnas_object].astype(str)

columnas_object = reviews_df.select_dtypes(include=['object']).columns
reviews_df[columnas_object] = reviews_df[columnas_object].astype(str)

### Creation of specific dataframes for the queries."

#### developer

In [5]:
# Merge dataframes using "id" and "item_id" as keys
merged_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Filter free games
free_games_df = merged_df[merged_df['price'] == 0]

# Group by developer and calculate item quantity and free content
developer_df = free_games_df.groupby('developer').agg({
    'release_date': 'first',  # Assuming 'release_date' is the column for the release year
    'item_id': 'count',        # Item quantity
    'price': lambda x: any(x == 0)  # Check if there is free content
}).reset_index()

# Rename columns
developer_df.columns = ['developer', 'year', 'items quantity', 'free content']


In [6]:
path = r'../simplified-data/'
developer_df.to_csv(path + 'developer_df.csv', index=False)

#### userdata (New)

def userdata( User_id : str ): Debe devolver cantidad de dinero gastado por el usuario, el porcentaje de recomendación en base a reviews.recommend y cantidad de items.

In [None]:
# Merge the games and items dataframes
merged_items_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Merge the merged_items_df with the reviews dataframe
final_df = pd.merge(merged_items_df, reviews_df, on='item_id', how='inner')

# Group by user_id and calculate the sum of prices, number of unique items, and mean of recommendations
userdata_df = final_df.groupby('user_id').agg({
    'price': 'sum',
    'item_id': 'nunique',
    'recommend': 'mean'
}).reset_index()

# Rename columns
userdata_df.rename(columns={'price': 'spent money', 'item_id': 'items quantity', 'recommend': 'recommendation percentage'}, inplace=True)

: 

In [None]:
path = r'../simplified-data/'
userdata_df.to_csv(path + 'userdata_df.csv', index=False)

#### UserFoGenre

In [36]:
# Step 1: Merge DataFrames based on the "id" and "item_id" keys
merged_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Step 2: Explore genre lists and expand them into rows
merged_df = merged_df.explode('genres')

merged_df['release_date'] = merged_df['release_date'].apply(parse_date)

# Step 3: Drop rows with null values in relevant columns
merged_df = merged_df.dropna(subset=['playtime_forever', 'genres', 'user_id', 'release_date'])

# Step 4: Filter rows with positive playtime
merged_df = merged_df[merged_df['playtime_forever'] > 0]

# Apply the function to each row
merged_df['genres'] = merged_df.apply(process_genres, axis=1)

# Apply the function to each row and create a new column 'processed_genres'
merged_df['processed_genres'] = merged_df.apply(process_genres, axis=1)

# Explode the 'processed_genres' column to create separate rows for each genre
merged_df = merged_df.explode('processed_genres')

# Step 5: Group by processed_genres, user, year, and calculate the sum of playtime
user_for_genre_df = merged_df.groupby(['processed_genres', 'user_id', merged_df['release_date'].dt.year.astype(int)])['playtime_forever'].sum().reset_index()

# Step 6: Find the user with the most playtime for each genre and year
user_for_genre_df = user_for_genre_df.loc[user_for_genre_df.groupby(['processed_genres', 'release_date'])['playtime_forever'].idxmax()]

# Rename columns as per your requirements
user_for_genre_df.columns = ['genres', 'user', 'year', 'playtime_forever']
user_for_genre_df["playtime_forever"] = user_for_genre_df["playtime_forever"].astype(int)

In [37]:
ut.data_overview(user_for_genre_df)


Total rows:  189

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,genres,[<class 'str'>],100.0,189,0.0,0
1,user,[<class 'str'>],100.0,189,0.0,0
2,year,[<class 'int'>],100.0,189,0.0,0
3,playtime_forever,[<class 'int'>],100.0,189,0.0,0


In [38]:
user_for_genre_df['genres']

5311               action
8419               action
11800              action
16247              action
6400               action
               ...       
86713            strategy
86766            strategy
87520           utilities
87519           utilities
87522    video production
Name: genres, Length: 189, dtype: object

In [39]:
user_for_genre_df.iloc[0]

genres                         action
user                76561198041356854
year                             1990
playtime_forever                 1424
Name: 5311, dtype: object

In [40]:
path = r'../simplified-data/'
user_for_genre_df.to_csv(path + 'user_for_genre.csv', index=False)

#### best_developer_year (New)

def best_developer_year( año : int ): Devuelve el top 3 de desarrolladores con juegos MÁS recomendados por usuarios para el año dado. (reviews.recommend = True y comentarios positivos)

In [52]:
# Filtrar revisiones recomendadas con análisis de sentimientos igual a 2
recommended_reviews_df = reviews_df[(reviews_df['recommend'] == True) & (reviews_df['sentiment_analysis'] == 2)]

# Unir dataframes usando la columna 'item_id'
merged_df_recommended = pd.merge(recommended_reviews_df, items_df, on='item_id')

# Agregar la columna 'developer' de steam_games_df
merged_df_recommended = pd.merge(merged_df_recommended, steam_games_df[['id', 'developer']], left_on='item_id', right_on='id', how='left')

# Convertir la columna 'posted' a datetime y extraer el año
merged_df_recommended['posted'] = pd.to_datetime(merged_df_recommended['posted'], errors='coerce')
merged_df_recommended['year'] = merged_df_recommended['posted'].dt.year.where(merged_df_recommended['posted'].notnull(), 'Year not specified')

# Asegurarse de que el año sea un entero
merged_df_recommended['year'] = merged_df_recommended['year'].astype('Int64')

# Obtener las tres principales desarrolladoras con más juegos recomendados por año
top3_most_recommended_by_year = merged_df_recommended.groupby(['year', 'developer', 'item_id']).size().groupby(['year', 'developer'], group_keys=False).nlargest(3).reset_index(name='most_recommended_count')


In [53]:
ut.data_overview(top3_most_recommended_by_year)


Total rows:  221

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,year,[<class 'int'>],100.0,221,0.0,0
1,developer,[<class 'str'>],100.0,221,0.0,0
2,item_id,[<class 'str'>],100.0,221,0.0,0
3,least_recommended_count,[<class 'int'>],100.0,221,0.0,0


In [54]:
top3_most_recommended_by_year

Unnamed: 0,year,developer,item_id,least_recommended_count
0,2011,ubisoft montpellier,33460,3
1,2013,avalanche studios,8190,151
2,2013,bohemia interactive,221100,16
3,2013,chucklefish,211820,11
4,2013,creative assembly,214950,18
...,...,...,...,...
216,2015,valve,620,438
217,2015,valve,20,756
218,2015,valve,730,17887
219,2015,wild shadow studios,200210,213


In [None]:
path = r'../simplified-data/'
top3_most_recommended_by_year.to_csv(path + 'best_developer_year.csv', index=False)

#### developer_reviews_analysis

In [55]:
# Combinar DataFrames
merged_df = pd.merge(steam_games_df, reviews_df, left_on='id', right_on='item_id')

# Crear columnas para cada categoría de sentimiento
sentiment_columns = ['Negative', 'Neutral', 'Positive']
for sentiment in sentiment_columns:
    merged_df[sentiment] = (merged_df['sentiment_analysis'] == sentiment_columns.index(sentiment)).astype(int)

# Agrupar por desarrollador y sumar los conteos
sentiment_analysis_df = merged_df.groupby('developer')[sentiment_columns].sum().reset_index()

In [56]:
ut.data_overview(sentiment_analysis_df)


Total rows:  1172

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,developer,[<class 'str'>],100.0,1172,0.0,0
1,Negative,[<class 'int'>],100.0,1172,0.0,0
2,Neutral,[<class 'int'>],100.0,1172,0.0,0
3,Positive,[<class 'int'>],100.0,1172,0.0,0


In [57]:
path = r'../simplified-data/'
sentiment_analysis_df.to_csv(path + 'sentiment_analysis.csv', index=False)

#### recomendacion_juego

In [58]:
desired_columns = ["genres", "tags", "specs","id","app_name"]
game_recomendation_df = steam_games_df[desired_columns].copy()

In [59]:
path = r'../simplified-data/'
game_recomendation_df.to_csv(path + 'game_recomendation.csv', index=False)