In [42]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from datetime import datetime
from PIL import features
import numpy as np

import sys
import os


current_dir = os.getcwd()
sys.path.append(os.path.join(current_dir, '..'))

import ETL.utils as ut

In [46]:
# Read the three generated CSV inputs. The identifier columns are forced to
# `str` so they can be used as merge keys without numeric coercion.
steam_games_df = pd.read_csv(
    '../data/generated/steam_games.csv',
    dtype={'id': str},
)
items_df = pd.read_csv(
    '../data/generated/items.csv',
    dtype={'item_id': str},
)
reviews_df = pd.read_csv(
    '../data/generated/reviews_sentiment.csv',
    dtype={'item_id': str},
)


In [47]:
# Cast every object-dtype column of each loaded frame to `str`, one frame at
# a time. `columnas_object` ends up holding the object columns of the last
# frame processed (reviews_df).
for frame in (steam_games_df, items_df, reviews_df):
    columnas_object = frame.select_dtypes(include=['object']).columns
    frame[columnas_object] = frame[columnas_object].astype(str)

### Creation of specific dataframes for the queries

#### PlayTimeGenre

In [None]:
# Aux function
def parse_date(date_str):
    """Parse a release-date string into a ``datetime``.

    Two layouts are tried in order: ``YYYY-MM-DD`` and ``Mon YYYY``
    (abbreviated month name followed by the year, e.g. ``Jan 2018``).

    Parameters
    ----------
    date_str : str
        Raw release-date value.

    Returns
    -------
    datetime.datetime or float
        The parsed date, or ``np.nan`` when the value is the
        ``'Not specified'`` placeholder, is not a string, or matches
        neither layout.
    """
    if date_str == 'Not specified':
        return np.nan

    # `datetime` here is the class brought in by `from datetime import datetime`,
    # so `strptime` must be called on it directly (not `datetime.datetime`).
    for date_format in ('%Y-%m-%d', '%b %Y'):
        try:
            return datetime.strptime(date_str, date_format)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: non-string input.
            continue
    return np.nan

In [None]:
# Aux function
def process_genres(row):
    """Return the ``'genres'`` entry of ``row`` as a list.

    * a list is returned unchanged;
    * a string is split on commas and each piece is stripped of spaces,
      single quotes and square brackets;
    * any other value is wrapped in a one-element list.
    """
    value = row['genres']
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return [value]

    cleaned = []
    for piece in value.split(','):
        cleaned.append(piece.strip(" '[]"))
    return cleaned

In [122]:

# Build the PlayTimeGenre table: one row per (genre, release year) holding the
# largest `playtime_forever` value found for that pair.

# Step 1: Join games with items on the game id ("id" <-> "item_id").
merged_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Step 2: Keep only rows with positive playtime. `.copy()` makes the result an
# independent frame so the column assignments below do not write to a slice.
merged_df = merged_df[merged_df['playtime_forever'] > 0].copy()

# Step 3: Parse 'release_date'. Wrapping in pd.to_datetime guarantees a
# datetime64 column (missing -> NaT) so the `.dt` accessor below always works.
merged_df['release_date'] = pd.to_datetime(
    merged_df['release_date'].apply(parse_date), errors='coerce'
)

# Step 4: Turn each genres value into a list. process_genres only reads the
# 'genres' key, so it is fed a one-key mapping per value instead of running a
# much slower row-wise DataFrame.apply.
merged_df['genres'] = merged_df['genres'].map(
    lambda value: process_genres({'genres': value})
)

# Step 5: Explode so that there is one row per genre.
merged_df = merged_df.explode('genres')

# Step 6: Group by genre and release year and take the maximum playtime.
# Rows whose release year is missing are dropped by groupby.
# NOTE(review): this is the max of individual playtime rows, not a per-year
# sum - confirm which one the query is meant to expose.
release_year = merged_df['release_date'].dt.year
play_time_genre_df = (
    merged_df.groupby(['genres', release_year])['playtime_forever']
    .max()
    .reset_index()
)

# Step 7: Rename columns and store the year as an integer.
# NOTE(review): the column is labelled "hours" but no unit conversion is
# applied here - confirm the unit of `playtime_forever`.
play_time_genre_df.columns = ['genre', 'year', 'max_playtime_hours']
play_time_genre_df["year"] = play_time_genre_df["year"].astype(int)

In [123]:
# Print the row/null/duplicate summary and per-column dtypes of the PlayTimeGenre table.
ut.data_overview(play_time_genre_df)


Total rows:  189

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,genre,[<class 'str'>],100.0,189,0.0,0
1,year,[<class 'int'>],100.0,189,0.0,0
2,max_playtime_hours,[<class 'float'>],100.0,189,0.0,0


In [124]:
# Display the genre column of the aggregated table.
play_time_genre_df['genre']

0                Action
1                Action
2                Action
3                Action
4                Action
             ...       
184            Strategy
185            Strategy
186           Utilities
187           Utilities
188    Video Production
Name: genre, Length: 189, dtype: object

In [125]:
# Export the PlayTimeGenre table. The output folder is created first so the
# export also works when the folder does not exist yet.
path = r'../simplified-data/'
os.makedirs(path, exist_ok=True)
play_time_genre_df.to_csv(path + 'play_time_genre.csv', index=False)

#### UserForGenre

In [131]:
# Build the UserForGenre table: for every (genre, release year) pair, the user
# with the largest total `playtime_forever` and that total.

# Step 1: Join games with items on the game id ("id" <-> "item_id").
merged_df = pd.merge(steam_games_df, items_df, left_on='id', right_on='item_id', how='inner')

# Step 2: Parse 'release_date'. pd.to_datetime guarantees a datetime64 column
# (missing -> NaT) so the `.dt` accessor below always works.
merged_df['release_date'] = pd.to_datetime(
    merged_df['release_date'].apply(parse_date), errors='coerce'
)

# Step 3: Drop rows with null values in the columns used below.
merged_df = merged_df.dropna(subset=['playtime_forever', 'genres', 'user_id', 'release_date'])

# Step 4: Keep only rows with positive playtime. `.copy()` makes the result an
# independent frame so the column assignment below does not write to a slice.
merged_df = merged_df[merged_df['playtime_forever'] > 0].copy()

# Step 5: Turn each genres value into a list in 'processed_genres'.
# process_genres only reads the 'genres' key, so it is fed a one-key mapping
# per value instead of running a much slower row-wise DataFrame.apply.
merged_df['processed_genres'] = merged_df['genres'].map(
    lambda value: process_genres({'genres': value})
)

# Step 6: Explode so that there is one row per genre.
merged_df = merged_df.explode('processed_genres')

# Step 7: Sum playtime per genre, user and release year.
release_year = merged_df['release_date'].dt.year.astype(int)
user_for_genre_df = (
    merged_df.groupby(['processed_genres', 'user_id', release_year])['playtime_forever']
    .sum()
    .reset_index()
)

# Step 8: Keep, for each genre and year, the row of the user with the most playtime.
top_rows = user_for_genre_df.groupby(['processed_genres', 'release_date'])['playtime_forever'].idxmax()
user_for_genre_df = user_for_genre_df.loc[top_rows]

# Step 9: Rename columns and store the playtime as an integer.
user_for_genre_df.columns = ['genres', 'user', 'year', 'playtime_forever']
user_for_genre_df["playtime_forever"] = user_for_genre_df["playtime_forever"].astype(int)

In [132]:
# Print the row/null/duplicate summary and per-column dtypes of the UserForGenre table.
ut.data_overview(user_for_genre_df)


Total rows:  189

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,genres,[<class 'str'>],100.0,189,0.0,0
1,user,[<class 'str'>],100.0,189,0.0,0
2,year,[<class 'int'>],100.0,189,0.0,0
3,playtime_forever,[<class 'int'>],100.0,189,0.0,0


In [133]:
# Display the genres column; the index still carries the row labels selected by idxmax.
user_for_genre_df['genres']

5311               Action
8419               Action
11801              Action
16248              Action
6400               Action
               ...       
86741            Strategy
86794            Strategy
87548           Utilities
87547           Utilities
87550    Video Production
Name: genres, Length: 189, dtype: object

In [134]:
# Inspect the first row (by position) as a sanity check of the final columns.
user_for_genre_df.iloc[0]

genres                         Action
user                76561198041356854
year                             1990
playtime_forever                 1424
Name: 5311, dtype: object

In [135]:
# Export the UserForGenre table. The output folder is created first so the
# export also works when the folder does not exist yet.
path = r'../simplified-data/'
os.makedirs(path, exist_ok=True)
user_for_genre_df.to_csv(path + 'user_for_genre.csv', index=False)

#### UsersRecommend

In [None]:
# Build the UsersRecommend table: the three most recommended items per year.

# Keep only recommended reviews whose sentiment code is 1 or 2.
filtered_reviews_df = reviews_df[(reviews_df['recommend'] == True) & (reviews_df['sentiment_analysis'].isin([1, 2]))]

# Join the filtered reviews with the items table on 'item_id'.
merged_df = pd.merge(filtered_reviews_df, items_df, on='item_id')

# Extract the posting year from 'posted' (layout: "Posted <Month> <day>, <year>.").
# Values that do not match become missing instead of a text placeholder, which
# keeps 'year' numeric and avoids a slow row-by-row loop. Rows without a year
# are then left out by groupby below.
merged_df['year'] = pd.to_datetime(
    merged_df['posted'], format='Posted %B %d, %Y.', errors='coerce'
).dt.year

# Count rows per (year, item) and keep the 3 largest counts within each year.
top3_df = (
    merged_df.groupby(['year', 'item_id'])
    .size()
    .groupby('year', group_keys=False)
    .nlargest(3)
    .reset_index(name='recommendations_count')
)


In [None]:
# Run the shared overview helper on the UsersRecommend table.
ut.data_overview(top3_df)

In [None]:
# Call head() so the first rows are displayed; the bare attribute only shows
# the bound method.
top3_df.head()

In [None]:
# Coerce 'year' to a nullable integer column; non-numeric values become <NA>.
numeric_year = pd.to_numeric(top3_df['year'], errors='coerce')
top3_df['year'] = numeric_year.astype('Int64')

In [None]:
# Drop the rows whose year is missing, then display the first rows
# (head must be called, otherwise only the bound method is shown).
top3_df = top3_df.dropna(subset=['year'])
top3_df.head()

In [None]:
# Export the UsersRecommend table. The output folder is created first so the
# export also works when the folder does not exist yet.
path = r'../simplified-data/'
os.makedirs(path, exist_ok=True)
top3_df.to_csv(path + 'users_recommend.csv', index=False)

#### UsersWorstDeveloper

In [24]:
# Build the UsersWorstDeveloper table from not-recommended reviews.

# Keep only not-recommended reviews whose sentiment code is 0.
filtered_reviews_df = reviews_df[(reviews_df['recommend'] == False) & (reviews_df['sentiment_analysis'] == 0)]

# Join the filtered reviews with the items table on 'item_id'.
merged_df = pd.merge(filtered_reviews_df, items_df, on='item_id')

# Add the 'developer' column from steam_games_df.
merged_df = pd.merge(merged_df, steam_games_df[['id', 'developer']], left_on='item_id', right_on='id', how='left')

# Parse 'posted' and extract the year; values that do not match become missing.
merged_df['year'] = pd.to_datetime(merged_df['posted'], format='Posted %B %d, %Y.', errors='coerce').dt.year

# Re-apply the not-recommended / sentiment 0 filter on the merged frame.
# `.copy()` makes the result an independent frame so the assignment below does
# not write to a slice of merged_df (avoids SettingWithCopyWarning).
negative_reviews_df = merged_df[(merged_df['recommend'] == False) & (merged_df['sentiment_analysis'] == 0)].copy()

# Store the year as a nullable integer.
negative_reviews_df['year'] = negative_reviews_df['year'].astype('Int64')

# Count rows per (year, developer, item) and keep, within each (year, developer),
# the 3 items with the smallest counts.
# NOTE(review): this ranks items inside each developer and keeps the smallest
# counts; if the goal is the 3 developers with the most negative reviews per
# year, the grouping and nsmallest need revisiting - confirm the intent.
top3_least_recommended_by_year = (
    negative_reviews_df.groupby(['year', 'developer', 'item_id'])
    .size()
    .groupby(['year', 'developer'], group_keys=False)
    .nsmallest(3)
    .reset_index(name='least_recommended_count')
)


In [25]:
# Print the row/null/duplicate summary and per-column dtypes of the UsersWorstDeveloper table.
ut.data_overview(top3_least_recommended_by_year)


Total rows:  219

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,year,[<class 'int'>],100.0,219,0.0,0
1,developer,[<class 'str'>],100.0,219,0.0,0
2,item_id,[<class 'str'>],100.0,219,0.0,0
3,least_recommended_count,[<class 'int'>],100.0,219,0.0,0


In [26]:
# Display the resulting table (year, developer, item_id, least_recommended_count).
top3_least_recommended_by_year

Unnamed: 0,year,developer,item_id,least_recommended_count
0,2011,Ubisoft Montpellier,33460,3
1,2013,Avalanche Studios,8190,151
2,2013,Bohemia Interactive,221100,16
3,2013,Chucklefish,211820,11
4,2013,Creative Assembly,214950,18
...,...,...,...,...
214,2015,Valve,620,438
215,2015,Valve,20,756
216,2015,Valve,730,17887
217,2015,Wild Shadow Studios,200210,142


#### sentiment_analysis

In [31]:
# Join games and reviews on the game id ("id" <-> "item_id").
merged_df = pd.merge(steam_games_df, reviews_df, left_on='id', right_on='item_id')

# One 0/1 indicator column per sentiment label. A label's position in the list
# is the code it is compared against (0 -> Negative, 1 -> Neutral, 2 -> Positive).
sentiment_columns = ['Negative', 'Neutral', 'Positive']
for sentiment_code, sentiment in enumerate(sentiment_columns):
    is_match = merged_df['sentiment_analysis'] == sentiment_code
    merged_df[sentiment] = is_match.astype(int)

# Total the indicator columns per developer.
per_developer = merged_df.groupby('developer')
sentiment_analysis_df = per_developer[sentiment_columns].sum().reset_index()

In [32]:
# Print the row/null/duplicate summary and per-column dtypes of the per-developer sentiment table.
ut.data_overview(sentiment_analysis_df)


Total rows:  1457

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,developer,[<class 'str'>],100.0,1457,0.0,0
1,Negative,[<class 'int'>],100.0,1457,0.0,0
2,Neutral,[<class 'int'>],100.0,1457,0.0,0
3,Positive,[<class 'int'>],100.0,1457,0.0,0


In [33]:
# Export the per-developer sentiment table. The output folder is created first
# so the export also works when the folder does not exist yet.
path = r'../simplified-data/'
os.makedirs(path, exist_ok=True)
sentiment_analysis_df.to_csv(path + 'sentiment_analysis.csv', index=False)

#### recomendacion_juego

In [136]:
# Take an independent copy of the game columns kept for the recommendation table.
desired_columns = ["genres", "tags", "specs","id","app_name"]
game_recomendation_df = steam_games_df.loc[:, desired_columns].copy()

In [137]:
# Export the recommendation table. The output folder is created first so the
# export also works when the folder does not exist yet.
path = r'../simplified-data/'
os.makedirs(path, exist_ok=True)
game_recomendation_df.to_csv(path + 'game_recomendation.csv', index=False)