In [1]:
import gzip
import ast
import pandas as pd
from textblob import TextBlob
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import sys

In [2]:
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [3]:
# Read the gzip-compressed games file as JSON Lines (lines=True: one JSON object per line).
with gzip.open('steam_games.json.gz', 'rb') as f:
    steam_raw_df = pd.read_json(f, lines=True)

In [4]:
def alt_load(route):
    """Load a gzip-compressed text file holding one Python literal per line.

    Every non-blank line is parsed with ``ast.literal_eval`` (so records are
    read as Python literals, not as strict JSON) and the parsed records are
    collected into a DataFrame.

    Args:
        route: Path of the ``.gz`` file, UTF-8 encoded.

    Returns:
        pandas.DataFrame built from the list of parsed records.

    Raises:
        SyntaxError / ValueError: if a non-blank line is not a valid literal.
    """
    records = []
    # Text mode streams the file line by line instead of decoding it whole and
    # splitting it; newline='\n' keeps "\n" as the only record separator.
    with gzip.open(route, 'rt', encoding='utf-8', newline='\n') as f:
        for line in f:
            line = line.strip()
            # Blank or whitespace-only lines carry no record.
            if line:
                records.append(ast.literal_eval(line))
    return pd.DataFrame(records)

In [5]:
# Load the user reviews file with alt_load (one Python literal per line).
reviews_raw_df = alt_load('user_reviews.json.gz')   

In [6]:
# Load the users/items file the same way.
items_raw_df = alt_load('users_items.json.gz')

In [122]:
# Discard rows of the games table in which every column is missing.
steam_df = steam_raw_df.dropna(how='all')

In [8]:
# Keep only the 'reviews' column, as a one-column DataFrame.
reviews_df = reviews_raw_df['reviews'].to_frame()

In [9]:
# One row per element of each 'reviews' entry (presumably a list of review dicts — confirm).
reviews_df = reviews_df.explode('reviews').apply(pd.Series)

In [10]:
# Expand every review record into its own set of columns.
reviews_df = pd.json_normalize(reviews_df['reviews'])

In [11]:
# These three fields are not used anywhere below.
reviews_df = reviews_df.drop(columns=['funny','last_edited','helpful'])

In [12]:
# 'posted' still holds raw text at this point; it is parsed into a real date further down.
reviews_df.rename(columns={'posted':'date_str'},inplace=True)

In [13]:
# Remove rows in which every column is missing.
reviews_df.dropna(how='all',inplace=True)

In [14]:
def analyze_sentiment(review):
    """Map a review text to a sentiment label using its TextBlob polarity.

    Args:
        review: Review text (str).

    Returns:
        0 when the polarity is below zero, 1 when it is exactly zero,
        2 otherwise.
    """
    score = TextBlob(review).sentiment.polarity
    if score < 0:
        return 0
    return 1 if score == 0 else 2

In [15]:
# Label every review 0/1/2 with analyze_sentiment; str() guards against non-string values.
reviews_df['sentiment_analysis'] = reviews_df['review'].apply(lambda x: analyze_sentiment(str(x)))

In [16]:
# Store the recommendation flag as an integer so it can be summed later.
reviews_df['recommend'] = reviews_df['recommend'].astype(int)

In [17]:
# Strip the 'Posted ' prefix, leaving only the date text.
reviews_df['date_str'] = reviews_df['date_str'].str.replace('Posted ', '')

In [18]:
# Parse dates written as 'Month day, Year.'; anything else becomes NaT (errors='coerce').
reviews_df['date'] = pd.to_datetime(reviews_df['date_str'], format='%B %d, %Y.', errors='coerce')


In [19]:
# Rows still NaT are re-parsed as 'Month day.' with '2016' appended.
# NOTE(review): the year 2016 is hard-coded — presumably those entries carry no year; confirm.
reviews_df['date'] = reviews_df['date'].fillna(pd.to_datetime(reviews_df[reviews_df['date'].isnull()]['date_str'] + str(2016),format='%B %d.%Y'))


In [20]:
# The raw date text and the review text are no longer needed once date/sentiment exist.
reviews_df.drop(columns={'date_str','review'},inplace=True)

In [21]:
# Rebuild a fresh 0..n-1 index; the previous index is moved into an 'index' column.
reviews_df.reset_index(inplace=True)

In [22]:
# Discard that leftover 'index' column.
reviews_df.drop(columns={'index'},inplace=True)

In [123]:
# Drop the columns of the games table that the rest of the notebook does not read.
steam_df.drop(columns={'publisher','title','url','reviews_url','price','early_access','developer'},inplace=True)

In [124]:
# Index the games table by game id so rows can be looked up with .loc[id].
steam_df.set_index('id', inplace=True)


In [125]:
# Remove rows in which every remaining column is missing.
steam_df.dropna(how='all',inplace=True)

In [126]:
# Parse release dates of mixed formats; unparseable values become NaT.
steam_df['date'] = pd.to_datetime(steam_df['release_date'],format='mixed', errors='coerce')

In [127]:
# The raw release_date text is replaced by the parsed 'date' column.
steam_df.drop(columns=['release_date'],inplace=True)

In [128]:
steam_df.rename(columns={'app_name':'name'},inplace=True)

In [129]:
# Make the three list columns uniform: any value that is not a list becomes an empty list.
steam_df[['tags','genres','specs']]=steam_df[['tags','genres','specs']].map(lambda x: x if isinstance(x,list) else [])

In [130]:
# Separate copy used for the recommendation model, so steam_df keeps its own 'genres'.
ml_df = steam_df.copy()

In [131]:
# Per game, concatenate genres, tags and specs into a single list.
ml_df['genres']=steam_df.apply(lambda row: row['genres'] + row['tags'] + row['specs'], axis=1)

In [132]:
# Remove repeated entries (going through set() does not preserve the original order).
ml_df['genres']=ml_df['genres'].apply(lambda x: list(set(x)))

In [133]:
# The combined list is no longer just genres.
ml_df.rename(columns={'genres':'attributes'},inplace=True)

In [134]:
# Games without a name cannot be shown as a recommendation.
ml_df.dropna(subset=['name'],inplace=True)

In [141]:
# Keep only the first row for every repeated game id.
ml_df =ml_df[~ml_df.index.duplicated()]

In [138]:
# tags/specs are not used by the steam_df-based functions below.
steam_df.drop(columns={'tags','specs'},inplace=True)

In [139]:
# tags/specs were already merged into 'attributes'.
ml_df.drop(columns={'tags','specs'},inplace=True)

In [36]:
# Start from the raw users/items table without the columns that are not used below.
items_df = items_raw_df.drop(columns={'items_count','user_url','steam_id'})

In [37]:
# One row per element of each 'items' entry (presumably a list of item dicts — confirm).
items_df = items_df.explode('items').apply(pd.Series)

In [38]:
# Rebuild a 0..n-1 index so the positional join two cells below lines up row by row.
items_df.reset_index(inplace=True)

In [39]:
# Discard the old index that reset_index moved into a column.
items_df.drop(columns=['index'],inplace=True)

In [40]:
# Expand every item record into columns and re-attach the owner's user_id by row position.
items_df = pd.json_normalize(items_df['items']).join(items_df['user_id'])

In [41]:
# Rows without an item_id carry no item information.
items_df.dropna(subset=['item_id'],inplace=True)

In [42]:
# Integer ids, so they can be matched against the games table index.
items_df['item_id'] = items_df['item_id'].astype(int)

In [43]:
# Only the total playtime is used below.
items_df.drop(columns={'playtime_2weeks'},inplace=True)

In [44]:
items_df.rename(columns={'playtime_forever':'playtime'},inplace=True)

Función 1: PlayTimeGenre

In [117]:
def PlayTimeGenre(genero: str,steam_games,user_items):
    """Report the release year whose games of a genre add up to the most playtime.

    Args:
        genero: Genre to look for; matched with ``in`` against each game's
            'genres' value, so the comparison is case-sensitive.
        steam_games: DataFrame indexed by game id with 'genres' and 'date' columns.
        user_items: DataFrame with 'item_id' and 'playtime' columns.

    Returns:
        A Spanish sentence naming the release year with the largest playtime total.

    Raises:
        IndexError: when no game of the genre has a usable release year.
    """
    has_genre = steam_games['genres'].apply(lambda game_genres: genero in game_genres)
    genre_games = steam_games[has_genre]

    # Total playtime per game, over all users.
    playtime_per_game = user_items.groupby('item_id')['playtime'].sum()

    yearly_ranking = (
        genre_games
        .merge(playtime_per_game, how='left', left_index=True, right_index=True)
        .assign(
            # Games nobody played count as zero.
            playtime=lambda frame: frame['playtime'].fillna(0),
            date=lambda frame: pd.to_datetime(frame['date']).dt.year,
        )
        .groupby('date')['playtime'].sum()
        .reset_index()
        .sort_values(by='playtime', ascending=False)
    )
    most_played_year = int(yearly_ranking.iloc[0]['date'])

    return f"Año de lanzamiento con más horas jugadas para el género {genero} es el: {most_played_year}"

In [118]:
play_time = {}
# Series.explode() takes no column name (its only argument is ignore_index).
# Games with an empty genre list explode to NaN, which is not a genre: drop it.
for genre in steam_df['genres'].explode().dropna().unique():
    try:
        play_time[genre.lower()]=PlayTimeGenre(genre,steam_df[['genres','date']],items_df[['item_id','playtime']])
    except IndexError:
        # PlayTimeGenre raises IndexError when the genre has no game with a
        # usable release year; such genres get no entry.
        continue

In [119]:
# Store the per-genre result strings as a parquet file indexed by genre.
play_time_table = pd.DataFrame(list(play_time.items()), columns=['genre', 'return'])
play_time_table.set_index('genre').to_parquet('playtimegenre.parquet')

Función 2: UserForGenre

In [120]:
def UserForGenre(genero: str,steam_games,user_items):
    """Find the user with the most playtime in a genre and break it down by release year.

    Args:
        genero: Genre to look for; matched with ``in`` against each game's
            'genres' value, so the comparison is case-sensitive.
        steam_games: DataFrame indexed by game id with 'genres' and 'date' columns.
        user_items: DataFrame with 'user_id', 'item_id' and 'playtime' columns.

    Returns:
        A Spanish sentence naming the top user followed by one
        'Año <year>: <hours> hora(s). ' fragment per release year with playtime
        above zero. Playtime is divided by 60 to report hours (so it is
        presumably stored in minutes).

    Raises:
        IndexError: when no user owns any game of the genre.
    """
    genre_mask = steam_games['genres'].apply(lambda game_genres: genero in game_genres)
    # assign() builds a new frame, so nothing is written into a filtered slice
    # of the caller's data (the original triggered a chained-assignment warning).
    genre_games = steam_games[genre_mask].assign(
        date=lambda games: pd.to_datetime(games['date']).dt.year
    )

    genre_items = user_items[user_items['item_id'].isin(genre_games.index)]
    playtime_per_user = (
        genre_items.groupby('user_id')['playtime'].sum()
        .reset_index()
        .sort_values(by='playtime', ascending=False)
    )
    # iloc[0] raises IndexError on an empty ranking; callers rely on that.
    user_top_id = playtime_per_user.iloc[0]['user_id']

    playtime_per_year = (
        genre_items[genre_items['user_id'] == user_top_id]
        .set_index('item_id')
        .merge(genre_games.drop(columns=['genres']), how='left', left_index=True, right_index=True)
        .groupby('date')['playtime'].sum()
        .reset_index()
    )
    playtime_per_year = playtime_per_year[playtime_per_year['playtime'] > 0]

    texto = f'Usuario con más horas jugadas para el género {genero}: {user_top_id}. Horas jugadas por año de lanzamiento: '
    texto += ''.join(
        f'Año {int(release_year)}: {round(int(minutes)/60,1)} hora(s). '
        for release_year, minutes in zip(playtime_per_year['date'], playtime_per_year['playtime'])
    )
    return texto

In [121]:
user_genre = {}
# Series.explode() takes no column name (its only argument is ignore_index).
# Games with an empty genre list explode to NaN, which is not a genre: drop it.
for genre in steam_df['genres'].explode().dropna().unique():
    try:
        user_genre[genre.lower()]=UserForGenre(genre,steam_df[['genres','date']],items_df[['user_id','item_id','playtime']])
    except IndexError:
        # UserForGenre raises IndexError when no user owns a game of the
        # genre; such genres get no entry.
        continue

In [122]:
# Store the per-genre result strings as a parquet file indexed by genre.
user_genre_table = pd.DataFrame(list(user_genre.items()), columns=['genre', 'return'])
user_genre_table.set_index('genre').to_parquet('userforgenre.parquet')

Función 3: UsersRecommend

In [123]:
def UsersRecommend( year : int,user_reviews,steam_games ):
    """Name the three most recommended games among a year's non-negative reviews.

    Reviews of the given year with sentiment above 0 are grouped by item and
    ranked by the sum of their 'recommend' flag. Items that cannot be named
    (id missing from ``steam_games`` or name missing) are skipped and the next
    ranked item is used instead.

    Args:
        year: Year of the review date.
        user_reviews: DataFrame with 'date' (datetime), 'sentiment_analysis',
            'recommend' and 'item_id' columns.
        steam_games: DataFrame indexed by game id with a 'name' column.

    Returns:
        'Puesto 1: <name>, Puesto 2: <name>, Puesto 3: <name>'.

    Raises:
        IndexError: when fewer than three games can be named for the year.
    """
    in_scope = (user_reviews['date'].dt.year == year) & (user_reviews['sentiment_analysis'] > 0)
    ranked_ids = (
        user_reviews.loc[in_scope]
        .groupby('item_id')['recommend'].sum()
        .sort_values(ascending=False)
        .index
    )

    names_by_id = steam_games['name']
    names = []
    for item_id in ranked_ids:
        try:
            name = names_by_id.loc[int(item_id)]
        except KeyError:
            # Reviewed item that is not in the games table.
            continue
        if isinstance(name, pd.Series):
            # Duplicated game id: .loc returns every match; use the first usable name.
            name = name.dropna()
            if name.empty:
                continue
            name = name.iloc[0]
        elif pd.isna(name):
            continue
        names.append(name)
        if len(names) == 3:
            break

    if len(names) < 3:
        raise IndexError(f'fewer than three recommended games with a known name for {year}')
    return 'Puesto 1: '+names[0]+', Puesto 2: '+names[1]+', Puesto 3: '+names[2]

In [124]:
users_recommend ={}
for x in reviews_df['date'].dt.year.unique():
    try:
        users_recommend[x] = UsersRecommend(x,reviews_df,steam_df)
    except IndexError:
        # UsersRecommend raises IndexError when a year has fewer than three
        # ranked games; record a placeholder instead of aborting the whole
        # export, mirroring the handling used for UsersNotRecommend.
        users_recommend[x] = 'No hay suficientes juegos recomendados para este año'

In [125]:
# Store the per-year result strings as a parquet file indexed by year.
users_recommend_table = pd.DataFrame(list(users_recommend.items()), columns=[ 'year','return'])
users_recommend_table.set_index('year').to_parquet('usersrecommend.parquet')

Función 4: UsersNotRecommend

In [126]:
def UsersNotRecommend( year : int,user_reviews,steam_games ):
    """Name the three games with the most negative, non-recommending reviews in a year.

    Reviews of the given year with sentiment 0 and recommend 0 are counted per
    item. The ten most frequent items are the candidates; candidates that
    cannot be named (id missing from ``steam_games`` or name missing) are
    skipped.

    Args:
        year: Year of the review date.
        user_reviews: DataFrame with 'date' (datetime), 'sentiment_analysis',
            'recommend' and 'item_id' columns.
        steam_games: DataFrame indexed by game id with a 'name' column.

    Returns:
        'Puesto 1: <name>, Puesto 2: <name>, Puesto 3: <name>'.

    Raises:
        IndexError: when fewer than three of the candidates can be named.
    """
    in_scope = (
        (user_reviews['date'].dt.year == year)
        & (user_reviews['sentiment_analysis'] == 0)
        & (user_reviews['recommend'] == 0)
    )
    # Read the ids from the value_counts() index: this does not depend on how
    # reset_index() names its columns in a given pandas version.
    candidate_ids = user_reviews.loc[in_scope, 'item_id'].value_counts().head(10).index

    names_by_id = steam_games['name']
    names = []
    for item_id in candidate_ids:
        try:
            name = names_by_id.loc[int(item_id)]
        except KeyError:
            # Reviewed item that is not in the games table.
            continue
        if isinstance(name, pd.Series):
            # Duplicated game id: .loc returns every match; use the first usable name.
            name = name.dropna()
            if name.empty:
                continue
            name = name.iloc[0]
        elif pd.isna(name):
            continue
        names.append(name)

    if len(names) < 3:
        raise IndexError(f'fewer than three not-recommended games with a known name for {year}')
    return 'Puesto 1: '+names[0]+', Puesto 2: '+names[1]+', Puesto 3: '+names[2]

In [127]:
# One result string per review year; years for which UsersNotRecommend cannot
# name three games (IndexError) get a fixed placeholder text.
users_nrecommend = {}
for review_year in reviews_df['date'].dt.year.unique():
    try:
        ranking_text = UsersNotRecommend(review_year, reviews_df, steam_df)
    except IndexError:
        ranking_text = 'No hay juegos no recomendados para este año'
    users_nrecommend[review_year] = ranking_text

In [128]:
# Store the per-year result strings as a parquet file indexed by year.
users_nrecommend_table = pd.DataFrame(list(users_nrecommend.items()), columns=[ 'year','return'])
users_nrecommend_table.set_index('year').to_parquet('usersnotrecommend.parquet')

Función 5: sentiment_analysis

In [129]:
def sentiment_analysis( year : int,df ):
    """Count the reviews of a year per sentiment label.

    Args:
        year: Year of the review date.
        df: DataFrame with 'date' (datetime) and 'sentiment_analysis'
            (0 = negative, 1 = neutral, 2 = positive) columns; any other
            column is ignored.

    Returns:
        'Negativo = <n>, Neutral = <n>, Positivo = <n>'. A label with no
        reviews in that year is reported as 0.
    """
    labels_of_year = df.loc[df['date'].dt.year == year, 'sentiment_analysis']
    # reindex() guarantees all three labels exist, so a year lacking one of
    # them yields a zero instead of a KeyError.
    counts = labels_of_year.value_counts().reindex([0, 1, 2], fill_value=0)
    return 'Negativo = '+str(counts.loc[0])+', Neutral = '+str(counts.loc[1])+', Positivo = '+str(counts.loc[2])

In [130]:
# One sentiment summary string per year found in the review dates.
sentiment = {
    review_year: sentiment_analysis(review_year, reviews_df)
    for review_year in reviews_df['date'].dt.year.unique()
}


In [131]:
# Store the per-year result strings as a parquet file indexed by year.
sentiment_table = pd.DataFrame(list(sentiment.items()), columns=[ 'year','return'])
sentiment_table.set_index('year').to_parquet('sentiment_analysis.parquet')

Modelo de recomendación ítem-ítem

In [143]:
# Identity tokenizer with lowercasing disabled: every document is iterated as-is,
# and each element it yields is counted as one token.
# NOTE(review): this only counts whole attributes if each document is a list of
# attributes; a plain string document would be counted character by character.
vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False)


In [144]:
# Keep every game's attributes as a list of lower-cased tokens. The vectorizer
# uses an identity tokenizer, so each document must be an iterable of tokens:
# joining the list into one string would make it count single characters
# instead of attributes (and would split names such as multi-word tags).
# sorted(set(...)) merges attributes that differ only in letter case.
ml_df['attributes'] = ml_df['attributes'].apply(lambda lista: sorted({str(attribute).lower() for attribute in lista}))

In [145]:
# Learn the vocabulary from the 'attributes' column and build the game x token count matrix.
matrix = vectorizer.fit_transform(ml_df['attributes'])




In [146]:
# Pairwise cosine similarity between all games: entry [i, j] compares row i with row j of `matrix`.
similarity_matrix = cosine_similarity(matrix, matrix)

Función 6: get_recommendations

In [147]:
def get_recommendations(game_id, similarity_matrix, df, top_n=5):
    """List the names of the games most similar to a given game.

    Args:
        game_id: Index label of the game in ``df``.
        similarity_matrix: Square array whose row/column order matches the
            row order of ``df``; entry [i, j] is the similarity of rows i and j.
        df: DataFrame with a 'name' column.
        top_n: Maximum number of games to list.

    Returns:
        'Juegos recomendados: ' followed by '<name>. ' for each recommended
        game, most similar first.

    Raises:
        KeyError: when ``game_id`` is not in ``df.index``.
    """
    game_index = df.index.get_loc(game_id)
    ranked = similarity_matrix[game_index].argsort()[::-1]
    # Remove the queried game explicitly instead of assuming it sorts first:
    # with ties (identical attributes) or an all-zero row it can land anywhere
    # in the ranking, and the game would then recommend itself.
    ranked = ranked[ranked != game_index][:top_n]
    similar_names = df.iloc[ranked]['name'].values
    return 'Juegos recomendados: ' + ''.join(name + '. ' for name in similar_names)

In [148]:
# One recommendation string per game id in the model table.
recommendations = {
    game_id: get_recommendations(game_id, similarity_matrix, ml_df)
    for game_id in ml_df.index.tolist()
}


In [154]:
# Store the per-game result strings as a parquet file indexed by game id.
recommendations_table = pd.DataFrame(list(recommendations.items()), columns=[ 'id','return'])
recommendations_table.set_index('id').to_parquet('recommendations.parquet')