In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
import SteamVariables as sv
from sklearn.utils import shuffle
import nltk
import sys
from nltk.sentiment import SentimentIntensityAnalyzer
from importlib import reload
##nltk download
nltk.download('vader_lexicon')
reload(sv)

df = pd.read_csv(sv.CSV_PATH, nrows=800000)
df = df.dropna()

# sys.stdout.reconfigure(encoding='utf-8')

languages = ["bulgarian", "croatian", "danish", "czech", "slovak", "slovenian", "slovak", "slovenian", 
"spanish", "estonian", "finnish", "french", "greek", "hungarian", "irish", "italian", 
"latvian", "lithuanian", "maltese", "dutch", "polish", "portuguese", "romanian", "swedish", "english", "brazilian"]

# Eliminar linhas com linguagem diferente das selecionadas
df = df[df[sv.LANGUAGE].isin(languages)]

def sentiment(review):
    sid = SentimentIntensityAnalyzer()
    score = sid.polarity_scores(review)
    return score

df = shuffle(df,random_state=60)

print(df[sv.APP_NAME])

sample_dt = df.sample(n=50000)
df = pd.DataFrame(sample_dt)

df["review_score"] = df[sv.REVIEW].apply(lambda x: sentiment(x))

feature_columnsNew= [sv.AUTHOR_STEAMID,sv.APP_NAME, sv.REVIEW_SCORE]

new_dt = df[feature_columnsNew]
print(new_dt)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Miguel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


368718    The Witcher 3: Wild Hunt
742994                    Portal 2
465576    The Witcher 3: Wild Hunt
763279                    Portal 2
266150    The Witcher 3: Wild Hunt
                    ...           
798756                    Portal 2
372958    The Witcher 3: Wild Hunt
528019      Counter-Strike: Source
486706                   Half-Life
738996                    Portal 2
Name: app_name, Length: 430838, dtype: object
           author.steamid                  app_name  \
439354  76561197992038430  The Witcher 3: Wild Hunt   
492805  76561198161735596                 Half-Life   
10563   76561198439276675  The Witcher 3: Wild Hunt   
68642   76561198848912211  The Witcher 3: Wild Hunt   
763384  76561198378147992                  Portal 2   
...                   ...                       ...   
516034  76561198120992221                 Half-Life   
647962  76561198392831930  Half-Life 2: Episode Two   
512671  76561198117134220                 Half-Life   
794244  76561198308

In [4]:
from surprise import KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import SVD
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate

# df[sv.REVIEW_SCORE] = df[sv.REVIEW_SCORE].apply(lambda x: x["compound"])

# Função para converter os valores
def converter_valor(valor):
    if isinstance(valor, dict):
        compound_score = valor.get("compound", 0)
        if compound_score < -0.5:
            return 1
        elif -0.5 <= compound_score < 0:
            return 2
        elif 0 <= compound_score < 0.4:
            return 3
        elif 0.4 <= compound_score < 0.8:
            return 4
        elif compound_score >= 0.8:
            return 5
    return 0

# Aplicar a função na coluna e criar uma nova coluna com os valores convertidos
df['review_score'] = df['review_score'].apply(converter_valor)

feature_columnsNew= [sv.AUTHOR_STEAMID,sv.APP_NAME, "review_score"]
new_dt = df[feature_columnsNew]

reader = Reader(rating_scale=(0, 1))
print(new_dt)
data= Dataset.load_from_df(new_dt,reader=reader)
trainSet, testSet = train_test_split(data,test_size=0.2,random_state=60)
algo = SVD()
print(data)
algo.fit(trainSet)
predictions = algo.test(testSet)
accuracy.rmse(predictions)
accuracy.mae(predictions)
cross_validate(algo,data,measures=['RMSE','MAE'],cv = 5, verbose=True)


           author.steamid                  app_name  review_score
439354  76561197992038430  The Witcher 3: Wild Hunt             5
492805  76561198161735596                 Half-Life             3
10563   76561198439276675  The Witcher 3: Wild Hunt             3
68642   76561198848912211  The Witcher 3: Wild Hunt             4
763384  76561198378147992                  Portal 2             3
...                   ...                       ...           ...
516034  76561198120992221                 Half-Life             3
647962  76561198392831930  Half-Life 2: Episode Two             3
512671  76561198117134220                 Half-Life             3
794244  76561198308143172                  Portal 2             3
766379  76561198360809390                  Portal 2             2

[50000 rows x 3 columns]
<surprise.dataset.DatasetAutoFolds object at 0x0000022722067FD0>
RMSE: 2.7313
MAE:  2.5450
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fol

{'test_rmse': array([2.74588783, 2.72759968, 2.71746205, 2.72360056, 2.7278563 ]),
 'test_mae': array([2.5657, 2.5444, 2.5338, 2.545 , 2.5454]),
 'fit_time': (0.3551919460296631,
  0.3228931427001953,
  0.3293416500091553,
  0.3867363929748535,
  0.3535933494567871),
 'test_time': (0.030050039291381836,
  0.029897212982177734,
  0.03346419334411621,
  0.037116289138793945,
  0.03396725654602051)}

In [5]:
pred = algo.predict(uid=72,iid=322110)
print(pred)

user: 72         item: 322110     r_ui = None   est = 1.00   {'was_impossible': False}


In [6]:
import difflib
import random



def get_game_id(book_title, metadata):
    
    """
    Gets the game ID for a game title based on the closest match in the metadata dataframe.
    """
    
    existing_titles = list(metadata[sv.APP_NAME].values)
    closest_titles = difflib.get_close_matches(book_title, existing_titles)
    book_id = metadata[metadata[sv.APP_NAME] == closest_titles[0]][sv.APP_ID].values[0]
    return book_id

def get_game_info(book_id, metadata):
    
    """
    Returns some basic information about a book given the book id and the metadata dataframe.
    """
    
    book_info = metadata[metadata[sv.APP_ID] == book_id][[sv.APP_ID, sv.APP_NAME]]

    game_info = book_info["app_name"].values[0]
    
    return game_info

def predict_review(user_id, book_title, model, metadata):
    
    """
    Predicts the review (on a scale of 1-5) that a user would assign to a specific book. 
    """
    
    book_id = get_game_id(book_title, metadata)
    review_prediction = model.predict(uid=user_id, iid=book_id)
    return review_prediction.est

def generate_recommendation(user_id, model, metadata, thresh=0.15):
    jogos = []
    """
    Generates a book recommendation for a user based on a rating threshold. Only
    books with a predicted rating at or above the threshold will be recommended
    """
    
    lista_app = list(df['app_name'].unique())
    random.shuffle(lista_app)
    
    i = 0
    for book_title in lista_app:
        rating = predict_review(user_id, book_title, model, metadata)       

        if rating >= thresh:
            book_id = get_game_id(book_title, metadata)
            jogos.append(get_game_info(book_id, metadata))
            i += 1
        if i == 3:
            return jogos
    return jogos


In [11]:

print(generate_recommendation(76561198120121330,algo,df))

['Counter-Strike: Source', 'The Witcher 3: Wild Hunt', 'Half-Life 2: Episode Two']
