In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec
from time import time
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from gensim.models.phrases import Phrases, Phraser
import string

In [2]:
# Function to reduce the memory usage of a DataFrame.
def reduce_memory(df):
    for col in df.columns:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
    return df

# Generator function to load data in chunks.
def data_generator(df, chunksize=10000):
    for i in range(0, df.shape[0], chunksize):
        yield df.iloc[i:i+chunksize]

In [3]:
df = reduce_memory(pd.read_csv("data/Dataset.csv"))

In [4]:
df.head()

Unnamed: 0,UserID,Game,purchase/play,Heure_jouee,AppID,Release date,Estimated owners,Peak CCU,Required age,Price,...,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Score,Recommandable,GameID
0,1,The Elder Scrolls V Skyrim,play,273.0,72850,"Nov 10, 2011",5000000 - 10000000,4383,17,19.99,...,50.533333,14.633333,Bethesda Game Studios,Bethesda Softworks,"Single-player,Steam Achievements,Steam Trading...",RPG,"Open World,RPG,Fantasy,Adventure,Dragons,Modda...",5.0,True,1
1,2,The Elder Scrolls V Skyrim,play,58.0,72850,"Nov 10, 2011",5000000 - 10000000,4383,17,19.99,...,50.533333,14.633333,Bethesda Game Studios,Bethesda Softworks,"Single-player,Steam Achievements,Steam Trading...",RPG,"Open World,RPG,Fantasy,Adventure,Dragons,Modda...",5.0,True,1
2,3,The Elder Scrolls V Skyrim,play,110.0,72850,"Nov 10, 2011",5000000 - 10000000,4383,17,19.99,...,50.533333,14.633333,Bethesda Game Studios,Bethesda Softworks,"Single-player,Steam Achievements,Steam Trading...",RPG,"Open World,RPG,Fantasy,Adventure,Dragons,Modda...",5.0,True,1
3,4,The Elder Scrolls V Skyrim,play,465.0,72850,"Nov 10, 2011",5000000 - 10000000,4383,17,19.99,...,50.533333,14.633333,Bethesda Game Studios,Bethesda Softworks,"Single-player,Steam Achievements,Steam Trading...",RPG,"Open World,RPG,Fantasy,Adventure,Dragons,Modda...",5.0,True,1
4,5,The Elder Scrolls V Skyrim,play,220.0,72850,"Nov 10, 2011",5000000 - 10000000,4383,17,19.99,...,50.533333,14.633333,Bethesda Game Studios,Bethesda Softworks,"Single-player,Steam Achievements,Steam Trading...",RPG,"Open World,RPG,Fantasy,Adventure,Dragons,Modda...",5.0,True,1


In [5]:
from scipy.sparse import coo_matrix

# map each user and item to a unique numeric value
user_ids = df['UserID'].astype('category').cat.codes
item_ids = df['GameID'].astype('category').cat.codes

# Get the unique user and game ids
unique_user_ids = df['UserID'].astype('category').cat.categories
unique_item_ids = df['GameID'].astype('category').cat.categories

# create a sparse matrix
user_game_matrix = coo_matrix((df['Heure_jouee'], (user_ids, item_ids)))

# Fit the model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_game_matrix)

# Get top 5 recommendations for first user
distances, indices = model_knn.kneighbors(user_game_matrix.getrow(0), n_neighbors=6)
recommended_users = [unique_user_ids[i] for i in indices.flatten()[1:]]
print(f'Recommended users for the first user are: {recommended_users}')

Recommended users for the first user are: [332, 338, 598, 370, 140]


In [6]:
## On analyse le UserID n°1

# df[df["UserID"] == 1]

## L'algo nous indique que les UserID les plus proches sont les 5 suivants

# df[df["UserID"] == 332]
# df[df["UserID"] == 338]
# df[df["UserID"] == 598]
# df[df["UserID"] == 370]
# df[df["UserID"] == 140]

In [7]:
df_w2v = df[['Game', 'About the game']].drop_duplicates()
df_w2v

Unnamed: 0,Game,About the game
0,The Elder Scrolls V Skyrim,EPIC FANTASY REBORN The next chapter in the hi...
677,Fallout 4,"Bethesda Game Studios, the award-winning creat..."
844,Fallout New Vegas,Welcome to Vegas. New Vegas. It’s the kind of ...
1131,Left 4 Dead 2,"Set in the zombie apocalypse, Left 4 Dead 2 (L..."
1932,HuniePop,"HuniePop is a unique sim experience for PC, Ma..."
...,...,...
56784,The Bug Butcher,The Bug Butcher is an action-packed 2D side sc...
56785,Romance of the Three Kingdoms Maker,■ Characteristics of 'Romance of the Three Kin...
56786,Life is Hard,Life is Hard is a godsim town simulator game. ...
56787,Executive Assault,Executive Assault is an indie real-time strate...


In [8]:
# Prétraitement de la colonne 'About the game'
def preprocess_description(description):
    if pd.isna(description):  # Vérifier si la valeur est NaN
        return ""
    description = description.lower()
    description = description.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(description)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # stemmer = PorterStemmer()
    # tokens = [stemmer.stem(word) for word in tokens]
    return tokens
    # return ' '.join(tokens)

df_w2v['About the game'] = df_w2v['About the game'].apply(lambda x: preprocess_description(x))

# Construction des bigrams
# phrases = Phrases(df_w2v['About the game'])
# bigram = Phraser(phrases)
# df_w2v['About the game'] = df_w2v['About the game'].apply(lambda x: bigram[x])

# Réassemblage des tokens en texte
df_w2v['About the game'] = df_w2v['About the game'].apply(lambda x: ' '.join(x))

df_w2v.head()


Unnamed: 0,Game,About the game
0,The Elder Scrolls V Skyrim,epic fantasy reborn next chapter highly antici...
677,Fallout 4,bethesda game studio awardwinning creator fall...
844,Fallout New Vegas,welcome vega new vega ’ kind town dig grave pr...
1131,Left 4 Dead 2,set zombie apocalypse left 4 dead 2 l4d2 highl...
1932,HuniePop,huniepop unique sim experience pc mac linux ga...


In [53]:
# Entraînement du modèle Word2Vec
tokenized_descriptions = [word_tokenize(desc) for desc in df_w2v['About the game']]
w2v_model = Word2Vec(
    sentences=tokenized_descriptions,
    vector_size=200,
    window=4,
    min_count=5,
    workers=8,
    batch_words=2600,
    negative=15,
    epochs=25
)

In [54]:
# Fonction pour obtenir les jeux recommandés
def get_recommendations(input_title, df_w2v, w2v_model, top_n=5):
    # Récupérer la description du jeu d'entrée
    input_description = df_w2v[df_w2v['Game'] == input_title]['About the game'].values[0]

    # Tokenization et bigrams
    input_tokens = word_tokenize(input_description)
    # input_tokens = bigram[input_tokens]

    # Supprimer les mots absents du vocabulaire du modèle
    input_tokens = [token for token in input_tokens if token in w2v_model.wv.key_to_index]

    # Calculer les similarités avec toutes les autres descriptions
    similarities = {}
    for idx, row in df_w2v.iterrows():
        other_title = row['Game']
        other_description = row['About the game']

        # Exclure le jeu d'entrée
        if other_title == input_title:
            continue

        # Tokenization et bigrams
        other_tokens = word_tokenize(other_description)
        # other_tokens = bigram[other_tokens]

        # Supprimer les mots absents du vocabulaire du modèle
        other_tokens = [token for token in other_tokens if token in w2v_model.wv.key_to_index]

        # Calculer la similarité avec le jeu d'entrée
        if input_tokens and other_tokens:
            similarity = w2v_model.wv.n_similarity(input_tokens, other_tokens)
            similarities[other_title] = similarity

    # Sélection des jeux les plus similaires
    similar_titles = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return similar_titles

In [55]:
# input_title = "The Elder Scrolls V Skyrim"
# input_title = "Call of Duty Black Ops"
input_title = "Fallout 4"
recommendations = get_recommendations(input_title, df_w2v, w2v_model)

print(f"Jeux recommandés pour {input_title}:")
for title, similarity in recommendations:
    print(f"{title} - Similarité : {similarity}")

Jeux recommandés pour Fallout 4:
Darksiders II Deathinitive Edition - Similarité : 0.851898193359375
Fallout 3 - Similarité : 0.8201559782028198
Krater - Similarité : 0.819952666759491
Aura Kingdom - Similarité : 0.8099674582481384
Prime World - Similarité : 0.8075505495071411
