In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import string
import random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/martin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv('details.csv')
df_avis = pd.read_csv('avis.csv')

In [4]:
stop_words = set(stopwords.words('french'))

def preprocess(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = nltk.word_tokenize(text, language='french')
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['processed_description'] = df['description'].apply(preprocess)

In [6]:
# Feature Extraction
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['processed_description'])

# Similarity Measure
cosine_sim_description = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [7]:
# Split the 'categories' column and explode it into separate rows
split_categories = df['categories'].str.split(' | ').explode()

# Remove leading/trailing whitespace and get unique categories
unique_categories = split_categories.str.strip().unique()

# Convert the result into a list
unique_category_list = list(unique_categories)

In [8]:
# Assuming df is your DataFrame and it has a 'categories' column
df['categories'] = df['categories'].fillna('No Category').apply(lambda x: x.split('|'))

# Explode the categories into separate rows for each category per game
split_categories = df.explode('categories')

# Remove leading/trailing whitespace in categories
split_categories['categories'] = split_categories['categories'].str.strip()

# Get unique categories (no need to explicitly convert to list for MultiLabelBinarizer)
unique_categories = split_categories['categories'].unique()

In [9]:
# Reset 'categories' column to lists for MultiLabelBinarizer compatibility
df['categories'] = df['categories'].apply(lambda categories: [category.strip() for category in categories])

# Initialize and fit the MultiLabelBinarizer
mlb = MultiLabelBinarizer()
categories_matrix = mlb.fit_transform(df['categories'])

# Create a DataFrame for the encoded categories
categories_df = pd.DataFrame(categories_matrix, columns=mlb.classes_)


# Calculate cosine similarity matrix
cosine_sim_categories = cosine_similarity(categories_df)

In [11]:
def hybrid_recommend(title, df, category_cosine_sim, description_cosine_sim, weights={'category': 0.5, 'description': 0.5}):
    # Assume both similarity matrices are aligned and have the same shape
    idx = df.index[df['full_title'] == title].tolist()[0]
    
    # Get similarity scores from both systems
    category_scores = category_cosine_sim[idx]
    description_scores = description_cosine_sim[idx]
    
    # Combine scores
    combined_scores = weights['category'] * category_scores + weights['description'] * description_scores
    
    # Rank games based on combined scores
    ranked_indices = combined_scores.argsort()[::-1][1:11]  # Exclude the first one (the game itself)
    
    # Return the top 10 recommended titles
    return df['full_title'].iloc[ranked_indices]

In [19]:
weights={'category':0.4, 'description':0.6}

In [20]:
hybrid_recommend('Babel (2013)',df,cosine_sim_categories, cosine_sim_description,weights)

5699                   Babel (2013)
7004             Tetris Link (2012)
16326                   Gods (2001)
5716              Antique II (2014)
2554              Antique II (2014)
18522    Alexander the Great (2005)
14689           Gloria Mundi (2006)
12716              Beim Zeus (1997)
17537                Arkaios (2004)
16003                 Angkor (2005)
Name: full_title, dtype: object