USER TO ITEM RECOMMENDATION SYSTEM

In [1]:
import ast
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
model_data = pd.read_csv("data/steam_games.csv", index_col=0, parse_dates=["date"])
model_data['genres'] = model_data['genres'].apply(ast.literal_eval)
model_data.drop(columns="developer", inplace=True)
model_data.drop(columns="url", inplace=True)
model_data

Unnamed: 0,game_id,date,title,price,genres,score,playtime_forever,playtime_2weeks,downloads
0,227380.0,1983-06-19,Dragon's Lair,9.99,"[Adventure, Action, FMV, Casual, Quick-Time Ev...",,3582.0,0.0,103.0
1,240340.0,1984-04-29,Space Ace,9.99,"[Adventure, Casual, Action, Quick-Time Events,...",,384.0,0.0,18.0
2,517930.0,1984-11-01,The Castles of Dr. Creep,1.99,"[Casual, Indie, Puzzle-Platformer, Retro, Puzz...",,,,
3,329660.0,1985-01-01,Silent Service,6.99,"[Simulation, Classic]",,,,
4,664780.0,1986-05-01,Alter Ego,7.99,"[RPG, Casual, Indie, Adventure, Simulation, Te...",,,,
...,...,...,...,...,...,...,...,...,...
31332,755830.0,NaT,Lonely Astronaut,0.00,"[Casual, Action, Indie, Simulation]",,,,
31333,708070.0,NaT,RECHARGE COMPLETE,0.00,"[Early Access, Action, Adventure, Indie]",,,,
31334,250440.0,NaT,Tetrapulse,0.00,"[Indie, Action, Co-op]",,,,
31335,772180.0,NaT,Cricket Club,0.00,"[Early Access, Indie, Casual, Simulation, Sports]",,,,


In [2]:
# Initialize an empty list to store all unique genres
all_genres = []

# Iterate through the 'genres' column of the 'steam_games' DataFrame
for genres in model_data['genres']:
    if genres is not None:
        for genre in genres:
            all_genres.append(genre)

# Create a DataFrame and count how many games belong to each genre
all_genres_df = pd.DataFrame(all_genres, columns=['genre'])
genre_counts = all_genres_df['genre'].value_counts().reset_index()
genre_counts.columns = ['genre', 'count']

# Initialize columns for classifying genres.
model_data['popular_genres'] = None
model_data['common_genres'] = None
model_data['unpopular_genres'] = None

# Iterate through the 'model_data' DataFrame to classify genres
for index, row in model_data.iterrows():
    popular_genres = []
    common_genres = []
    unpopular_genres = []
    if row['genres'] is not None:
        for genre in row['genres']:
            count = genre_counts['count'][genre_counts['genre'] == genre].values[0]
            if count >= 2500:
                popular_genres.append(genre)
            elif count >= 100:
                common_genres.append(genre)
            else:
                unpopular_genres.append(genre)
    # Assign the categorized genre lists to the respective columns
    model_data.at[index, 'popular_genres'] = popular_genres
    model_data.at[index, 'common_genres'] = common_genres
    model_data.at[index, 'unpopular_genres'] = unpopular_genres

model_data.drop(columns="genres", inplace=True)

model_data['popular_genres'] = model_data['popular_genres'].apply(lambda x: ' '.join(x))
model_data['common_genres'] = model_data['common_genres'].apply(lambda x: ' '.join(x))
model_data['unpopular_genres'] = model_data['unpopular_genres'].apply(lambda x: ' '.join(x))

model_data['popular_genres'].fillna('', inplace=True)
model_data['common_genres'].fillna('', inplace=True)
model_data['unpopular_genres'].fillna('', inplace=True)

In [3]:
# Discretise price values
bins = [0, 2, 5, 10, 20, 30, 40, 50, 100, float('inf')]
labels = ["0-2", "2-5", "5-10", "10-20", "20-30", "30-40", "40-50", "50-100", "100+"]
model_data['price_discr'] = pd.cut(model_data['price'], bins=bins, labels=labels)

model_data["price_discr"].replace("0-2", 0, inplace=True)
model_data["price_discr"].replace("2-5", 1, inplace=True)
model_data["price_discr"].replace("5-10", 1, inplace=True)
model_data["price_discr"].replace("10-20", 2, inplace=True)
model_data["price_discr"].replace("20-30", 3, inplace=True)
model_data["price_discr"].replace("30-40", 3, inplace=True)
model_data["price_discr"].replace("40-50", 3, inplace=True)
model_data["price_discr"].replace("50-100", 3, inplace=True)
model_data["price_discr"].replace("100+", 3, inplace=True)

In [6]:
model_data.drop(columns=["playtime_forever", "playtime_2weeks", "downloads", "price"], inplace=True)
model_data["score"].fillna(0, inplace=True)

Unnamed: 0,game_id,date,title,price,score,popular_genres,common_genres,unpopular_genres,price_discr
0,227380.0,1983-06-19,Dragon's Lair,9.99,,Adventure Action Casual,Arcade Classic Retro Difficult Short,FMV Quick-Time Events 1980s Cartoon,1
1,240340.0,1984-04-29,Space Ace,9.99,,Adventure Casual Action,Classic,Quick-Time Events FMV,1
2,517930.0,1984-11-01,The Castles of Dr. Creep,1.99,,Casual Indie,Puzzle-Platformer Retro Puzzle Medieval,,0
3,329660.0,1985-01-01,Silent Service,6.99,,Simulation,Classic,,1
4,664780.0,1986-05-01,Alter Ego,7.99,,RPG Casual Indie Adventure Simulation,Text-Based,,1
...,...,...,...,...,...,...,...,...,...
31332,755830.0,NaT,Lonely Astronaut,0.00,,Casual Action Indie Simulation,,,
31333,708070.0,NaT,RECHARGE COMPLETE,0.00,,Action Adventure Indie,Early Access,,
31334,250440.0,NaT,Tetrapulse,0.00,,Indie Action,Co-op,,
31335,772180.0,NaT,Cricket Club,0.00,,Indie Casual Simulation,Early Access Sports,,


In [7]:
model_data.to_csv("data/model_data.csv")

In [27]:
def recommend_similar_games(title):

    # Find game index
    game_index = model_data[model_data['title'] == title].index[0]

    # Filter games bases on date
    year = model_data["date"].dt.year[game_index]
    five_years_ago = year - 5
    five_years_later = year + 5
    filtered_data = model_data[(model_data["date"].dt.year >= five_years_ago) & (model_data["date"].dt.year <= five_years_later)]

    # Filter games based on price
    price = model_data["price_discr"][game_index]
    upper_price = price + 1
    lower_price = price - 1
    filtered_data = filtered_data[(filtered_data["price_discr"] >= lower_price) & (filtered_data["price_discr"] <= upper_price)]
    
    # Filter games based on score
    score = model_data["score"][game_index]
    upper_score = score + 1
    lower_score = score - 1
    filtered_data = filtered_data[(filtered_data["score"] >= lower_score) & (filtered_data["score"] <= upper_score)]

    # Find new game index
    filtered_data = filtered_data.reset_index(drop=True)
    new_game_index = filtered_data[filtered_data['title'] == title].index[0]

    # Create similarity matrix based on popular genres for filtered_data
    popular_genres_matrix = vectorizer.fit_transform(filtered_data['popular_genres'])
    popular_genres_similarity_matrix = cosine_similarity(popular_genres_matrix, popular_genres_matrix)
    # Create similarity matrix based on common genres for filtered_data
    common_genres_matrix = vectorizer.fit_transform(filtered_data['common_genres'])
    common_genres_similarity_matrix = cosine_similarity(common_genres_matrix, common_genres_matrix)
    # Create similarity matrix based on unpopular genres for filtered_data
    unpopular_genres_matrix = vectorizer.fit_transform(filtered_data['unpopular_genres'])
    unpopular_genres_similarity_matrix = cosine_similarity(unpopular_genres_matrix, unpopular_genres_matrix)
    # Combine the similarity matrices for the three genre categories with choosen importance
    similarity_matrix = popular_genres_similarity_matrix + common_genres_similarity_matrix * 1.5 + unpopular_genres_similarity_matrix * 2

    # Adjust the game indices for the filtered_data
    similar_game_indices = similarity_matrix[new_game_index].argsort()[::-1][1:6]
    # Obtain the names of similar games and their similarity scores
    similar_game_scores = similarity_matrix[new_game_index][similar_game_indices]
    similar_game_titles = filtered_data.iloc[similar_game_indices]['title'].tolist()

    # Create a DataFrame with titles and similarity scores
    similar_games_df = pd.DataFrame({'combined_score': similar_game_scores, 'title': similar_game_titles})

    return similar_games_df

similar_games = recommend_similar_games("Counter-Strike")
similar_games

Unnamed: 0,combined_score,title
0,2.404507,Counter-Strike: Source
1,1.505094,Quake III Arena
2,1.416758,Counter-Strike: Condition Zero
3,1.317101,Commandos: Behind Enemy Lines
4,1.303151,STAR WARS™ Republic Commando™
