In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data understanding

https://www.kaggle.com/datasets/gregorut/videogamesales

In [49]:
df = pd.read_csv('./data/vgsales.csv')

# keeping only the needed features
df = df[['Name', 'Platform', 'Genre', 'Publisher']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       16598 non-null  object
 1   Platform   16598 non-null  object
 2   Genre      16598 non-null  object
 3   Publisher  16540 non-null  object
dtypes: object(4)
memory usage: 518.8+ KB


In [50]:
df.nunique()

Name         11493
Platform        31
Genre           12
Publisher      578
dtype: int64

In [51]:
# checking for null values
df.isna().sum()

Name          0
Platform      0
Genre         0
Publisher    58
dtype: int64

In [52]:
df[df.isna().any(axis=1)]

Unnamed: 0,Name,Platform,Genre,Publisher
470,wwe Smackdown vs. Raw 2006,PS2,Fighting,
1303,Triple Play 99,PS,Sports,
1662,Shrek / Shrek 2 2-in-1 Gameboy Advance Video,GBA,Misc,
2222,Bentley's Hackpack,GBA,Misc,
3159,Nicktoons Collection: Game Boy Advance Video V...,GBA,Misc,
3166,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,Misc,
3766,SpongeBob SquarePants: Game Boy Advance Video ...,GBA,Misc,
4145,Sonic the Hedgehog,PS3,Platform,
4526,The Fairly Odd Parents: Game Boy Advance Video...,GBA,Misc,
4635,The Fairly Odd Parents: Game Boy Advance Video...,GBA,Misc,


games with null values on Publisher column, they have 'Game Boy Advance Video Volume' in the name, so they are not actually games, those are tv shows sold to be played on the gameboy console, will drop those

## Data Preparation
**Cleaning**

In [53]:
# droping null values
df.dropna(inplace=True)

In [59]:
df

Unnamed: 0,Name,Platform,Genre,Publisher,combined_features
0,Wii Sports,Wii,Sports,Nintendo,Wii Sports Nintendo
1,Super Mario Bros.,NES,Platform,Nintendo,NES Platform Nintendo
2,Mario Kart Wii,Wii,Racing,Nintendo,Wii Racing Nintendo
3,Wii Sports Resort,Wii,Sports,Nintendo,Wii Sports Nintendo
4,Pokemon Red/Pokemon Blue,GB,Role-Playing,Nintendo,GB Role-Playing Nintendo
...,...,...,...,...,...
16593,Woody Woodpecker in Crazy Castle 5,GBA,Platform,Kemco,GBA Platform Kemco
16594,Men in Black II: Alien Escape,GC,Shooter,Infogrames,GC Shooter Infogrames
16595,SCORE International Baja 1000: The Official Game,PS2,Racing,Activision,PS2 Racing Activision
16596,Know How 2,DS,Puzzle,7G//AMES,DS Puzzle 7G//AMES


In [55]:
def combined_features(row):
    return row['Platform']+" "+row['Genre']+" "+row['Publisher']
df["combined_features"] = df.apply(combined_features, axis =1)

In [57]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [58]:
cosine_sim = cosine_similarity(count_matrix)

In [72]:
games_user_likes = "Mario Kart Wii"
def get_index_from_title(title):
    return df[df.Name == title].index.values[0]
games_index = get_index_from_title(games_user_likes)

In [73]:
similar_games = list(enumerate(cosine_sim[games_index]))

In [74]:
sorted_similar_games = sorted(similar_games, key=lambda x:x[1], reverse=True)

In [75]:
def get_title_from_index(index):
    return df[df.index == index]["Name"].values[0]
i=0
for game in sorted_similar_games:
    print(get_title_from_index(game[0]))
    i=i+1
    if i>15:
        break

Mario Kart Wii
LEGO Dimensions
Tiger Woods PGA Tour 08
One Piece Unlimited Cruise 1: The Treasure Beneath the Waves
Wii Sports
Wii Sports Resort
Wii Play
New Super Mario Bros. Wii
Mario Kart DS
Wii Fit
Wii Fit Plus
Super Smash Bros. Brawl
Mario Kart 7
Super Mario Galaxy
Mario Kart 64
Super Mario Kart
