My original dataset, which I webscraped from the same website, consisted of the first 100 pages of the Metacritic All Time Game Scores, ordered by MetaScore.

After finding this more expansive dataset, I've decided to implement a content-based recommender system with this dataset, mostly guided the Data camp tutorial here:
https://www.datacamp.com/community/tutorials/recommender-systems-python

Link to the origin page of the dataset is provided here: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from scipy import stats
from sklearn.metrics.pairwise import linear_kernel # for cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# load data

data = pd.read_csv('all_games.csv')

data

#len(data['title'].unique())

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9
...,...,...,...,...,...,...
18795,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",Fast & Furious: Showdown takes some of the fra...,22,1.3
18796,Drake of the 99 Dragons,Xbox,"November 3, 2003",Drake is out for revenge in a supernatural Hon...,22,1.7
18797,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015","Head out on a journey of redemption, driven by...",21,2.9
18798,Infestation: Survivor Stories (The War Z),PC,"October 15, 2012","(Formerly known as ""The War Z"") It has been 5 ...",20,1.7


In [3]:
# drop duplicates

data.drop_duplicates('name',keep='first', inplace=True)

data.reset_index(drop=True, inplace=True)

data

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Super Mario Galaxy,Wii,"November 12, 2007",[Metacritic's 2007 Wii Game of the Year] The u...,97,9.1
...,...,...,...,...,...,...
12249,Charlie's Angels,GameCube,"July 9, 2003","Join Natalie, Dylan, and Alex for an intense a...",23,4.3
12250,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",Fast & Furious: Showdown takes some of the fra...,22,1.3
12251,Drake of the 99 Dragons,Xbox,"November 3, 2003",Drake is out for revenge in a supernatural Hon...,22,1.7
12252,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015","Head out on a journey of redemption, driven by...",21,2.9


In [9]:
# summary stats

data.describe()

Unnamed: 0,meta_score
count,12254.0
mean,70.743104
std,12.418369
min,20.0
25%,64.0
50%,73.0
75%,80.0
max,99.0


In [10]:
# check for nulls

data.isnull().sum()

# 101 null summary entries

name              0
platform          0
release_date      0
summary         101
meta_score        0
user_review       0
dtype: int64

In [11]:
# drop nulls

null_rows = data[data['summary'].isnull()].index

data.drop(index=null_rows, inplace=True)

data

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Super Mario Galaxy,Wii,"November 12, 2007",[Metacritic's 2007 Wii Game of the Year] The u...,97,9.1
...,...,...,...,...,...,...
12249,Charlie's Angels,GameCube,"July 9, 2003","Join Natalie, Dylan, and Alex for an intense a...",23,4.3
12250,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",Fast & Furious: Showdown takes some of the fra...,22,1.3
12251,Drake of the 99 Dragons,Xbox,"November 3, 2003",Drake is out for revenge in a supernatural Hon...,22,1.7
12252,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015","Head out on a journey of redemption, driven by...",21,2.9


In [12]:
# get info about dataframe

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12153 entries, 0 to 12253
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          12153 non-null  object
 1   platform      12153 non-null  object
 2   release_date  12153 non-null  object
 3   summary       12153 non-null  object
 4   meta_score    12153 non-null  int64 
 5   user_review   12153 non-null  object
dtypes: int64(1), object(5)
memory usage: 664.6+ KB


In [14]:
# clean/process corpus

# Function to streamline NLP Process

def nlp(df, text):
    # Load string
    # raw_data = pd.read_csv(file + '.csv')
    
    df['dummy'] = df[text].astype(str)
    # Convert to lowercase
    df['dummy'] = [post.lower() for post in df['dummy']]

    # Word & Sentence Tokenization
    token_post = [word_tokenize(post) for post in df['dummy']]

    #sent_token = [sent_tokenize(post) for post in df['text']]
    
    # Remove Punctuation
    reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

    no_punc = []

    for filt in token_post:
        review = []
        for token in filt:
            new_token = reg.sub(u'', token)
            if not new_token == u'':
                review.append(new_token)
        no_punc.append(review)
        
    # Remove Stopwords
    no_stop = []

    for post in no_punc:
        new_term_vector = []
        for word in post:
            if not word in stopwords.words('english'):
                new_term_vector.append(word)

        no_stop.append(new_term_vector)
        
    # Stemming & Lemmatization
    pstem = PorterStemmer()
    wlem = WordNetLemmatizer()

    preproc_text = []

    for text in no_stop:
        final_text = []
        for word in text:
            pstem.stem(word)
            final_text.append(wlem.lemmatize(word))

        preproc_text.append(final_text)
        
    # create final data set
    #data = df.copy()

    new_col = pd.Series(preproc_text)
    df['proc_summary'] = new_col
    df.drop('dummy', axis=1, inplace=True)
    return df

proc_data = nlp(data, 'summary')

In [15]:
proc_data.head(10)

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review,proc_summary
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1,"[young, boy, link, tricked, ganondorf, king, g..."
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4,"[major, publisher, development, effort, shift,..."
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7,"[metacritic, 2008, ps3, game, year, also, know..."
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4,"[tale, soul, sword, transcending, world, histo..."
4,Super Mario Galaxy,Wii,"November 12, 2007",[Metacritic's 2007 Wii Game of the Year] The u...,97,9.1,"[metacritic, 2007, wii, game, year, ultimate, ..."
5,Super Mario Galaxy 2,Wii,"May 23, 2010","Super Mario Galaxy 2, the sequel to the galaxy...",97,9.1,"[super, mario, galaxy, 2, sequel, galaxyhoppin..."
6,Red Dead Redemption 2,Xbox One,"October 26, 2018",Developed by the creators of Grand Theft Auto ...,97,8.0,"[developed, creator, grand, theft, auto, v, re..."
7,Grand Theft Auto V,Xbox One,"November 18, 2014",Grand Theft Auto 5 melds storytelling and game...,97,7.9,"[grand, theft, auto, 5, meld, storytelling, ga..."
8,Disco Elysium: The Final Cut,PC,"March 30, 2021",Disco Elysium - The Final Cut is the definitiv...,97,8.3,"[disco, elysium, final, cut, definitive, editi..."
9,The Legend of Zelda: Breath of the Wild,Switch,"March 3, 2017",Forget everything you know about The Legend of...,97,8.7,"[forget, everything, know, legend, zelda, game..."


In [16]:
# reset proc_data index

proc_data.reset_index(drop=True, inplace=True)

proc_data.index

RangeIndex(start=0, stop=12153, step=1)

In [17]:
#Define TF-IDF Vectorizer Object
tfidf = TfidfVectorizer()

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform([str(i) for i in proc_data['proc_summary']])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(12153, 38142)

In [18]:
# list of feature integer indices to feature name

tfidf.get_feature_names()[3000:3050]

['astromonkey',
 'astronaut',
 'astroneer',
 'astroneers',
 'astrong',
 'astronomical',
 'astronomist',
 'astrophysical',
 'astrophysicist',
 'astropop',
 'astrosmash',
 'asuka',
 'asuna',
 'asunder',
 'asura',
 'asw',
 'asylum',
 'asymmetric',
 'asymmetrical',
 'asynchronous',
 'at60framespersecond',
 'atalanta',
 'ataldazar',
 'atamipek',
 'atari',
 'atat',
 'atats',
 'atb',
 'atbat',
 'ateam',
 'atelier',
 'atemporal',
 'aterra',
 'atgs',
 'atheletes',
 'athena',
 'athens',
 'atherton',
 'athlete',
 'athletic',
 'athleticism',
 'athletics',
 'athrun',
 'atkinson',
 'atlanta',
 'atlantean',
 'atlanteans',
 'atlantian',
 'atlantic',
 'atlantis']

In [19]:
# compute cosine similarity matrix

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim

array([[1.        , 0.        , 0.01963851, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.00569489, ..., 0.        , 0.        ,
        0.        ],
       [0.01963851, 0.00569489, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ]])

In [21]:
# reverse mapping of indices and video game titles

indices = pd.Series(proc_data.index, index=proc_data['name'])#.drop_duplicates()

indices

name
The Legend of Zelda: Ocarina of Time              0
Tony Hawk's Pro Skater 2                          1
Grand Theft Auto IV                               2
SoulCalibur                                       3
Super Mario Galaxy                                4
                                              ...  
Charlie's Angels                              12148
Fast & Furious: Showdown                      12149
Drake of the 99 Dragons                       12150
Afro Samurai 2: Revenge of Kuma Volume One    12151
Infestation: Survivor Stories (The War Z)     12152
Length: 12153, dtype: int64

In [22]:
# Recommendation function that takes video game title 
# as input and outputs most similar video games

def recommender_system(title, cosine_sim=cosine_sim):
    
    # get index of video game that matches title
    
    idx = indices[title]
    
    # get pairwise similarity scores of all video games with the given title
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort games based on similarity scores
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get scores of 10 most similar video games
    
    sim_scores = sim_scores[1:11]
    
    # get movie indices
    
    game_indices = [i[0] for i in sim_scores]
    
    # return top 10 most similar video games
    
    recs = proc_data['name'].iloc[game_indices]
    
    return recs

In [23]:
# test run

# open-world action game

recommender_system('Grand Theft Auto V')

2943                             Capcom Arcade Stadium
33                        Grand Theft Auto Double Pack
7759            Mega Man Battle Network 5: Double Team
623                                Unity of Command II
93                    Grand Theft Auto: Chinatown Wars
312           Grand Theft Auto IV: The Lost and Damned
5453              Evil Islands: Curse of the Lost Soul
6375     LEGO Indiana Jones 2: The Adventure Continues
10410                     Pitfall: The Mayan Adventure
5291                Tom Clancy's Rainbow Six: Lockdown
Name: name, dtype: object

In [24]:
# nintendo game

recommender_system('Super Mario Galaxy')

2169              NyxQuest: Kindred Spirits
761                            Pokemon Moon
2095      Bloodstained: Curse of the Moon 2
103                    Super Mario 3D World
10355                     PGA European Tour
9805     The Witch and the Hundred Knight 2
7501                                   Lume
14                      Super Mario Odyssey
9381                            Teen Titans
11702       The Uncertain: Light at the End
Name: name, dtype: object

In [25]:
# zombie survival game

recommender_system('Chernobylite')

7459                         NiGHTS: Journey of Dreams
7824                                 Buzz! Master Quiz
8464                             Dragon Age II: Legacy
11158                            World Series of Poker
8996     Heroes Chronicles: Conquest of the Underworld
7543                              Lips: Party Classics
6089          SOCOM: U.S. Navy Seals: Combined Assault
4661                    Alan Wake's American Nightmare
4697                                       Heart&Slash
1288                                         Far Cry 4
Name: name, dtype: object

In [26]:
# shooter game

recommender_system('Halo 2')

609                         WarioWare: Twisted!
68                                       Halo 3
1232                            MLB The Show 17
15                         Halo: Combat Evolved
2237    Monster Hunter Stories 2: Wings of Ruin
2661      Harvest Moon: Friends of Mineral Town
6998                         Turrican Flashback
2543                          The Artful Escape
257                                 Halo: Reach
2185                     Europa 1400: The Guild
Name: name, dtype: object

In [27]:
# anime fighting game

recommender_system('Naruto Shippuden: Ultimate Ninja Storm 4')

3448                              Trainz
4257                             Timelie
2592                          Planet Zoo
568                               NHL 09
962                      SoulCalibur III
8720           Tiger Woods PGA Tour Golf
8460                    Ragnarok Odyssey
3398    Summon Night: Swordcraft Story 2
6072                            Heroland
4672                            Pikuniku
Name: name, dtype: object

In [28]:
# superhero game

recommender_system('X-Men: Mutant Academy')

2338                                        Painkiller
909              DmC: Devil May Cry Definitive Edition
4912                                NCAA Basketball 10
6764                         Lionheart: Kings' Crusade
11415                          Theatre of War 3: Korea
9237                               SBK: Snowboard Kids
11922                   Doctor Who: The Eternity Clock
9057                                         SwapQuest
2394     Phoenix Wright: Ace Attorney - Dual Destinies
11428                                         NFL Tour
Name: name, dtype: object

In [29]:
# sport game

recommender_system('NBA 2K2')

11162                         Rabbids Land
4880                             Sparklite
11928                    Ju-on: The Grudge
7139        Law & Order: Dead on the Money
316                      NBA Street Vol. 2
5331     Need for Speed Underground Rivals
422                          NBA Street V3
2314                           Mega Man 10
4213                             Pure Pool
3751                             Humankind
Name: name, dtype: object

In [30]:
# for fun

recommender_system('Heavenly Sword')

3974                            Graceful Explosion Machine
538                                                  Mafia
6213                                         Endless Ocean
4050                                                  RUSH
11586                               Cooking Mama: Cookstar
2059                      Danganronpa: Trigger Happy Havoc
8200                                 Knockout Home Fitness
2509                                         Bit.Trip Flux
9275                                       Cargo Commander
10272    Disney's Magical Quest 3 Starring Mickey and D...
Name: name, dtype: object