In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Lets read the data

In [2]:
df1=pd.read_csv("all_games.csv")

## since we need only the name and summary for nlp we take these two

In [3]:
recom=df1[['name','summary']]

In [4]:
recom['name']

0              The Legend of Zelda: Ocarina of Time
1                          Tony Hawk's Pro Skater 2
2                               Grand Theft Auto IV
3                                       SoulCalibur
4                               Grand Theft Auto IV
                            ...                    
18795                      Fast & Furious: Showdown
18796                       Drake of the 99 Dragons
18797    Afro Samurai 2: Revenge of Kuma Volume One
18798     Infestation: Survivor Stories (The War Z)
18799           Leisure Suit Larry: Box Office Bust
Name: name, Length: 18800, dtype: object

In [5]:
## since there are names which comes twice. we can delete these names cause the summary would be the same an dwhen recommending it would get the same result
recom.drop_duplicates(subset='name',keep='first',inplace=True)
recom.reset_index(drop=True, inplace=True)

In [6]:
## dropping the rows where sum is nan.
drop=recom[recom['summary'].isnull()].index

In [7]:
recom.drop(index=drop,inplace=True)

In [8]:
recom

Unnamed: 0,name,summary
0,The Legend of Zelda: Ocarina of Time,"As a young boy, Link is tricked by Ganondorf, ..."
1,Tony Hawk's Pro Skater 2,As most major publishers' development efforts ...
2,Grand Theft Auto IV,[Metacritic's 2008 PS3 Game of the Year; Also ...
3,SoulCalibur,"This is a tale of souls and swords, transcendi..."
4,Super Mario Galaxy,[Metacritic's 2007 Wii Game of the Year] The u...
...,...,...
12249,Charlie's Angels,"Join Natalie, Dylan, and Alex for an intense a..."
12250,Fast & Furious: Showdown,Fast & Furious: Showdown takes some of the fra...
12251,Drake of the 99 Dragons,Drake is out for revenge in a supernatural Hon...
12252,Afro Samurai 2: Revenge of Kuma Volume One,"Head out on a journey of redemption, driven by..."


## LETS DO NLP

* we need to import the recessary libraries

In [67]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import wordnet
from nltk import pos_tag
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
## THIS WORKS LIKE A TEXT NORMALISATION PIPELINE (ANY TEXT THATS GOES IN COMES OUT CLEAN)

def nlp(df, text):
    
    #astype is used to convert into string
    df['dummy'] = df[text].astype(str)
    
    # Convert to lowercase
    dummy = (post.lower() for post in df['dummy'])
    df['dummy'] = [i for i in dummy]

    # breaking the sentence into tokens(words)
    token_post = (word_tokenize(post) for post in df['dummy'])
    token_post = [i for i in token_post]

    
    # Remove Punctuation
    reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

    no_punc = []

    for filt in token_post:
        review = []
        for token in filt:
            new_token = reg.sub(u'', token)
            if not new_token == u'':
                review.append(new_token)
        no_punc.append(review)
        
    # Remove Stopwords
    no_stop = []

    for post in no_punc:
        new_term_vector = []
        for word in post:
            if not word in stopwords.words('english'):
                new_term_vector.append(word)

        no_stop.append(new_term_vector)
         
        # Stemming & Lemmatization(cutting down the tail)
    pstem = PorterStemmer()
    wlem = WordNetLemmatizer()

    preproc_text = []

    for text in no_stop:
        final_text = []
        for word in text:
            pstem.stem(word)
            final_text.append(wlem.lemmatize(word))

        preproc_text.append(final_text)
        
   

    new_col = pd.Series(preproc_text)
    df['proc_summary'] = new_col
    df.drop('dummy', axis=1, inplace=True)
    return df

proc_data = nlp(recom, 'summary')

In [53]:
proc_data.head()

Unnamed: 0,name,summary,proc_summary
0,The Legend of Zelda: Ocarina of Time,"As a young boy, Link is tricked by Ganondorf, ...","[young, boy, link, tricked, ganondorf, king, g..."
1,Tony Hawk's Pro Skater 2,As most major publishers' development efforts ...,"[major, publisher, development, effort, shift,..."
2,Grand Theft Auto IV,[Metacritic's 2008 PS3 Game of the Year; Also ...,"[metacritic, 2008, ps3, game, year, also, know..."
3,SoulCalibur,"This is a tale of souls and swords, transcendi...","[tale, soul, sword, transcending, world, histo..."
4,Super Mario Galaxy,[Metacritic's 2007 Wii Game of the Year] The u...,"[metacritic, 2007, wii, game, year, ultimate, ..."


In [52]:
proc_data.dropna(inplace=True)

## next step is converting the tokens into vectors(TFID vectorizer objects)

In [54]:

tfidf = TfidfVectorizer()

tfidf.fit([str(i) for i in proc_data['proc_summary']])

tfidf_matrix = tfidf.transform([str(i) for i in proc_data['proc_summary']])



In [55]:
# Output(we vectorized the summary success)
print(tfidf_matrix.shape)

(12053, 38141)


In [56]:
# compute cosine similarity matrix
from sklearn.metrics.pairwise import linear_kernel # for cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim

array([[1.        , 0.        , 0.01963686, ..., 0.00588318, 0.00663924,
        0.01384173],
       [0.        , 1.        , 0.00565785, ..., 0.        , 0.01094931,
        0.        ],
       [0.01963686, 0.00565785, 1.        , ..., 0.        , 0.        ,
        0.04130629],
       ...,
       [0.00588318, 0.        , 0.        , ..., 1.        , 0.02312863,
        0.01493381],
       [0.00663924, 0.01094931, 0.        , ..., 0.02312863, 1.        ,
        0.        ],
       [0.01384173, 0.        , 0.04130629, ..., 0.01493381, 0.        ,
        1.        ]])

In [59]:
# assigning index to the title

indices = pd.Series(proc_data.index, index=proc_data['name'])

indices

name
The Legend of Zelda: Ocarina of Time        0
Tony Hawk's Pro Skater 2                    1
Grand Theft Auto IV                         2
SoulCalibur                                 3
Super Mario Galaxy                          4
                                        ...  
You Are Empty                           12148
Bomberman: Act Zero                     12149
McFarlane's Evil Prophecy               12150
Chicago Enforcer                        12151
Shrek: Super Party                      12152
Length: 12053, dtype: int64

In [68]:
def APEX(title, cosine_sim=cosine_sim):
       
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    game_indices = [i[0] for i in sim_scores]
    recs = proc_data['name'].iloc[game_indices]  
    return recs

## APEX GRE

In [61]:
APEX('Grand Theft Auto IV')

36                   Grand Theft Auto: Vice City
624                          Unity of Command II
312     Grand Theft Auto IV: The Lost and Damned
6022                    Raiden V: Director's Cut
5443                                 Strangeland
Name: name, dtype: object

In [66]:
def APEX_GRE():
    print("Welcome to APEX GRE !\n")
    while True:
        user_input = input("Please write a game's Title")
        
        if user_input.lower() == "adios":
            print("\nThank you for using the GRE. Adios!")
            break
        
        similar_games = sumrec(user_input)
        
        print("Here are the games that are similar to the Title you mentioned:")
        print(similar_games)
        print()

In [83]:
APEX_GRE()

Welcome to APEX GRE !



Please write a game's Title Grand Theft Auto: Vice City


Here are the games that are similar to the Title you mentioned:
33            Grand Theft Auto Double Pack
2943    Super Mega Baseball: Extra Innings
623        Hearthstone: Heroes of Warcraft
7759                           Hyper Scape
93        Grand Theft Auto: Chinatown Wars
Name: name, dtype: object



Please write a game's Title adios



Thank you for using the GRE. Adios!


Welcome to APEX GRE !



Please write a game's Title adios



Thank you for using the GRE. Adios!


## MAKING A RECOMMENDATION ENGINE BASED ON INPUT TEXT

In [70]:
# Text Normalization
def text_normalization(text):
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)
    tokens = word_tokenize(spl_char_text)
    lemma = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens)

    lemma_words = []
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lemma_token=lemma.lemmatize(token,pos_val) 
        lemma_words.append(lemma_token) 
    return " ".join(lemma_words)

In [71]:
def sumrec(text):
    lemma = text_normalization(text)
    tfidf_matrix2 = tfidf.transform([lemma]).toarray()

    # Calculate cosine similarity between the input tfidf matrix and the whole dataset tfidf matrix
    cosine_sim_input = cosine_similarity(tfidf_matrix2, tfidf_matrix)

    # Get indices of similar games based on the highest similarity scores
    similar_game_indices = cosine_sim_input.argsort()[0][-5:][::-1]

    # Get the names of the similar games from the dataset
    similar_games = proc_data.loc[similar_game_indices, 'name']

    return similar_games

In [76]:
sumrec("As a first-person shooter, Call of Duty places the player in control of an infantry soldier who makes use of various authentic World War II firearms in combat. Each mission features a series of objectives that are marked on the heads-up display's compass; the player must complete all objectives to advance to the next mission. The player can save and load at any time, rather than the checkpoint system utilized in later Call of Duty games.")

3913    Star Wars Republic Commando
1188                       Judgment
3086           Unforeseen Incidents
4526     Solatorobo: Red the Hunter
290                    Call of Duty
Name: name, dtype: object

In [77]:
def chat_bot():
    print("Welcome to the game recommendation chat bot!\n")
    while True:
        user_input = input("Please write a game summary: \n")
        
        if user_input.lower() == "exit":
            print("Thank you for using the chat bot. Goodbye!")
            break
        
        # Process the user's input
        similar_games = sumrec(user_input)
        
        print("Here are the games that are similar to the summary you wrote:\n")
        print(similar_games)
        print()

In [87]:
def APEX_GRE_2():
    print("Welcome to APEX GRE ver 2!\n")
    while True:
        user_input = input("Please write a game summary")
        
        if user_input.lower() == "adios":
            print("\nThank you for using the GRE2. ADIOS!")
            break
        
        similar_games = sumrec(user_input)
        
        print("Here are the games that are similar to the summary you wrote:")
        print(similar_games)
        print()


## APEX GRE 2

In [None]:
APEX_GRE_2()

Welcome to APEX GRE ver 2!

