# Imports

In [1]:
import pandas as pd # import for dataframe handle
import numpy as np # import for math and array operations
import matplotlib.pyplot as plt # import for visual representation
import seaborn as sns # import for visual representation

from bs4 import BeautifulSoup
import requests

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string

# pipeline imports
from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# scalers, encoder, knn, vectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder



%load_ext autoreload
%autoreload 2

In [2]:
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise a raw text string into space-separated lemmatised tokens.

    Steps, in order: replace every ``string.punctuation`` character with a
    space, lower-case, tokenize with NLTK's ``word_tokenize``, keep only
    purely alphabetic tokens, drop English stop words, and lemmatize what is
    left with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # One pass over the string instead of one str.replace per punctuation mark.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    lowercased = text.translate(punctuation_to_space).lower()

    tokenized = word_tokenize(lowercased)
    words_only = [word for word in tokenized if word.isalpha()]  # drops numbers

    # The NLTK stop-word file id is the lower-case 'english'; the capitalised
    # spelling is not found on case-sensitive file systems.
    stop_words = set(stopwords.words('english'))
    without_stopwords = [word for word in words_only if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in without_stopwords)

# Load Dataset 

In [16]:
# Read the pre-cleaned catalogue, then discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- confirm).
df = pd.read_csv("../raw_data/clean_bigger_df.csv")
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,url,img_url,developer,requirements,name,metadata,clean_description,pegi_url,mature_encoded,all_reviews,clean_review
0,https://store.steampowered.com/app/10/CounterS...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,Valve,{},Counter-Strike,First- Shooter Violent+ Score Survival Team- P...,game play world number online action game enga...,,0,"Overwhelmingly Positive(94,680)- 96% of the 94...",Overwhelmingly Positive
1,https://store.steampowered.com/app/1000000/ASC...,https://steamcdn-a.akamaihd.net/steam/apps/100...,IndigoBlue Game Studio,{'minimum': {'windows': {'processor': ' Intel ...,ASCENXION,Indie Stick about game Minimalist Controller2 ...,game ascenxion game combining shoot em adventu...,,0,Winter 2020,Winter 2020
2,https://store.steampowered.com/app/1000010/Cro...,https://steamcdn-a.akamaihd.net/steam/apps/100...,NEXT Studios,{'minimum': {'windows': {'processor': ' Intel ...,Crown Trick,Replay Female Support Cards Magic+ Steam Value...,game crown trick beautifully animated rogue li...,,0,"16 Oct, 2020","16 Oct, 2020"
3,https://store.steampowered.com/app/1000030/Coo...,https://steamcdn-a.akamaihd.net/steam/apps/100...,Vertigo Gaming Inc.,"{'minimum': {'windows': {'processor': '', 'mem...","Cook, Serve, Delicious! 3?!",Typing Play on controller Tablet Family Campai...,game hit road massive sequel million selling c...,https://steamstore-a.akamaihd.net/public/share...,1,Overwhelmingly Positive(761)- 96% of the 761 u...,Overwhelmingly Positive
4,https://store.steampowered.com/app/1000040/_/,https://steamcdn-a.akamaihd.net/steam/apps/100...,DoubleC Games,{},细胞战争,Indie Simulation+ Features Casual Action Singl...,game qq com,,0,"30 Mar, 2019","30 Mar, 2019"


In [19]:
# Shorten the column names to the ones the preprocessing functions look up
# ('reviews', 'mature_content', 'game_description').
df = df.rename(
    columns={
        'clean_review': 'reviews',
        'mature_encoded': 'mature_content',
        'clean_description': 'game_description',
    }
)
df.head()

Unnamed: 0,url,img_url,developer,requirements,name,metadata,game_description,pegi_url,mature_content,all_reviews,reviews
0,https://store.steampowered.com/app/10/CounterS...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,Valve,{},Counter-Strike,First- Shooter Violent+ Score Survival Team- P...,game play world number online action game enga...,,0,"Overwhelmingly Positive(94,680)- 96% of the 94...",Overwhelmingly Positive
1,https://store.steampowered.com/app/1000000/ASC...,https://steamcdn-a.akamaihd.net/steam/apps/100...,IndigoBlue Game Studio,{'minimum': {'windows': {'processor': ' Intel ...,ASCENXION,Indie Stick about game Minimalist Controller2 ...,game ascenxion game combining shoot em adventu...,,0,Winter 2020,Winter 2020
2,https://store.steampowered.com/app/1000010/Cro...,https://steamcdn-a.akamaihd.net/steam/apps/100...,NEXT Studios,{'minimum': {'windows': {'processor': ' Intel ...,Crown Trick,Replay Female Support Cards Magic+ Steam Value...,game crown trick beautifully animated rogue li...,,0,"16 Oct, 2020","16 Oct, 2020"
3,https://store.steampowered.com/app/1000030/Coo...,https://steamcdn-a.akamaihd.net/steam/apps/100...,Vertigo Gaming Inc.,"{'minimum': {'windows': {'processor': '', 'mem...","Cook, Serve, Delicious! 3?!",Typing Play on controller Tablet Family Campai...,game hit road massive sequel million selling c...,https://steamstore-a.akamaihd.net/public/share...,1,Overwhelmingly Positive(761)- 96% of the 761 u...,Overwhelmingly Positive
4,https://store.steampowered.com/app/1000040/_/,https://steamcdn-a.akamaihd.net/steam/apps/100...,DoubleC Games,{},细胞战争,Indie Simulation+ Features Casual Action Singl...,game qq com,,0,"30 Mar, 2019","30 Mar, 2019"


In [20]:
# (rows, columns) of the loaded catalogue.
df.shape

(36412, 11)

# Preprocessing

In [21]:
def kmeans_labels(df, n , mi, random_state=None):
    """Cluster games by the TF-IDF of their description and return the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'game_description'`` text column.
    n : int
        Number of K-Means clusters.
    mi : int or float
        ``min_df`` passed to the ``TfidfVectorizer`` (unigrams and bigrams).
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``KMeans``. Leave as ``None`` for the previous
        unseeded behaviour; pass an int to get the same labels on every run.

    Returns
    -------
    numpy.ndarray
        ``labels_`` of the fitted K-Means model, one entry per row of ``df``.
    """
    vec = TfidfVectorizer(min_df=mi, ngram_range=(1, 2))
    X = vec.fit_transform(df['game_description'])

    # K-Means initialisation is random; an unseeded model can label the same
    # data differently on each run.
    kmodel = KMeans(n_clusters=n, random_state=random_state)
    kmodel.fit(X)

    return kmodel.labels_

In [22]:
def create_pipeline(df, m=0.05 , c=1, n =50, mi = 0.04):
    """Build, fit and apply the feature pipeline; return the reduced features.

    Despite the name, this returns the transformed feature array, not the
    pipeline object.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'metadata'``, ``'reviews'``, ``'mature_content'``
        and ``'game_description'`` (the last one is read by ``kmeans_labels``).
        A ``'cluster'`` column is added to (or overwritten on) this frame
        in place.
    m : int or float, default 0.05
        ``min_df`` of the ``TfidfVectorizer`` applied to ``'metadata'``.
    c : int, default 1
        ``n_components`` of the final ``PCA``.
    n : int, default 50
        Second argument passed to ``kmeans_labels`` (number of clusters).
    mi : int or float, default 0.04
        Third argument passed to ``kmeans_labels`` (description ``min_df``).

    Returns
    -------
    numpy.ndarray
        Output of ``PCA.fit_transform`` on the preprocessed columns.

    Notes
    -----
    Neither the clustering call nor ``PCA`` is given a ``random_state`` here,
    so repeated calls are not guaranteed to produce identical output.
    """
    # Side effect kept on purpose: the column transformer below selects
    # 'cluster' from the same frame the caller passed in.
    df['cluster'] = kmeans_labels(df, n, mi)

    # TF-IDF yields a sparse matrix; densify it before RobustScaler.
    array_transf = FunctionTransformer(lambda array: array.toarray())
    meta_transf = make_pipeline(
        TfidfVectorizer(min_df=m),
        array_transf,
        RobustScaler()
    )

    # Review labels ordered from worst to best. Any other value (e.g. a
    # release date found in the reviews column) is encoded as -1.
    ord_encoder = OrdinalEncoder(
        categories=[
            [
                "Overwhelmingly Negative",
                "Very Negative",
                "Negative",
                "Mostly Negative",
                'Mixed',
                "Mostly Positive",
                "Positive",
                "Very Positive",
                "Overwhelmingly Positive"
            ]],
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )
    ord_transf = make_pipeline(ord_encoder, StandardScaler())

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; try the new spelling first and fall back for older releases.
    try:
        one_hot = OneHotEncoder(sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(sparse=False)
    cluster_transf = make_pipeline(one_hot, StandardScaler())

    num_transf = make_pipeline(StandardScaler())

    preproc_basic = make_column_transformer(
        (meta_transf, 'metadata'),
        (cluster_transf, ['cluster']),
        (ord_transf, ['reviews']),
        (num_transf, ['mature_content']),
        remainder='drop'
    )

    full_pipe = make_pipeline(preproc_basic, PCA(n_components=c))
    return full_pipe.fit_transform(df)

In [23]:
def train(X, y):
    """Fit a default ``KNeighborsRegressor`` on ``X`` and ``y`` and return it."""
    knn = KNeighborsRegressor()
    # fit() returns the estimator itself, so hand that result straight back.
    return knn.fit(X, y)

In [24]:
def recommending_system(model, X, game):
    """Rank every row of ``X`` by its distance to ``game``.

    Parameters
    ----------
    model : fitted neighbours estimator
        Must expose ``kneighbors(query, n_neighbors=...)`` returning
        ``(distances, indices)``. Assumed to have been fitted on ``X``, since
        the returned positions are mapped back through ``X``'s index.
    X : pandas.DataFrame
        Feature matrix whose index holds the game names.
    game : hashable
        Index label of the query game. A missing label raises ``KeyError``
        from ``X.loc``.

    Returns
    -------
    pandas.DataFrame
        One ``'distance'`` column, indexed by game name, in the order returned
        by ``kneighbors``. If the label matches several rows, only the first
        match is used as the query.
    """
    # Ask for all rows in one search. The neighbour count comes from X itself
    # rather than a notebook-level frame, and a single call yields both the
    # distances and the positions.
    distances, indices = model.kneighbors(X.loc[[game]], n_neighbors=X.shape[0])

    # kneighbors answers per query row; keep the answer for the first one.
    neighbors_distance = distances[0]
    neighbors_index = indices[0]

    return pd.DataFrame(
        neighbors_distance,
        index=X.index[neighbors_index],
        columns=['distance'],
    )
    

In [28]:
# Build the PCA-reduced feature matrix; keyword arguments spell out which
# tuning value goes where.
pipe = create_pipeline(df, m=0.03, c=10, n=70, mi=0.07)
# Index the features by game name so a title can be used as the lookup key.
X = pd.DataFrame(pipe, index=df.name.tolist())
# The target only satisfies fit(); recommending_system uses just kneighbors().
model = train(X, df['url'])
recommending_system(model, X, 'S.T.A.L.K.E.R.: Shadow of Chernobyl')

Unnamed: 0,distance
S.T.A.L.K.E.R.: Shadow of Chernobyl,0.000000
NARUTO SHIPPUDEN: Ultimate Ninja STORM 4,0.412144
System Shock 2,0.531875
Warframe,0.551153
Stellar Tactics,0.585972
...,...
Maestro: Dark Talent Collector's Edition,11.410017
Dangerous Games: Illusionist Collector's Edition,11.410017
Realm of Perpetual Guilds Demo,11.420086
12 Labours of Hercules Demo,11.423279


In [29]:
# Ten closest rows to 'Left 4 Dead'; the query itself comes first at distance 0.
recommending_system(model, X, 'Left 4 Dead').head(10)

Unnamed: 0,distance
Left 4 Dead,0.0
Bio Inc. Redemption,0.535254
GUILTY GEAR XX ACCENT CORE PLUS R,0.564331
SURV1V3,0.570469
Oh...Sir!! The Insult Simulator,0.65216
Distance,0.652576
Blazing Beaks,0.653871
Puyo Puyo™Tetris®,0.676955
Ticket to Ride,0.752987
Circle Empires Rivals,0.779266


In [30]:
# Ten closest rows to 'S.T.A.L.K.E.R.: Shadow of Chernobyl'.
recommending_system(model, X, 'S.T.A.L.K.E.R.: Shadow of Chernobyl').head(10)

Unnamed: 0,distance
S.T.A.L.K.E.R.: Shadow of Chernobyl,0.0
NARUTO SHIPPUDEN: Ultimate Ninja STORM 4,0.412144
System Shock 2,0.531875
Warframe,0.551153
Stellar Tactics,0.585972
DEAD RISING®,0.736548
Two Worlds II HD,0.752972
Skullgirls,0.836717
FINAL FANTASY IV,0.861025
Sheltered,0.902273


In [32]:
# Ten closest rows to 'QUAKE'.
recommending_system(model, X, 'QUAKE').head(10)

Unnamed: 0,distance
QUAKE,0.0
Hand Simulator,0.216387
N++ (NPLUSPLUS),0.28374
Project Arrhythmia,0.284179
Ben and Ed - Blood Party,0.284723
I Wanna Maker,0.303968
Ion Fury,0.314504
Spin Rhythm XD,0.318496
RED HOT VENGEANCE,0.369582
Paint the Town Red,0.380696


In [34]:
# Ten closest rows to 'DOOM'.
recommending_system(model, X, 'DOOM').head(10)

Unnamed: 0,distance
DOOM,0.0
Worms Revolution,0.021723
Overload,0.244663
Magicka,0.38895
GRIP: Combat Racing,0.414627
Mother Russia Bleeds,0.519912
Lethal League Blaze,0.635901
Wargroove,0.650668
SENRAN KAGURA Peach Ball,0.697086
State of Decay 2: Juggernaut Edition,0.730915


In [35]:
# Ten closest rows to 'Counter-Strike'.
recommending_system(model, X, 'Counter-Strike').head(10)

Unnamed: 0,distance
Counter-Strike,0.0
GTFO,0.192935
Team Fortress Classic,0.21499
Tower Tag,0.251049
INSURGENCY: Modern Infantry Combat,0.274807
Fortress Forever,0.366578
Squad,0.400593
The Mean Greens - Plastic Warfare,0.53778
Dirty Bomb®,0.610479
"Pirates, Vikings, and Knights II",0.671821
