In [1]:
import pandas as pd # import for dataframe handle
import numpy as np # import for math and array operations
import matplotlib.pyplot as plt # import for visual representation
import seaborn as sns # import for visual representation

from bs4 import BeautifulSoup
import requests

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import nltk

# pipeline imports
from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# scalers, encoder, knn, vectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder



%load_ext autoreload
%autoreload 2

In [2]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists, the
# 'punkt' tokenizer models and the WordNet corpus (for the WordNetLemmatizer
# imported in the next cell). Each call is a no-op when the package is already
# up to date, as the captured output shows.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pyfenix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/pyfenix/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pyfenix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.stem import WordNetLemmatizer

_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words(language='english'):
    """Return the NLTK stop words for ``language`` as a set, loading them only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def clean_text(text):
    """Normalise a piece of free text for vectorisation.

    Steps, in order: replace every ``string.punctuation`` character with a
    space, lower-case, tokenize with ``word_tokenize``, keep purely alphabetic
    tokens and drop English stop words. No lemmatisation is applied.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # One pass over the string instead of one str.replace per punctuation mark.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    lowercased = text.translate(punctuation_to_space).lower()

    stop_words = _stop_words('english')  # cached: this function is applied row by row
    kept = [
        token
        for token in word_tokenize(lowercased)
        if token.isalpha() and token not in stop_words  # isalpha() also drops numbers
    ]
    return " ".join(kept)

In [4]:
def re_clean(text):
    """Drop generic studio words from an already cleaned developer name.

    The text is split on single spaces and every token that is exactly one of
    "game", "games", "gaming", "studios", "inc" or "studio" is removed; all
    other tokens (including empty ones produced by repeated spaces) are kept
    in their original order.

    Parameters
    ----------
    text : str
        Space-separated, lower-cased text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    non_imp_words = {"game", "games", "gaming", "studios", "inc", "studio"}
    # Single pass; the previous repeated list.remove() rescanned the list each time.
    return " ".join(token for token in text.split(' ') if token not in non_imp_words)

 # Load Dataset 

In [5]:
# Read the pre-cleaned catalogue, discard the 'Unnamed: 0' column and give the
# columns the names the rest of the notebook uses.
df = (
    pd.read_csv("../raw_data/clean_bigger_df.csv")
    .drop(columns='Unnamed: 0')
    .rename(
        columns={
            'clean_review': 'reviews',
            'mature_encoded': 'mature_content',
            'clean_description': 'game_description',
        }
    )
)
df.head()


Unnamed: 0,url,img_url,developer,requirements,name,metadata,game_description,pegi_url,mature_content,all_reviews,reviews
0,https://store.steampowered.com/app/10/CounterS...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,Valve,{},Counter-Strike,First- Shooter Violent+ Score Survival Team- P...,game play world number online action game enga...,,0,"Overwhelmingly Positive(94,680)- 96% of the 94...",Overwhelmingly Positive
1,https://store.steampowered.com/app/1000000/ASC...,https://steamcdn-a.akamaihd.net/steam/apps/100...,IndigoBlue Game Studio,{'minimum': {'windows': {'processor': ' Intel ...,ASCENXION,Indie Stick about game Minimalist Controller2 ...,game ascenxion game combining shoot em adventu...,,0,Winter 2020,Winter 2020
2,https://store.steampowered.com/app/1000010/Cro...,https://steamcdn-a.akamaihd.net/steam/apps/100...,NEXT Studios,{'minimum': {'windows': {'processor': ' Intel ...,Crown Trick,Replay Female Support Cards Magic+ Steam Value...,game crown trick beautifully animated rogue li...,,0,"16 Oct, 2020","16 Oct, 2020"
3,https://store.steampowered.com/app/1000030/Coo...,https://steamcdn-a.akamaihd.net/steam/apps/100...,Vertigo Gaming Inc.,"{'minimum': {'windows': {'processor': '', 'mem...","Cook, Serve, Delicious! 3?!",Typing Play on controller Tablet Family Campai...,game hit road massive sequel million selling c...,https://steamstore-a.akamaihd.net/public/share...,1,Overwhelmingly Positive(761)- 96% of the 761 u...,Overwhelmingly Positive
4,https://store.steampowered.com/app/1000040/_/,https://steamcdn-a.akamaihd.net/steam/apps/100...,DoubleC Games,{},细胞战争,Indie Simulation+ Features Casual Action Singl...,game qq com,,0,"30 Mar, 2019","30 Mar, 2019"


## Cleaning developer column

In [8]:
# Missing developers must be filled *before* the string conversion: otherwise
# NaN is turned into (or passed on as) a non-empty value and every game without
# a developer ends up sharing the spurious token "nan" in the developer TF-IDF.
df['developer'] = (
    df['developer']
    .fillna('')
    .astype('str')
    .apply(clean_text)
    .apply(re_clean)
)
df.head()

Unnamed: 0,url,img_url,developer,requirements,name,metadata,game_description,pegi_url,mature_content,all_reviews,reviews
0,https://store.steampowered.com/app/10/CounterS...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,valve,{},Counter-Strike,First- Shooter Violent+ Score Survival Team- P...,game play world number online action game enga...,,0,"Overwhelmingly Positive(94,680)- 96% of the 94...",Overwhelmingly Positive
1,https://store.steampowered.com/app/1000000/ASC...,https://steamcdn-a.akamaihd.net/steam/apps/100...,indigoblue,{'minimum': {'windows': {'processor': ' Intel ...,ASCENXION,Indie Stick about game Minimalist Controller2 ...,game ascenxion game combining shoot em adventu...,,0,Winter 2020,Winter 2020
2,https://store.steampowered.com/app/1000010/Cro...,https://steamcdn-a.akamaihd.net/steam/apps/100...,next,{'minimum': {'windows': {'processor': ' Intel ...,Crown Trick,Replay Female Support Cards Magic+ Steam Value...,game crown trick beautifully animated rogue li...,,0,"16 Oct, 2020","16 Oct, 2020"
3,https://store.steampowered.com/app/1000030/Coo...,https://steamcdn-a.akamaihd.net/steam/apps/100...,vertigo,"{'minimum': {'windows': {'processor': '', 'mem...","Cook, Serve, Delicious! 3?!",Typing Play on controller Tablet Family Campai...,game hit road massive sequel million selling c...,https://steamstore-a.akamaihd.net/public/share...,1,Overwhelmingly Positive(761)- 96% of the 761 u...,Overwhelmingly Positive
4,https://store.steampowered.com/app/1000040/_/,https://steamcdn-a.akamaihd.net/steam/apps/100...,doublec,{},细胞战争,Indie Simulation+ Features Casual Action Singl...,game qq com,,0,"30 Mar, 2019","30 Mar, 2019"


# Preprocessing

In [21]:
def kmeans_labels(df, n , m, random_state=42):
    """Cluster the games by their description and return one label per row.

    The ``game_description`` column is vectorised with a TF-IDF over unigrams
    and bigrams, then clustered with K-Means.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``game_description`` column of strings.
    n : int
        Number of clusters.
    m : int or float
        ``min_df`` passed to the ``TfidfVectorizer``.
    random_state : int or None, default 42
        Seed for the K-Means initialisation. A fixed seed keeps the labels
        (and therefore every feature derived from them) reproducible between
        runs; pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The fitted model's ``labels_``, aligned with the rows of ``df``.
    """
    vec = TfidfVectorizer(min_df = m ,ngram_range=(1,2))
    X = vec.fit_transform(df['game_description'])
    kmodel = KMeans(n_clusters=n, random_state=random_state)
    kmodel.fit(X)

    return kmodel.labels_

In [23]:
def create_pipeline(df, m=0.05 , c=1, n =50, mi = 0.04, mii=0.01):
    """Build the preprocessing + PCA pipeline, fit it on ``df`` and return the features.

    Despite its name, this function returns the *transformed data*, not the
    pipeline object.

    Side effect: a ``cluster`` column holding the K-Means label of every row
    (see ``kmeans_labels``) is written into ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``developer``, ``reviews``,
        ``mature_content`` and ``game_description``.
    m : float, default 0.05
        Currently unused. It was the ``min_df`` of a metadata TF-IDF branch
        that is disabled; the parameter is kept so existing calls still work.
    c : int, default 1
        ``n_components`` of the final PCA.
    n : int, default 50
        Number of K-Means clusters.
    mi : float, default 0.04
        ``min_df`` of the description TF-IDF (clustering and feature branch).
    mii : float, default 0.01
        ``min_df`` of the developer TF-IDF.

    Returns
    -------
    numpy.ndarray
        Output of ``PCA(n_components=c).fit_transform`` on the preprocessed
        features, one row per row of ``df``.
    """
    # TF-IDF and one-hot encoders emit sparse matrices; the scalers and PCA
    # used here need dense arrays.
    array_transf = FunctionTransformer(lambda array: array.toarray())
    df['cluster'] = kmeans_labels(df, n, mi)

    dev_transf=make_pipeline(
        TfidfVectorizer(min_df=mii),
        array_transf,
        RobustScaler()
    )

    desc_transf=make_pipeline(
        TfidfVectorizer(min_df = mi ,ngram_range=(1,2)),
        array_transf,
        RobustScaler()
    )

    # Review labels ordered from worst to best; any other value (the column
    # also contains release dates for unreviewed games) is encoded as -1.
    ord_encoder = OrdinalEncoder(
        categories=[
            [
                "Overwhelmingly Negative",
                "Very Negative",
                "Negative",
                "Mostly Negative",
                'Mixed',
                "Mostly Positive",
                "Positive",
                "Very Positive",
                "Overwhelmingly Positive"
            ]],
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )

    ord_transf = make_pipeline(
        ord_encoder,
        StandardScaler())

    # Default (sparse) one-hot output densified with array_transf. This avoids
    # the `sparse=` keyword, which newer scikit-learn releases renamed to
    # `sparse_output=`, so the cell runs on old and new versions alike.
    cluster_transf = make_pipeline(
        OneHotEncoder(),
        array_transf,
        StandardScaler()
    )

    num_transf = make_pipeline(StandardScaler())

    # Text branches take a *scalar* column name so the vectoriser receives a
    # 1-D series of documents. A one-element list would hand it a 2-D frame,
    # and iterating a frame yields column labels, not rows.
    preproc_basic = make_column_transformer(
        (dev_transf, 'developer'),
        (cluster_transf, ['cluster']),
        (ord_transf, ['reviews']),
        (num_transf, ['mature_content']),
        (desc_transf, 'game_description'),
        remainder='drop'
    )

    full_pipe = make_pipeline(preproc_basic, PCA(n_components=c) )
    return full_pipe.fit_transform(df)

In [11]:
def train(X, y):
    """Fit a default ``KNeighborsRegressor`` on ``(X, y)`` and return the fitted model."""
    knn = KNeighborsRegressor()
    knn.fit(X, y)
    return knn

In [12]:
def recommending_system(model, X, game):
    """Rank every row of ``X`` by its distance to ``game``.

    Parameters
    ----------
    model : fitted neighbours estimator
        Must expose ``kneighbors(query, n_neighbors=...)`` returning
        ``(distances, indices)``; the indices are used as row positions in ``X``.
    X : pandas.DataFrame
        Feature matrix indexed by game name.
    game : hashable
        Index label of the query game in ``X``.

    Returns
    -------
    pandas.DataFrame
        A single ``distance`` column indexed by game name, in the order
        returned by the model (nearest first).
    """
    # One query returns distances and indices together, and its size comes
    # from X itself rather than from a notebook-level DataFrame.
    distances, indices = model.kneighbors(X.loc[[game]], n_neighbors=X.shape[0])

    return pd.DataFrame(distances[0], index=X.index[indices[0]], columns=['distance'])

# Testing Data

In [13]:
# Only the first four columns are read; since `names` is given without
# `header`, pandas treats the first line of the file as data.
user_df = pd.read_csv(
    '../raw_data/steam-200k.csv',
    usecols=[0, 1, 2, 3],
    names=['userid', 'game', 'behavior', 'hoursplayed'],
)

In [14]:
# Keep only the 'play' rows; the behaviour column is constant afterwards, so drop it.
df_play = user_df[user_df['behavior'] == 'play'].drop(columns='behavior')

# Keep only games whose name also exists in the main dataset (inner merge on name).
user_name = (
    pd.DataFrame(df_play['game'].unique(), columns=['name'])
    .merge(df, on='name')
)
join_name = list(user_name['name'].unique())
df_play = df_play[df_play['game'].isin(join_name)]

In [15]:
# Build a DataFrame with each user's two favourite (most-played) games

def get_fav_games(df,user):
    """Return up to two games of ``user``, ordered by descending ``hoursplayed``.

    ``df`` needs the columns ``userid``, ``game`` and ``hoursplayed``. The
    result is a plain list; it is empty when the user has no rows.
    """
    user_rows = df.loc[df['userid'] == user]
    by_hours = user_rows.sort_values(by='hoursplayed', ascending=False)
    return list(by_hours['game'].head(2))

def get_user_list(df):
    """Return the ``userid`` values that have more than one non-null ``game`` entry.

    The ids come back in the sorted order produced by ``groupby``.
    """
    games_per_user = df.groupby('userid')['game'].count()
    return list(games_per_user[games_per_user > 1].index)

def get_fav_list(df):
    """Tabulate the two most-played games of every user with more than one game.

    Returns a DataFrame indexed by user id (as given by ``get_user_list``)
    with the columns ``most_fav_game`` and ``sec_fav_game`` (first and second
    entry of ``get_fav_games`` for that user).
    """
    users = get_user_list(df)
    top_two = [get_fav_games(df, user) for user in users]
    # Explicit indexing keeps the requirement that each user has two games.
    pairs = [(games[0], games[1]) for games in top_two]
    return pd.DataFrame(
        data=pairs,
        columns=['most_fav_game', 'sec_fav_game'],
        index=users,
    )

In [None]:
# Evaluation table: one row per user with more than one game, holding the two
# games that user played the longest.
test_df = get_fav_list(df_play)

In [None]:
def testing_models(df, model):

    df['distance'] = ''
    for index, row in df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=X.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=X.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = ['distance']).loc[row['sec_fav_game']][0]
        df.loc[index, 'distance'] = res
        
    return df

# Testing Params

In [16]:
def test_params(test_df,df, X, model, name, index_name):
    for index, row in test_df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = [name])
        test_df.loc[index, name] = res.loc[row['sec_fav_game']][0]
        test_df.loc[index, index_name]= res.index.get_loc(row['sec_fav_game'])
    return test_df

In [17]:
def check_params(df, m=0.05 , c=1, n =50, mi = 0.1, mii=0.01):
    """Evaluate one hyper-parameter combination and record it in ``test_df``.

    The features are built with ``create_pipeline`` using the requested
    settings, a model is trained on them and ``test_params`` writes the
    distance / rank columns into the notebook-level ``test_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Game catalogue; needs the columns used by ``create_pipeline`` plus
        ``name`` (feature index) and ``url`` (training target).
    m, c, n, mi, mii
        Forwarded to ``create_pipeline`` under the same names.
        ``m`` may also be a tuple or list ``(m, c, n, mi, mii)`` — this is how
        the grid loop calls the function — in which case its values override
        the individual arguments, position by position.

    Returns
    -------
    pandas.DataFrame
        The notebook-level ``test_df``, with two new columns named
        ``str(m)`` and ``'index' + str(m)`` (``m`` as it was passed in).
    """
    # Column label is derived from the argument exactly as received, so every
    # grid combination gets its own pair of columns.
    label = str(m)

    settings = {'m': m, 'c': c, 'n': n, 'mi': mi, 'mii': mii}
    if isinstance(m, (tuple, list)):
        settings.update(zip(('m', 'c', 'n', 'mi', 'mii'), m))

    # Forward the requested settings; they used to be replaced by constants,
    # which made every combination produce the same features.
    features = create_pipeline(df, **settings)
    X = pd.DataFrame(features, index=df['name'].tolist())
    model = train(X, df['url'])
    test_params(test_df, df, X, model, label, 'index' + label)
    return test_df

In [18]:
# Candidate values for the grid search, one list per create_pipeline argument
# (same order as its signature: m, c, n, mi, mii).
# NOTE(review): 0.7 breaks the 0.03 / 0.05 progression — possibly meant 0.07; confirm.
m_list = [0.03, 0.05, 0.7]
# c: number of PCA components
c_list = [1,5,10]
# n: number of K-Means clusters
n_list = [10, 50, 70]
# mi: min_df of the description TF-IDF
mi_list = [0.05, 0.07, 0.1]
# mii: min_df of the developer TF-IDF
mii_list=[0.01,0.02,0.03,0.04,0.05]

In [19]:
# Order matters: it fixes the position of each value inside every tuple
# produced by the Cartesian product below.
main_list = [m_list, c_list, n_list, mi_list, mii_list]

In [20]:
import itertools
# Cartesian product of the candidate lists: one tuple per combination.
params_list = list(itertools.product(*main_list))
# Number of combinations (3 * 3 * 3 * 3 * 5).
len(params_list)

405

In [None]:
# Evaluate one slice of the parameter grid. Each combination is handed to
# check_params as a single tuple (its second positional argument). The number
# printed after each step is the position in params_list of the combination
# that has just been processed.
count = 50
for count, params in enumerate(params_list[51:75], start=count + 1):
    check_params(df, params)
    print(count)