# Imports

In [18]:
import pandas as pd # import for dataframe handle
import numpy as np # import for math and array operations
import matplotlib.pyplot as plt # import for visual representation
#import seaborn as sns # import for visual representation

from bs4 import BeautifulSoup
import requests

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string

from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer

#from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    lemmatizer = WordNetLemmatizer()
    #lemmatized = [lemmatizer.lemmatize(word) for word in lowercased]
    #lowercased = lemmatized     
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('English')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words]# Remove Stop Words
    lemmatized = [lemmatizer.lemmatize(word) for word in without_stopwords]
    
    return " ".join(lemmatized)

# Load Dataset 

In [5]:
df = pd.read_csv('../raw_data/clean_df.csv' )
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",Action,"About This Game Developed by id software, the...",0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,"Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,"Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS


In [6]:
df.isnull().sum()

url                     0
name                    3
developer              34
tags                    3
languages               0
genre                   5
game_description        0
mature_content          0
price                4558
reviews                 0
date                  259
achievements            0
op_sys              12383
dtype: int64

In [7]:
df['tags'].fillna('', inplace=True)
df['genre'].fillna('', inplace=True)

In [8]:
df['metadata'] = df[['tags', 'genre']].apply(lambda x: ' '.join(x), axis = 1)


df['metadata'] = df['metadata'].apply(
    lambda x: ' '.join(list(set(x.split())))
)

In [9]:
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",Action,"About This Game Developed by id software, the...",0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,"Action FPS,Gore,Action,Demons,Shooter,First-Pe..."
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,"Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,"Action,Adventure,Massively Survival,Shooter,Mu..."
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,"Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,"Rich,Great Tactics,Sci-fi,Turn-Based Soundtrac..."
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,"Action,Adventure,Massively Access,Simulation,F..."
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,"Action,Free World,RPG,PvP,Multiplayer,Free Spa..."


In [10]:
df['metadata'] = df['metadata'].apply(clean_text)

In [11]:
df['game_description'] = df['game_description'].apply(clean_text)

In [12]:
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",Action,game developed id software studio pioneered fi...,0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,action fps gore action demon shooter first per...
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,"Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",game playerunknown battleground battle royale ...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,action adventure massively survival shooter mu...
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,"Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","Action,Adventure,Strategy",game original battletech mechwarrior creator j...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,rich great tactic sci fi turn based soundtrack...
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",game post soviet country chernarus struck unkn...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,action adventure massively access simulation f...
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,action free world rpg pvp multiplayer free spa...


# Preprocessing

In [13]:
reviews_rating = [
    "Overwhelmingly Negative",
    "Very Negative",
    "Negative",
    "Mostly Negative",
    'Mixed',
    "Mostly Positive",
    "Positive",
    "Very Positive",
    "Overwhelmingly Positive"
]

In [59]:
array_transf = FunctionTransformer(lambda array: array.toarray())

text_transformer_1 = make_pipeline(TfidfVectorizer(min_df=0.1), array_transf,RobustScaler())

text_transformer_2 = make_pipeline(TfidfVectorizer(min_df=0.05),array_transf, RobustScaler())


ord_transformer = make_pipeline(
    OrdinalEncoder(
        categories=[reviews_rating],
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1
    ), StandardScaler())

num_transformer = make_pipeline(StandardScaler())


preproc_basic = make_column_transformer(
    (text_transformer_1, ['metadata']),
    (text_transformer_2, ['game_description']),
    (ord_transformer, ['reviews']),
    (num_transformer, ['mature_content', 'achievements']),
    remainder='drop'
)

preproc_basic

In [60]:
vec = TfidfVectorizer(min_df = 0.05).fit(df.metadata)
vectors = vec.transform(df.metadata)
vectors.shape

(24567, 47)

In [57]:
encoder = OrdinalEncoder(
    categories=[reviews_rating],
    dtype=np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

encoder.fit(df[['metadata']])

rating = encoder.transform(df[['metadata']])
rating.shape

(24567, 1)

In [61]:
preproc_basic.fit_transform(df)

ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.