# Steam Game Review Aspect Classification

In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/steam-aspect-training/steam-aspect.csv


## Read Data

In [2]:
data = pd.read_csv('../input/steam-aspect-training/steam-aspect.csv')
data.head()

Unnamed: 0,word_before,word_now,word_after,pos_tag,class
0,[START],My,first,PRP$,False
1,My,first,game,JJ,False
2,first,game,on,NN,True
3,game,on,A3,IN,False
4,on,A3,brought,NNP,False


## Combine columns

In [3]:
aspect_data = pd.DataFrame()
arr_words = []

for i in range(len(data['word_now'])):
    word = ""
    if (data['word_before'][i] != '[START]'):
        word += str(data['word_before'][i])
        
    word += " " + str(data['word_now'][i])
    
    if (data['word_after'][i] != '[END]'):
        word += " " + str(data['word_after'][i])
    
    word += " " + str(data['pos_tag'][i])
    
    arr_words.append(word)

aspect_data['review'] = arr_words
aspect_data['class'] = data['class'].copy()
aspect_data.head()

Unnamed: 0,review,class
0,My first PRP$,False
1,My first game JJ,False
2,first game on NN,True
3,game on A3 IN,False
4,on A3 brought NNP,False


## Train Test Split Data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(aspect_data['review'], aspect_data['class'], test_size=0.33)

## Feature Extraction

In [5]:
tfidf = TfidfVectorizer(binary=True, use_idf = True, max_features=256)
tfidf = tfidf.fit(X_train)

X_train_tfidf = pd.DataFrame(tfidf.transform(X_train).toarray(), columns=[tfidf.get_feature_names()])
X_test_tfidf = pd.DataFrame(tfidf.transform(X_test).toarray(), columns=[tfidf.get_feature_names()])

X_train_tfidf

Unnamed: 0,10,about,actually,after,again,ai,all,almost,alpha,also,...,why,will,with,work,worth,would,wp,wrb,you,your
0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.794579
3,0.598762,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.769569,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48029,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
48030,0.000000,0.0,0.0,0.0,0.0,0.596899,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
48031,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
48032,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


## Classification

### 1. Logistic Regression

In [6]:
lg = LogisticRegression(C=1000, solver='liblinear')

In [7]:
lg.fit(X_train_tfidf, y_train)

LogisticRegression(C=1000, solver='liblinear')

In [8]:
lg.score(X_test_tfidf, y_test)

0.7862637362637362

### 2. SVM

In [9]:
svc = SVC(C=1, kernel='linear')

In [10]:
svc.fit(X_train_tfidf, y_train)

SVC(C=1, kernel='linear')

In [11]:
svc.score(X_test_tfidf, y_test)

0.7718934911242603

## Save Model

In [13]:
pickle.dump(lg, open("aspect_lg.p", "wb"))

In [15]:
pickle.dump(svc, open("aspect_svc.p", "wb"))