# Steam Game Review Aspect Classification

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Data

In [None]:
# Load the Steam aspect CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/steam-aspect-training/steam-aspect.csv')

# Preview the first five rows.
data.head(5)

## Combine columns

In [None]:
def _join_context(before, now, after):
    """Return `now` joined with its neighbouring words by single spaces.

    The '[START]' and '[END]' sentinel values mark a missing neighbour and are
    left out of the result. Every value that is kept is passed through str().
    """
    parts = []
    if before != '[START]':
        parts.append(str(before))
    parts.append(str(now))
    if after != '[END]':
        parts.append(str(after))
    # Joining the kept parts avoids the stray leading space that plain string
    # concatenation produces when there is no preceding word.
    return " ".join(parts)


# Iterate over the columns positionally with zip() instead of looking up
# data[col][i] for every row: it is faster and does not depend on `data`
# having a default 0..n-1 index.
arr_words_no_pos_tag = [
    _join_context(before, now, after)
    for before, now, after in zip(data['word_before'], data['word_now'], data['word_after'])
]

# The POS-tag variant is the same text with the tag appended as an extra token.
arr_words = [
    context + " " + str(tag)
    for context, tag in zip(arr_words_no_pos_tag, data['pos_tag'])
]

# Labels are taken positionally so they stay lined up with the lists above.
labels = data['class'].to_numpy()

aspect_data = pd.DataFrame({'review': arr_words, 'class': labels})
aspect_data_no_pos_tag = pd.DataFrame({'review': arr_words_no_pos_tag, 'class': labels})
aspect_data_no_pos_tag.head()

## Train Test Split Data

In [None]:
# A fixed seed makes the split reproducible across "Restart & Run All".
# Both frames have the same number of rows, so using the same seed for both
# calls also selects the same rows for train/test in each variant, which makes
# the "with POS tag" vs "without POS tag" scores directly comparable.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    aspect_data['review'],
    aspect_data['class'],
    test_size=0.33,
    random_state=SPLIT_SEED,
)
X_train_no_pos_tag, X_test_no_pos_tag, y_train_no_pos_tag, y_test_no_pos_tag = train_test_split(
    aspect_data_no_pos_tag['review'],
    aspect_data_no_pos_tag['class'],
    test_size=0.33,
    random_state=SPLIT_SEED,
)

## Feature Extraction

### 1. With Pos Tag

In [None]:
# Fit the TF-IDF vocabulary (capped at 256 features, binary term presence)
# on the training split only; the test split is transformed with that same
# fitted vocabulary.
tfidf = TfidfVectorizer(binary=True, use_idf=True, max_features=256)
tfidf = tfidf.fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The names are passed directly (not wrapped in another
# list) so the columns form a flat Index rather than a one-level MultiIndex.
feature_names = tfidf.get_feature_names_out()

X_train_tfidf = pd.DataFrame(tfidf.transform(X_train).toarray(), columns=feature_names)
X_test_tfidf = pd.DataFrame(tfidf.transform(X_test).toarray(), columns=feature_names)

X_train_tfidf

### 2. No Pos Tag

In [None]:
# Fit the TF-IDF vocabulary (capped at 256 features, binary term presence)
# on the training split of the no-POS-tag text only; the test split is
# transformed with that same fitted vocabulary.
tfidf_no_pos_tag = TfidfVectorizer(binary=True, use_idf=True, max_features=256)
tfidf_no_pos_tag = tfidf_no_pos_tag.fit(X_train_no_pos_tag)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The names are passed directly (not wrapped in another
# list) so the columns form a flat Index rather than a one-level MultiIndex.
feature_names_no_pos_tag = tfidf_no_pos_tag.get_feature_names_out()

X_train_tfidf_no_pos_tag = pd.DataFrame(
    tfidf_no_pos_tag.transform(X_train_no_pos_tag).toarray(),
    columns=feature_names_no_pos_tag,
)
X_test_tfidf_no_pos_tag = pd.DataFrame(
    tfidf_no_pos_tag.transform(X_test_no_pos_tag).toarray(),
    columns=feature_names_no_pos_tag,
)

X_train_tfidf_no_pos_tag

## Classification

### 1.a. Logistic Regression With Pos Tag

In [None]:
# Logistic regression for the POS-tag features. C is the inverse of the
# regularization strength, so C=1000 applies only weak regularization.
# The liblinear solver shuffles the data internally; fixing random_state
# makes the fitted model reproducible between runs.
lg = LogisticRegression(C=1000, solver='liblinear', random_state=42)

In [None]:
# Train the POS-tag logistic regression on the TF-IDF training features.
lg.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Mean accuracy of the POS-tag logistic regression on the held-out split.
lg.score(X=X_test_tfidf, y=y_test)

### 1.b. Logistic Regression Without Pos Tag

In [None]:
# Logistic regression for the no-POS-tag features. C is the inverse of the
# regularization strength, so C=1000 applies only weak regularization.
# The liblinear solver shuffles the data internally; fixing random_state
# makes the fitted model reproducible between runs.
lg_no_pos_tag = LogisticRegression(C=1000, solver='liblinear', random_state=42)

In [None]:
# Train the no-POS-tag logistic regression on its TF-IDF training features.
lg_no_pos_tag.fit(X=X_train_tfidf_no_pos_tag, y=y_train_no_pos_tag)

In [None]:
# Mean accuracy of the no-POS-tag logistic regression on the held-out split.
lg_no_pos_tag.score(X=X_test_tfidf_no_pos_tag, y=y_test_no_pos_tag)

### 2.a. SVM With Pos Tag

In [None]:
# Linear-kernel support vector classifier for the POS-tag features.
svc = SVC(kernel='linear', C=1)

In [None]:
# Train the POS-tag SVM on the TF-IDF training features.
svc.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Mean accuracy of the POS-tag SVM on the held-out split.
svc.score(X=X_test_tfidf, y=y_test)

### 2.b. SVM Without Pos Tag

In [None]:
# Linear-kernel support vector classifier for the no-POS-tag features.
svc_no_pos_tag = SVC(kernel='linear', C=1)

In [None]:
# Train the no-POS-tag SVM on its TF-IDF training features.
svc_no_pos_tag.fit(X=X_train_tfidf_no_pos_tag, y=y_train_no_pos_tag)

In [None]:
# Mean accuracy of the no-POS-tag SVM on the held-out split.
svc_no_pos_tag.score(X=X_test_tfidf_no_pos_tag, y=y_test_no_pos_tag)

## Save Model

In [None]:
# Persist the POS-tag logistic regression. The `with` block guarantees the
# file is flushed and closed even if pickling raises.
# NOTE(review): only the classifiers are pickled in the cells visible here;
# the fitted TfidfVectorizer would also be needed to featurize new text
# before calling this model - confirm it is saved elsewhere.
with open("aspect_lg.p", "wb") as model_file:
    pickle.dump(lg, model_file)

In [None]:
# Persist the no-POS-tag logistic regression. The `with` block guarantees the
# file is flushed and closed even if pickling raises.
with open("aspect_lg_no_pos_tag.p", "wb") as model_file:
    pickle.dump(lg_no_pos_tag, model_file)

In [None]:
# Persist the POS-tag SVM. The `with` block guarantees the file is flushed
# and closed even if pickling raises.
with open("aspect_svc.p", "wb") as model_file:
    pickle.dump(svc, model_file)

In [None]:
# Persist the no-POS-tag SVM. The `with` block guarantees the file is flushed
# and closed even if pickling raises.
with open("aspect_svc_no_pos_tag.p", "wb") as model_file:
    pickle.dump(svc_no_pos_tag, model_file)