# Assignment #2 - SOSC5500 - Group4
    
**Student: WU Jinfeng, XU Muyao**

## Import Packages

In [1]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None
import re
import collections
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn import svm
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

## Open the data

In [2]:
os.chdir(r"C:\Users\Jinfeng\OneDrive - HKUST Connect\Group_Work_SOSC5500\SOSC5500-Assignments\Assignment_2 - Due April 4\Data") 

train = pd.read_csv('game_train.csv')
test  = pd.read_csv('game_test.csv')

X_Train        =  train.drop(columns = ['user_suggestion','year','title'])
Y_Train        =  pd.get_dummies(train[['user_suggestion']])

X_Predict      =  test.drop(columns = ['year','title'])
Predict_ID     =  test['review_id']

## Functions for Word Stemming

In [3]:
porter = PorterStemmer() 

def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

## Text Cleaning

In [4]:
def clean_text_round1(text):
    text = text.lower()
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\n','', text)
    text = re.sub(r'[^a-zA-Z^ ]', '',  text)
    text = re.sub('  ', ' ',  text)
    text = stemSentence(text)

    return text

round1 = lambda x: clean_text_round1(x)

X_Train_clean   = pd.DataFrame(X_Train.user_review.apply(round1))
X_Predict_clean = pd.DataFrame(X_Predict.user_review.apply(round1))

## Document-Term Matrix based on TF-IDF Model

In [5]:
stop = list(stopwords.words('english'))
stop.extend(['write','game','would','say','review','year','access','also'])

vectorization = TfidfVectorizer(stop_words= stop,ngram_range = (1,4),min_df=3) ## 1-4_gram, and remove less frequent words
X_Train_tf    = vectorization.fit_transform(X_Train_clean.user_review)
X_Train_tf    = pd.DataFrame(X_Train_tf.toarray(), columns=vectorization.get_feature_names())

X_Predict_tf  = vectorization.transform(X_Predict_clean.user_review)
X_Predict_tf  = pd.DataFrame(X_Predict_tf.toarray(), columns=vectorization.get_feature_names())

## Merge the DTM and Game Information

In [6]:
title = pd.concat([train[['title','review_id']],test[['title','review_id']]],axis=0)
title['developer'] = 'NA'
title['publisher'] = 'NA'
title.index = range(0,17490,1)

game_info= pd.read_csv('games.csv') 

for i in range(len(title)):
    title['developer'][i] = list(game_info['developer'][game_info['title'] == title['title'][i]])[0]
    title['publisher'][i] = list(game_info['publisher'][game_info['title'] == title['title'][i]])[0]

ohe = OneHotEncoder()
var_categorical = ['title','developer','publisher']

title = pd.DataFrame(ohe.fit_transform(title[var_categorical]).toarray(), columns= ohe.get_feature_names())
title_train = title[title.index < 10494]
title_test = title[title.index >= 10494]
title_test.index = range(0,6996,1)

X_Train_all   = pd.concat([X_Train_tf, title_train],axis=1)
X_Predict_all = pd.concat([X_Predict_tf, title_test],axis=1)

## Supervised Learning with Different Models

In [None]:
## LASSO
model_lasso = LogisticRegression(C=5,penalty='l1', solver='liblinear')
scores_lasso = cross_val_score(model_lasso, X_Train_all, Y_Train.values.ravel(), cv=3,scoring='f1_macro')
print(scores_lasso) ## [0.85480919 0.84180381 0.84891316]

## Ridge
model_ridge = RidgeClassifier(alpha=0.8)
scores_ridge = cross_val_score(model_ridge, X_Train_all, Y_Train.values.ravel(), cv=3,scoring='f1_macro')
print(scores_ridge) ## [0.85906267 0.84457093 0.86414432]

## SVM
model_svm = svm.SVC()
scores_svm = cross_val_score(model_ridge, X_Train_all, Y_Train.values.ravel(), cv=3,scoring='f1_macro')
print(scores_svm) ## [0.84682203 0.83850338 0.85110879]

## Random Forest
model_RandomForest =RandomForestClassifier(n_estimators=200)
scores_rf = cross_val_score(model_RandomForest, X_Train_all, Y_Train.values.ravel(), cv=3,scoring='f1_macro')
print(scores_rf) ## [0.82727251 0.81669429 0.82981801]

## Boosting Trees
model_BT = GradientBoostingClassifier()
scores_bt = cross_val_score(model_BT, X_Train_all, Y_Train.values.ravel(), cv=3,scoring='f1_macro')
print(scores_bt) ## [0.80030397 0.78173007 0.78261874]

## Tune Parameters for Lasso Model and Ridge Model

The F-1 scores of Lasso model and Ridge model are relatively better, so we choose to tune parameters for these two models. Results show that when C=5, Lasso model can produce the best result; and when Alpha=0.8, Ridge model can produce the best result.

In [None]:
## Tune Parameters for Lasso Model

param_grid = {'C': [i for i in range(1,100,2)],
              'penalty':['l1'],
              'solver':['liblinear']}

bt_Grid = GridSearchCV(estimator  = LogisticRegression(), 
                       param_grid = param_grid, cv = 3, verbose=3, scoring='f1')
bt_Grid.fit(X_Train_tf,Y_Train.values.ravel())
bt_Grid.best_params_ ## [C=5] 

## Tune Parameters for Ridge Model

param_grid = {'alpha': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}

bt_Grid = GridSearchCV(estimator  = RidgeClassifier(), 
                       param_grid = param_grid, cv = 3, verbose=3, scoring='f1')
bt_Grid.fit(X_Train_tf,Y_Train.values.ravel())
bt_Grid.best_params_ ## [Alpha:0.8] 

## Combine the Prediction from Lasso and Ridge Model

In this part, we combined the result from different models. Because the F-1 of Lasso, Ridge models are relatively high, we only consider these two models in this part. If a case is identifies as Yes by Lasso Model or Ridge Model, we code it as Yes. Otherwise, we code a case as No.

With this method, we got the best prediction result and the F-1 score is 0.86077.

In [None]:
c = []
for i in range(len(Y_Predict_Lasso)):
    ci = Y_Predict_Lasso[i]+ Y_Predict_ridge[i]
    if ci>=1:
        c.append(1)
    elif ci<1:
        c.append(0)