### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
import xgboost as xgb

### Data Loading (only the first 10,000 rows are used, for testing purposes)

In [None]:
# Read the raw train / test files.
trainingSet = pd.read_csv("./data/train.csv")
testingSet = pd.read_csv("./data/test.csv")

# Join the two frames on 'Id'. Both carry a 'Score' column, so the merge
# produces 'Score_x' (from trainingSet) and 'Score_y' (from testingSet).
predictionSet = trainingSet.merge(testingSet, on='Id')
print(predictionSet.columns)

# Keep the right-hand score under its original name and discard the left-hand one.
predictionSet = (
    predictionSet
    .drop(columns=['Score_x'])
    .rename(columns={'Score_y': 'Score'})
)
print(predictionSet.columns)
predictionSet.to_csv("./data/prediction.csv", index=False)

# Rows that actually have a score form the training data.
has_score = trainingSet['Score'].notna()
X_train = trainingSet[has_score]
print(trainingSet.shape)
print(X_train.shape)
X_train.to_csv("./data/X_train.csv", index=False)

In [9]:
# Number of leading rows kept from each file while experimenting.
SAMPLE_ROWS = 10000

train_data = pd.read_csv("./data/X_train.csv").head(SAMPLE_ROWS)
pred_data = pd.read_csv("./data/prediction.csv").head(SAMPLE_ROWS)

# Show both shapes first, then a preview of each frame.
for frame in (train_data, pred_data):
    print(frame.shape)
for frame in (train_data, pred_data):
    print(frame.head())

(10000, 9)
(10000, 9)
   Id   ProductId          UserId  HelpfulnessNumerator  \
0   0  0005019281   ADZPIG9QOCDG5                     0   
1   1  0005019281  A35947ZP82G7JH                     0   
2   2  0005019281  A3UORV8A9D5L2E                     0   
3   3  0005019281  A1VKW06X1O2X7V                     0   
4   4  0005019281  A3R27T4HADWFFJ                     0   

   HelpfulnessDenominator  Score        Time  \
0                       0    4.0  1203984000   
1                       0    3.0  1388361600   
2                       0    3.0  1388361600   
3                       0    5.0  1202860800   
4                       0    4.0  1387670400   

                                        Summary  \
0                     good version of a classic   
1                        Good but not as moving   
2         Winkler's Performance was ok at best!   
3  It's an enjoyable twist on the classic story   
4                              Best Scrooge yet   

                           

### Data pre-processing

In [10]:
# English Snowball stemmer used by the cleaning helpers defined below.
stemmer = SnowballStemmer(language='english')
# English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, since remove_punc is invoked
# once per token.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Parameters
    ----------
    text : str
        Input string (a single token or a longer piece of text).

    Returns
    -------
    str
        ``text`` without ASCII punctuation; all other characters are kept
        in their original order.
    """
    return text.translate(_PUNCT_TABLE)

def clean_helper(text):
    """Turn a sequence of tokens into one normalised, space-separated string.

    Tokens whose lower-cased form is in ``stop_words`` are skipped. Every
    remaining token has its punctuation removed with ``remove_punc``, is
    stemmed with the module-level ``stemmer`` and lower-cased.

    Tokens that end up empty (for example ``","`` or ``"!"``, which consist
    only of punctuation) are dropped, so they no longer leave runs of
    consecutive spaces in the result.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    cleaned = []
    for token in text:
        if token.lower() in stop_words:
            continue
        stem = stemmer.stem(remove_punc(token)).lower()
        if stem:  # skip tokens that were nothing but punctuation
            cleaned.append(stem)
    return " ".join(cleaned).strip()

# Entry point of the text-cleaning pipeline.
def data_cleaning(df):
    """Clean every document in ``df``.

    Each element is first split into tokens with ``word_tokenize`` and the
    token list is then passed to ``clean_helper``, which returns the cleaned
    string for that document.

    Parameters
    ----------
    df : object with an ``apply`` method
        Collection of raw text documents; the notebook passes a pandas Series.

    Returns
    -------
    The result of the two chained ``apply`` calls: one cleaned string per
    input element.
    """
    return df.apply(word_tokenize).apply(clean_helper)

In [11]:
# Only the free-text columns 'Summary' and 'Text' are used as model input.

# Replace NaN with empty strings. The filled column is assigned back instead
# of calling fillna(..., inplace=True) on train_data['Text']: that form acts
# on an intermediate Series, triggers a chained-assignment warning in recent
# pandas, and does not update train_data when Copy-on-Write is enabled.
train_data['Text'] = train_data['Text'].fillna('')
train_data['Summary'] = train_data['Summary'].fillna('')

# Join summary and review body with one space so the last word of the summary
# does not fuse with the first word of the text. The space is added while
# building X_data rather than written back into train_data['Summary'], so
# re-running this cell does not keep appending spaces to that column.
X_data = train_data['Summary'] + " " + train_data['Text']  # text
Y_data = train_data['Score']  # labels

# Tokenise, drop stop words, strip punctuation, stem and lower-case.
X_data_cleaned = data_cleaning(X_data)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data_cleaned, Y_data, test_size=0.2, random_state=0
)

# TF-IDF over uni-, bi- and tri-grams; terms appearing in fewer than 2
# documents or in more than 90% of documents are ignored.
vec = TfidfVectorizer(sublinear_tf=True, min_df=2, max_df=0.9, ngram_range=(1, 3))

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

### Model Training

In [12]:
# Linear SVM classifier. class_weight="balanced" compensates for the uneven
# distribution of scores; random_state is fixed so that any shuffling done by
# the solver is repeatable and the reported error can be reproduced.
svc = LinearSVC(class_weight="balanced", random_state=0)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Evaluation metric: mean squared error between held-out and predicted scores.
print(mean_squared_error(y_test, y_pred))

0.8755


### Use the trained model to predict scores for the prediction set

In [13]:
# Replace NaN with empty strings. Assigning the filled column back (rather
# than fillna(..., inplace=True) on pred_data['Text']) avoids chained in-place
# assignment, which does not update pred_data when pandas Copy-on-Write is on.
pred_data['Text'] = pred_data['Text'].fillna('')
pred_data['Summary'] = pred_data['Summary'].fillna('')

# Same input layout as used for training: summary, one space, review body.
# The space is added here instead of being written back into
# pred_data['Summary'], so re-running the cell leaves that column unchanged.
x_predict = pred_data['Summary'] + " " + pred_data['Text']
# NOTE(review): y_predict is not used anywhere in this cell — confirm whether it is still needed.
y_predict = pred_data['Score']


# Apply the same cleaning and the already-fitted vectorizer, then predict.
x_predict_cleaned = data_cleaning(x_predict)

x_predict_cleaned = vec.transform(x_predict_cleaned)
result = svc.predict(x_predict_cleaned)

In [14]:
# Store the predicted labels in the prediction frame, then keep just the
# identifier and the predicted score for the submission table.
pred_data['Score'] = result
submission = pred_data.loc[:, ['Id', 'Score']]
print(submission.head())
# Writing the submission file is currently disabled:
#submission.to_csv("./data/submission.csv", index=False)

   Id  Score
0   5    5.0
1  11    3.0
2  17    4.0
3  46    5.0
4  47    5.0
