# Create DataFrames

In [13]:
import pandas as pd
import re
from random import shuffle
from os import system

In [14]:
csv_dir = 'csv/annotation01_tsukuba_corpus_20140930.tsv'

reader = pd.read_csv(csv_dir, sep='\t', usecols=[4, 5], header=None)
#reader.sample(10) 

In [15]:
def append_classify_sentence(sentimentList: list, csvTable: pd.DataFrame, labelName: str) -> list:
    for idx,data in csvTable.iterrows():
        if (data[4] == labelName):
            sentimentList.append((data[5], data[4]))

dataList = []
append_classify_sentence(dataList, reader, 'p')
append_classify_sentence(dataList, reader, 'k')
append_classify_sentence(dataList, reader, 'y')
append_classify_sentence(dataList, reader, 'e')
shuffle(dataList) # 3157 records in total

train_list = dataList[:2657]
test_list = dataList[-500:]

def create_data_frame(tlist: list) -> pd.DataFrame:
    text, label = tuple(zip(*tlist))
    text = list(map(lambda txt: re.sub('(<br\s*/?>)+', ' ', txt), text)) # replacing line breaks with spaces

    return pd.DataFrame({'text': text, 'label': label})

tsukuba_train = create_data_frame(train_list)
tsukuba_test = create_data_frame(test_list)
# tsukuba_train
# tsukuba_test

In [21]:
tsukuba_train.to_csv('csv/tsukuba_train.csv', index=False)
tsukuba_test.to_csv('csv/tsukuba_test.csv', index=False)

Unnamed: 0,text,label
0,子連れだと、気になること等色々あるのですが、みんな楽しく過ごせました。,p
1,ちょっと接客慣れしていない感じの方が２人ほど対応してくださっていましたが、他にチェックインな...,k
2,お得なプランで満足です。,p
3,また隣接するショッピングモールで買い物をすると観覧車が無料になるとネットで見たのであの広いモ...,k
4,バスルームにも違いがあり、今回は浴槽の水深が深いタイプで、肩までどっぷり漬かれました。,p
...,...,...
2652,ちなみにチェックアウトは余りにもスムーズで“ウェルカム”ってよりは“ウェルリーヴ”ってのがこ...,k
2653,希望の部屋もこちらの無理を言ってお願いしましたが用意してくれてあって良かったです。,p
2654,一人で泊まれる温泉旅館のプランは他にはあまりないので、このプランはとてもありがたいです。,p
2655,駅からは遠いが繁華街が近いのはメリット。,e


# Text Vectorization

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
import nagisa # A Japanese tokenizer to ensure CountVectorizer can be executed correctly

Unnamed: 0,text,label
0,子連れだと、気になること等色々あるのですが、みんな楽しく過ごせました。,p
1,ちょっと接客慣れしていない感じの方が２人ほど対応してくださっていましたが、他にチェックインな...,k
2,お得なプランで満足です。,p
3,また隣接するショッピングモールで買い物をすると観覧車が無料になるとネットで見たのであの広いモ...,k
4,バスルームにも違いがあり、今回は浴槽の水深が深いタイプで、肩までどっぷり漬かれました。,p
...,...,...
2652,ちなみにチェックアウトは余りにもスムーズで“ウェルカム”ってよりは“ウェルリーヴ”ってのがこ...,k
2653,希望の部屋もこちらの無理を言ってお願いしましたが用意してくれてあって良かったです。,p
2654,一人で泊まれる温泉旅館のプランは他にはあまりないので、このプランはとてもありがたいです。,p
2655,駅からは遠いが繁華街が近いのはメリット。,e


In [45]:
## define a japanese tokenizer
def tokenize_jp(doc):
    doc = nagisa.filter(doc, filter_postags=['助詞', '補助記号', '助動詞'])
    return doc.words

## Unigram Counts
unigram_vectorizer = CountVectorizer(ngram_range=(1,1), tokenizer=tokenize_jp)
unigram_vectorizer.fit_transform(tsukuba_train['text'].values)

dump(unigram_vectorizer, 'data_preprocessors/unigram_vectorizer.joblib')
# unigram_vectorizer = load('data_preprocessors/unigram_vectorizer.joblib')

X_train_unigram = unigram_vectorizer.transform(tsukuba_train['text'].values)

save_npz('vectorized_data/X_train_unigram.npz', X_train_unigram)
# X_train_unigram = load_npz('vectorized_data/X_train_unigram.npz')

## Unigram Tf-Idf
unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X_train_unigram)

dump(unigram_tf_idf_transformer, 'data_preprocessors/unigram_tf_idf_transformer.joblib')
# unigram_tf_idf_transformer = load('data_preprocessors/unigram_tf_idf_transformer.joblib')

X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)

save_npz('vectorized_data/X_train_unigram_tf_idf.npz', X_train_unigram_tf_idf)
# X_train_unigram_tf_idf = load_npz('vectorized_data/X_train_unigram_tf_idf.npz')

## Bigram Counts
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenize_jp)
bigram_vectorizer.fit(tsukuba_train['text'].values)

dump(bigram_vectorizer, 'data_preprocessors/bigram_vectorizer.joblib')
# bigram_vectorizer = load('data_preprocessors/bigram_vectorizer.joblib')

X_train_bigram = bigram_vectorizer.transform(tsukuba_train['text'].values)

save_npz('vectorized_data/X_train_bigram.npz', X_train_bigram)
# X_train_bigram = load_npz('vectorized_data/X_train_bigram.npz')

## Bigram Tf-Idf
bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)

dump(bigram_tf_idf_transformer, 'data_preprocessors/bigram_tf_idf_transformer.joblib')
# bigram_tf_idf_transformer = load('data_preprocessors/bigram_tf_idf_transformer.joblib')

X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

save_npz('vectorized_data/X_train_bigram_tf_idf.npz', X_train_bigram_tf_idf)
# X_train_bigram_tf_idf = load_npz('vectorized_data/X_train_bigram_tf_idf.npz')



# Choosing Data Format

In [46]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

In [47]:
def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

y_train = tsukuba_train['label'].values

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_bigram, y_train, 'Bigram Counts')
train_and_show_scores(X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')

Unigram Counts
Train score: 1.0 ; Validation score: 0.74

Unigram Tf-Idf
Train score: 1.0 ; Validation score: 0.75

Bigram Counts
Train score: 1.0 ; Validation score: 0.75

Bigram Tf-Idf
Train score: 1.0 ; Validation score: 0.77



# Using Cross-Validation for hyperparameter tuning

In [48]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [49]:
X_train = X_train_bigram_tf_idf

In [50]:
# Phase 1: loss, learning rate and initial learning rate

clf = SGDClassifier()

distributions = dict(
    loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    learning_rate=['optimal', 'invscaling', 'adaptive'],
    eta0=uniform(loc=1e-7, scale=1e-2)
)

random_search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=distributions,
    cv=5,
    n_iter=50
)
random_search_cv.fit(X_train, y_train)
print(f'Best params: {random_search_cv.best_params_}')
print(f'Best score: {random_search_cv.best_score_}')

Best params: {'eta0': 0.00995997003866562, 'learning_rate': 'adaptive', 'loss': 'squared_hinge'}
Best score: 0.7715383090494599


In [51]:
# Phase 2: penalty and alpha

clf = SGDClassifier()

distributions = dict(
    penalty=['l1', 'l2', 'elasticnet'],
    alpha=uniform(loc=1e-6, scale=1e-4)
)

random_search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=distributions,
    cv=5,
    n_iter=50
)
random_search_cv.fit(X_train, y_train)
print(f'Best params: {random_search_cv.best_params_}')
print(f'Best score: {random_search_cv.best_score_}')

Best params: {'alpha': 2.5534132650005583e-05, 'penalty': 'elasticnet'}
Best score: 0.7749260156039817


# Saving the best classifier

In [54]:
sgd_classifier = random_search_cv.best_estimator_

dump(random_search_cv.best_estimator_, 'classifiers/sgd_classifier.joblib')
# sgd_classifier = load('classifiers/sgd_classifier.joblib')

['classifiers/sgd_classifier.joblib']

# Testing model & scoring

In [55]:
X_test = bigram_vectorizer.transform(tsukuba_test['text'].values)
X_test = bigram_tf_idf_transformer.transform(X_test)
y_test = tsukuba_test['label'].values

score = sgd_classifier.score(X_test, y_test)
print(score)

0.772


# Using model to predict

In [59]:
model = load('classifiers/sgd_classifier.joblib')
ozlab_predict = pd.read_csv('csv/ozlab_predict.csv', header=0)
X_predict = bigram_vectorizer.transform(ozlab_predict['text'].values)
X_predict = bigram_tf_idf_transformer.transform(X_predict)
model.predict(X_predict)

array(['p', 'k'], dtype='<U1')