In [9]:
import gc
import os
import numpy as np
import pandas as pd
from time import time
from pprint import pprint
from sklearn.svm import LinearSVC
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# ============================================================================
# Load data
#
# Reads the training CSV, keeps the text column and the label column as
# separate Series, reports their sizes, then frees the full frame.

print("Loading data...")
data_file = "../raw_data/train_demo.csv"
train_frame = pd.read_csv(data_file)

# X_text: words of samples (documents); y: labels (1 ~ 19)
X_text, y = train_frame['word_seg'], train_frame['class']
num_classes = max(y)
print("The number of samples is: %d" % (len(X_text),))
print("The number of classes is: %d" % (num_classes,))

# Only the two Series above are needed from here on; drop the full frame.
del train_frame
gc.collect()

Loading data...
The number of samples is: 4999
The number of classes is: 19


7

In [3]:
# ============================================================================
# Load character/word embedding
#
# Reads the embedding table from disk and discards its 'class' column so
# that only the remaining columns are kept as features.

word_embed_file = "../processed_data/train-data-300d-mean.txt"
word_embed = (
    pd.read_csv(word_embed_file)
    .drop(columns=['class'])
)

In [5]:
# ============================================================================
# Extract TF-IDF features
#
# Builds a TfidfVectorizer from the settings below, fits it on X_text and
# stores the resulting feature matrix in X, timing the fit.

# Keyword arguments forwarded unchanged to TfidfVectorizer.
vect_params = dict(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    max_features=100,
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**vect_params)
print("Vectorizer's hyper-parameters:")
pprint(vect_params)

print("Extract features...")
t0_extract = time()
X = vectorizer.fit_transform(X_text)
elapsed_extract = time() - t0_extract
print("Done in %.3f seconds" % elapsed_extract)
print("Extract finished! ( ^ _ ^ ) V")

Vectorizer's hyper-parameters:
{'max_df': 0.9,
 'max_features': 100,
 'min_df': 5,
 'ngram_range': (1, 2),
 'sublinear_tf': True}
Extract features...
Done in 16.600 seconds
Extract finished! ( ^ _ ^ ) V


In [10]:
# ============================================================================
# Concatenate TF-IDF features and embedding features
#
# Produces X = [TF-IDF columns | embedding columns] as a CSR matrix.

# Keep only the TF-IDF columns of X before appending the embeddings. On the
# first run this slice is the whole matrix; on a re-run it strips the embedding
# columns appended previously, so running this cell repeatedly no longer keeps
# widening X with duplicate embedding columns.
n_tfidf_features = len(vectorizer.vocabulary_)
X = hstack([X[:, :n_tfidf_features], csr_matrix(word_embed)], format='csr')

In [13]:
# Inspection only: convert the whole sparse feature matrix X to a dense array
# so the notebook displays its values.
X.toarray()

array([[ 0.0932443 ,  0.        ,  0.22077235, ..., -0.04485132,
         0.14054012,  0.04720277],
       [ 0.12778936,  0.11224537,  0.1177972 , ..., -0.10544113,
         0.02626771, -0.01324177],
       [ 0.        ,  0.10095127,  0.03691131, ..., -0.12735154,
        -0.07328191,  0.11236589],
       ...,
       [ 0.08160263,  0.1013473 ,  0.17281348, ..., -0.09613179,
        -0.00730957,  0.09377087],
       [ 0.07481859,  0.07473134,  0.10008863, ...,  0.06953936,
         0.00994381, -0.06601639],
       [ 0.18337967,  0.14777703,  0.23950951, ..., -0.0775608 ,
        -0.05436005,  0.11374065]])

In [7]:
# ============================================================================
# Train the SVM model
#
# Holds out 20% of the samples for validation, fits a linear SVM on the rest
# and reports accuracy and weighted F1 on both splits.

print("Split data into training and validation set...")
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

print("Start training...")
# Seed the classifier (same seed as the split) so that repeated runs of this
# cell produce the same model and the same reported scores.
clf = LinearSVC(random_state=42)
t0_train = time()
clf.fit(X_train, y_train - 1)  # train on zero-based labels (0 ~ 18)
print("Done in %.3f seconds" % (time() - t0_train))
print("Training finish! ( ^ _ ^ ) V ")

# Predictions are zero-based; add 1 to compare against the original labels.
pred_train = clf.predict(X_train) + 1
pred_val = clf.predict(X_val) + 1
acc_train = accuracy_score(y_train, pred_train)
acc_val = accuracy_score(y_val, pred_val)
f1_train = f1_score(y_train, pred_train, average='weighted')
f1_val = f1_score(y_val, pred_val, average='weighted')
print("Train Accuracy: %.2f, Validate Accuracy: %.2f" % (acc_train * 100, acc_val * 100))
print("Train F1 Score: %.5f, Validate F1 Score: %.5f" % (f1_train, f1_val))

Split data into training and validation set...
Start training...
Done in 22.287 seconds
Training finish! ( ^ _ ^ ) V 
Train Accuracy: 99.97, Validate Accuracy: 69.10
Train F1 Score: 0.99975, Validate F1 Score: 0.68977
