In [12]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import scipy
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [13]:
import os            ##  This module is for "operating system" interfaces
import sys           ##  This module is for functionality relevant to the python run time
path_to_datafolder = 'C:/Users/mjdom/source/repos/mdst_nlp_2021/data'
print(os.listdir(path_to_datafolder))

['test.csv', 'train.csv']


In [14]:
df = pd.read_csv(path_to_datafolder+'/train.csv')
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [15]:
X = df["text"].copy()
#X = df["text"]

authors = df["author"].copy()

# Label data
y = []
for author in authors:
    if author == "EAP":
        y.append([1, 0, 0])
    if author == "HPL":
        y.append([0, 1, 0])
    if author == "MWS":
        y.append([0, 0, 1])

y = np.array(y)

y_one_vector = []
for author in authors:
    if author == "EAP":
        y_one_vector.append(0)
    if author == "HPL":
        y_one_vector.append(1)
    if author == "MWS":
        y_one_vector.append(2)

y_one_vector = np.array(y_one_vector)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Convert text to Td-Idf encoding, sparse tensors to save data

In [105]:
max_features = 1000000
tfidf_vec = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode='tf_idf', sparse=True, ngrams=1)
with tf.device('/device:CPU:0'):
    tfidf_vec.adapt(X)
vocab = tfidf_vec.get_vocabulary()

In [106]:
tdidf = tf.keras.Sequential([
    tfidf_vec])

In [107]:
x_train_sparce = tdidf.predict(X_train)
x_test_sparce = tdidf.predict(X_test)
x_test_sparce.shape

TensorShape([3916, 25412])

### Covert Tensorflow Sparse tensor to Scipy sparse matrix

In [108]:
row  = np.array(x_train_sparce.indices[:,0])
col  = np.array(x_train_sparce.indices[:,1])
data = np.array(x_train_sparce.values)
x_xgb_train = scipy.sparse.coo_matrix((data, (row, col)), shape=(x_train_sparce.shape.as_list()))


row_test  = np.array(x_test_sparce.indices[:,0])
col_test  = np.array(x_test_sparce.indices[:,1])
data_test = np.array(x_test_sparce.values)
x_xgb_test = scipy.sparse.coo_matrix((data_test, (row_test, col_test)), shape=(x_test_sparce.shape.as_list()))

### XGBClassifier

In [117]:
eval_set = [(x_xgb_train, np.argmax(y_train,axis = 1)), (x_xgb_test, np.argmax(y_test,axis = 1))]
eval_set = [(x_xgb_train, np.argmax(y_train,axis = 1)), (x_xgb_test, np.argmax(y_test,axis = 1))]

clf = xgb.XGBClassifier( colsample_bytree = .7,
                        subsample = .8,
                        learning_rate = 0.5,
                        max_depth = 3,
                        num_class =3,
                        objective ='multi:softprob',

                        n_estimators =2000,)

clf.fit(x_xgb_train, np.argmax(y_train,axis = 1),  early_stopping_rounds=50, eval_metric=[ "mlogloss"], eval_set=eval_set,verbose=200)

[0]	validation_0-mlogloss:1.04892	validation_1-mlogloss:1.05066




[200]	validation_0-mlogloss:0.48433	validation_1-mlogloss:0.60713
[400]	validation_0-mlogloss:0.36113	validation_1-mlogloss:0.54313
[600]	validation_0-mlogloss:0.28897	validation_1-mlogloss:0.51552
[800]	validation_0-mlogloss:0.24070	validation_1-mlogloss:0.50595
[864]	validation_0-mlogloss:0.22870	validation_1-mlogloss:0.50653


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=2000, n_jobs=12, num_class=3, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [118]:
clf.predict_proba(x_xgb_test)

array([[0.9784038 , 0.00949988, 0.01209638],
       [0.58568215, 0.07392627, 0.34039158],
       [0.46202946, 0.43839046, 0.09958004],
       ...,
       [0.00837678, 0.9861479 , 0.00547531],
       [0.03780356, 0.96076554, 0.00143095],
       [0.7275311 , 0.21573621, 0.05673267]], dtype=float32)

In [119]:
from sklearn.metrics import log_loss
print('log loss:',log_loss(y_test,clf.predict_proba(x_xgb_test)))

log loss: 0.5051460360975915


In [120]:
print('accuracy',np.sum(np.argmax(clf.predict_proba(x_xgb_test), axis = 1)== np.argmax(y_test,axis = 1))/len(np.argmax(y_test,axis = 1)))

accuracy 0.7967313585291114
