In [None]:
import spacy
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Error loading corpus: Package 'corpus' not found in index


In [None]:
# Load the stop words and lemmatizing model
en_model = spacy.load('en_core_web_sm')
stopwords = en_model.Defaults.stop_words
wnl = WordNetLemmatizer()

In [None]:
# create new token list lemmatized, rid of stop words and lower cased
with open('token_list.txt') as f:
  with open('processed_token_list.txt','w') as w:
    for line in f:
      tokens = [token.strip() for token in line.split(',')]
      clean_tokens = [wnl.lemmatize(tokens[i]).lower() for i in range(len(tokens)-1) if wnl.lemmatize(tokens[i]) not in stopwords]
      w.write(','.join(clean_tokens)+'\n')
      

# Load Data from Processed Tokens List

In [None]:
token_list_path = '/content/processed_token_list.txt'
data = []
with open(token_list_path) as f:
  for line in f:
    tokens = [token.strip() for token in line.split(',')]
    data.append(" ".join(tokens[:-1]))
print(data)

In [None]:
len(data)

716

In [None]:
# Load Training Labels
labels = []
label_file = 'labels.txt'
with open(label_file) as f:
  for label in f:
    labels.append(int(label.strip()))
f.close()

# Process the data according to XGBoost's needs

In [None]:
# Vectorizing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary = True)
cv.fit(data)
data_vectorized = cv.transform(data)
data_vectorized

<716x7165 sparse matrix of type '<class 'numpy.int64'>'
	with 38482 stored elements in Compressed Sparse Row format>

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
data_resampled, label_resampled = smote.fit_resample(data_vectorized, labels)

In [None]:
# split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_resampled, label_resampled, test_size=0.1, random_state=42)

In [None]:
# creating a variable for the new train and test sets
import xgboost as xgb
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, y_test)

# Model

In [None]:
import numpy as np
num_classes = len(np.unique(labels))
param = {'eta': 0.65,
         'max_depth': 70,
         'objective': 'multi:softmax',
         'num_class': num_classes}

xgb_model = xgb.train(param, xgb_train, num_boost_round = 40)

In [None]:
y_pred = xgb_model.predict(xgb_test)
print(len(y_pred))
print(y_pred[:10])

145
[5. 2. 0. 8. 7. 4. 4. 2. 4. 6.]


In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
print("XGBoost Performance")
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("F1 score:     ",f1_score(y_test, y_pred, average='weighted'))
print("precision", precision_score(y_test, y_pred, average='weighted'))
print("recall", recall_score(y_test, y_pred, average='weighted'))

XGBoost Performance
Test accuracy: 0.6896551724137931
F1 score:      0.6915160100961318
precision 0.7183750428070915
recall 0.6896551724137931
