In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the preprocessed data
train_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_training_data.csv")
test_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_testing_data.csv")

In [4]:
train_df.head()

Unnamed: 0,Text,Category
0,argentin 198687 grainoilse registr argentin gr...,wheat
1,champion product ch approv stock split champio...,earn
2,comput termin system cpml complet sale comput ...,acq
3,cobanco inc cbco year net shr 34 ct v 119 dlr ...,earn
4,ohio mattress omt may lower 1st qtr net ohio m...,acq


In [5]:
test_df["Category"].value_counts()

earn            1087
acq              699
crude            144
interest         112
money-fx         112
trade             88
ship              69
wheat             38
sugar             31
money-supply      30
Name: Category, dtype: int64

In [6]:
train_df["Category"].value_counts().head(6)

earn        2850
acq         1613
interest     312
crude        298
trade        289
money-fx     274
Name: Category, dtype: int64

In [7]:
top6_cat = ['earn', 'acq', 'interest', 'crude', 'trade', 'money-fx']

In [8]:
train_df = train_df[train_df['Category'].isin(top6_cat)]
test_df = test_df[test_df['Category'].isin(top6_cat)]

In [9]:
# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = train_test_split(train_df["Text"], train_df['Category'], test_size=0.2)


In [10]:
test_x = test_df['Text']

In [11]:
test_y = test_df['Category']

In [12]:
# label encode the target variable
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)
valid_y = encoder.fit_transform(valid_y)

## Tf-Idf and XGBoost


In [13]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_df["Text"])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [14]:
#Creating an XGBoost classifier
model = xgb.XGBClassifier(random_state = 42)

In [15]:
#Training the model on the training data
model.fit(xtrain_tfidf, train_y, eval_set=[(xvalid_tfidf, valid_y)], early_stopping_rounds=10, verbose = True)
# model.fit(xtrain_tfidf, train_y, verbose = True)



[0]	validation_0-mlogloss:1.16610
[1]	validation_0-mlogloss:0.88495
[2]	validation_0-mlogloss:0.70989
[3]	validation_0-mlogloss:0.58379
[4]	validation_0-mlogloss:0.49114
[5]	validation_0-mlogloss:0.42009
[6]	validation_0-mlogloss:0.36765
[7]	validation_0-mlogloss:0.32890
[8]	validation_0-mlogloss:0.30007
[9]	validation_0-mlogloss:0.27659
[10]	validation_0-mlogloss:0.25713
[11]	validation_0-mlogloss:0.24286
[12]	validation_0-mlogloss:0.23174
[13]	validation_0-mlogloss:0.22318
[14]	validation_0-mlogloss:0.21693
[15]	validation_0-mlogloss:0.21077
[16]	validation_0-mlogloss:0.20619
[17]	validation_0-mlogloss:0.20158
[18]	validation_0-mlogloss:0.19904
[19]	validation_0-mlogloss:0.19633
[20]	validation_0-mlogloss:0.19361
[21]	validation_0-mlogloss:0.19143
[22]	validation_0-mlogloss:0.19044
[23]	validation_0-mlogloss:0.18768
[24]	validation_0-mlogloss:0.18713
[25]	validation_0-mlogloss:0.18463
[26]	validation_0-mlogloss:0.18252
[27]	validation_0-mlogloss:0.18106
[28]	validation_0-mlogloss:0.1

In [16]:
#Making predictions on the test set
predictions = model.predict(xtest_tfidf)

In [17]:
le_name_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
le_name_mapping

{'acq': 0, 'crude': 1, 'earn': 2, 'interest': 3, 'money-fx': 4, 'trade': 5}

In [18]:
decoded_labels = encoder.inverse_transform(test_y)

In [19]:
#Calculating accuracy
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.9553969669937555

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       699
           1       0.96      0.92      0.94       144
           2       0.97      0.99      0.98      1087
           3       0.84      0.79      0.82       112
           4       0.79      0.79      0.79       112
           5       0.95      0.89      0.92        88

    accuracy                           0.96      2242
   macro avg       0.91      0.89      0.90      2242
weighted avg       0.96      0.96      0.96      2242



Tf-Idf and Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
lgr_clf = LogisticRegression(multi_class='multinomial', verbose=2, random_state=0)

In [22]:
lgr_clf.fit(xtrain_tfidf, train_y)

In [23]:
lgr_predictions = lgr_clf.predict(xtest_tfidf)

In [24]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.9500446030330062

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       699
           1       0.98      0.90      0.93       144
           2       0.99      0.98      0.98      1087
           3       0.85      0.79      0.81       112
           4       0.83      0.70      0.76       112
           5       0.94      0.92      0.93        88

    accuracy                           0.95      2242
   macro avg       0.92      0.88      0.90      2242
weighted avg       0.95      0.95      0.95      2242



## Tf-idf and Naive Bayes

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
nb_clf = MultinomialNB(force_alpha=True)

In [38]:
nb_clf.fit(xtrain_tfidf, train_y)

In [39]:
nb_predictions = nb_clf.predict(xtest_tfidf)

In [40]:
accuracy = accuracy_score(test_y, nb_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, nb_predictions))

Accuracy: 0.931757359500446

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       699
           1       1.00      0.78      0.88       144
           2       0.96      0.98      0.97      1087
           3       0.85      0.73      0.79       112
           4       0.83      0.71      0.76       112
           5       0.82      0.74      0.78        88

    accuracy                           0.93      2242
   macro avg       0.90      0.82      0.85      2242
weighted avg       0.93      0.93      0.93      2242



### Count Vectorization

In [42]:
# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_df['Text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xtest_count =  count_vect.transform(test_x)
xvalid_count =  count_vect.transform(valid_x)

### Count vectorizer and XGBoost

In [43]:
model.fit(xtrain_count, train_y, eval_set=[(xvalid_count, valid_y)], early_stopping_rounds=10, verbose = True)



[0]	validation_0-mlogloss:1.16251
[1]	validation_0-mlogloss:0.88124
[2]	validation_0-mlogloss:0.70165
[3]	validation_0-mlogloss:0.57704
[4]	validation_0-mlogloss:0.48824
[5]	validation_0-mlogloss:0.41981
[6]	validation_0-mlogloss:0.37024
[7]	validation_0-mlogloss:0.32964
[8]	validation_0-mlogloss:0.29753
[9]	validation_0-mlogloss:0.27349
[10]	validation_0-mlogloss:0.25309
[11]	validation_0-mlogloss:0.23873
[12]	validation_0-mlogloss:0.22677
[13]	validation_0-mlogloss:0.21677
[14]	validation_0-mlogloss:0.20686
[15]	validation_0-mlogloss:0.19939
[16]	validation_0-mlogloss:0.19281
[17]	validation_0-mlogloss:0.18862
[18]	validation_0-mlogloss:0.18533
[19]	validation_0-mlogloss:0.18187
[20]	validation_0-mlogloss:0.17847
[21]	validation_0-mlogloss:0.17611
[22]	validation_0-mlogloss:0.17410
[23]	validation_0-mlogloss:0.17303
[24]	validation_0-mlogloss:0.17048
[25]	validation_0-mlogloss:0.16858
[26]	validation_0-mlogloss:0.16723
[27]	validation_0-mlogloss:0.16555
[28]	validation_0-mlogloss:0.1

In [45]:
predictions = model.predict(xtest_count)

In [46]:
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.9509366636931311

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       699
           1       0.94      0.94      0.94       144
           2       0.98      0.99      0.98      1087
           3       0.81      0.72      0.76       112
           4       0.76      0.78      0.77       112
           5       0.90      0.91      0.90        88

    accuracy                           0.95      2242
   macro avg       0.89      0.88      0.89      2242
weighted avg       0.95      0.95      0.95      2242



### Count vectorization and logistic regression

In [47]:
lgr_clf.fit(xtrain_count, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
lgr_predictions = lgr_clf.predict(xtest_count)

In [50]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.9545049063336307

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       699
           1       0.98      0.92      0.95       144
           2       0.98      0.99      0.99      1087
           3       0.81      0.79      0.80       112
           4       0.74      0.75      0.75       112
           5       0.93      0.92      0.93        88

    accuracy                           0.95      2242
   macro avg       0.90      0.89      0.89      2242
weighted avg       0.95      0.95      0.95      2242



## Count vectorization and Naive Bayes

In [51]:
nb_clf.fit(xtrain_count, train_y)

In [52]:
nb_predictions = nb_clf.predict(xtest_count)

In [53]:
accuracy = accuracy_score(test_y, nb_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, nb_predictions))

Accuracy: 0.9513826940231935

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       699
           1       0.97      0.92      0.95       144
           2       0.99      0.97      0.98      1087
           3       0.85      0.70      0.76       112
           4       0.73      0.84      0.78       112
           5       0.87      0.93      0.90        88

    accuracy                           0.95      2242
   macro avg       0.89      0.89      0.89      2242
weighted avg       0.95      0.95      0.95      2242



### Word2Vec

In [54]:
from gensim.models import Word2Vec

In [55]:
def word2vec_vectorization(data, embedding_dim=100):

    sentences = [doc.split() for doc in data]

    model = Word2Vec(sentences=sentences, vector_size=embedding_dim, min_count=1)

    word_vectors = []
    for doc in sentences:
        doc_vector = np.zeros(embedding_dim)
        word_count = 0
        for word in doc:
            if word in model.wv:
                doc_vector += model.wv[word]
                word_count += 1
        if word_count > 0:
            doc_vector /= word_count
        word_vectors.append(doc_vector)

    return np.array(word_vectors)

In [56]:
X_word2vec_train = word2vec_vectorization(train_x)
X_word2vec_test = word2vec_vectorization(test_x)
X_word2vec_valid = word2vec_vectorization(valid_x)

### Word2Vec and XGBoost

In [57]:
model.fit(X_word2vec_train, train_y, eval_set=[(X_word2vec_valid, valid_y)], early_stopping_rounds=10, verbose = True)

[0]	validation_0-mlogloss:1.85241




[1]	validation_0-mlogloss:1.92190
[2]	validation_0-mlogloss:2.02491
[3]	validation_0-mlogloss:2.09128
[4]	validation_0-mlogloss:2.15438
[5]	validation_0-mlogloss:2.20602
[6]	validation_0-mlogloss:2.21691
[7]	validation_0-mlogloss:2.24301
[8]	validation_0-mlogloss:2.29426
[9]	validation_0-mlogloss:2.36325


In [58]:
predictions = model.predict(X_word2vec_test)

In [59]:
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.19937555753791258

Classification Report:
              precision    recall  f1-score   support

           0       0.04      0.05      0.04       699
           1       0.05      0.27      0.08       144
           2       0.98      0.33      0.50      1087
           3       0.00      0.00      0.00       112
           4       0.67      0.02      0.03       112
           5       0.35      0.09      0.14        88

    accuracy                           0.20      2242
   macro avg       0.35      0.13      0.13      2242
weighted avg       0.54      0.20      0.27      2242



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Word2Vec and Logistic Regression

In [60]:
lgr_clf.fit(X_word2vec_train, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
lgr_predictions = lgr_clf.predict(X_word2vec_test)

In [62]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.3519179304192685

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.99      0.58       699
           1       0.31      0.24      0.27       144
           2       0.64      0.01      0.01      1087
           3       0.00      0.00      0.00       112
           4       0.00      0.00      0.00       112
           5       0.12      0.64      0.21        88

    accuracy                           0.35      2242
   macro avg       0.25      0.31      0.18      2242
weighted avg       0.46      0.35      0.21      2242



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Word2Vec and Naive Bayes

In [63]:
nb_clf.fit(X_word2vec_train, train_y)

ValueError: ignored

In [None]:
nb_predictions = nb_clf.predict(xtest_count)

In [None]:
accuracy = accuracy_score(test_y, nb_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, nb_predictions))