In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the preprocessed data
train_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_training_data.csv")
test_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_testing_data.csv")

In [4]:
train_df.head()

Unnamed: 0,Text,Category
0,argentin 198687 grainoilse registr argentin gr...,wheat
1,champion product ch approv stock split champio...,earn
2,comput termin system cpml complet sale comput ...,acq
3,cobanco inc cbco year net shr 34 ct v 119 dlr ...,earn
4,ohio mattress omt may lower 1st qtr net ohio m...,acq


In [5]:
test_df["Category"].value_counts()

earn            1087
acq              699
crude            144
interest         112
money-fx         112
trade             88
ship              69
wheat             38
sugar             31
money-supply      30
Name: Category, dtype: int64

In [6]:
train_df["Category"].value_counts()

earn            2850
acq             1613
interest         312
crude            298
trade            289
money-fx         274
ship             167
money-supply     130
wheat            111
sugar            105
Name: Category, dtype: int64

In [7]:
# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = train_test_split(train_df["Text"], train_df['Category'], test_size=0.2)


In [8]:
test_x = test_df['Text']

In [9]:
test_y = test_df['Category']

In [10]:
# label encode the target variable
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)
valid_y = encoder.fit_transform(valid_y)

## Tf-Idf and XGBoost

In [11]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_df["Text"])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [12]:
#Creating an XGBoost classifier
model = xgb.XGBClassifier(random_state = 42)

In [13]:
#Training the model on the training data
model.fit(xtrain_tfidf, train_y, eval_set=[(xvalid_tfidf, valid_y)], early_stopping_rounds=10, verbose = True)
# model.fit(xtrain_tfidf, train_y, verbose = True)



[0]	validation_0-mlogloss:1.29611
[1]	validation_0-mlogloss:0.99402
[2]	validation_0-mlogloss:0.80330
[3]	validation_0-mlogloss:0.66949
[4]	validation_0-mlogloss:0.56963
[5]	validation_0-mlogloss:0.49722
[6]	validation_0-mlogloss:0.43979
[7]	validation_0-mlogloss:0.39673
[8]	validation_0-mlogloss:0.36433
[9]	validation_0-mlogloss:0.33786
[10]	validation_0-mlogloss:0.31787
[11]	validation_0-mlogloss:0.29965
[12]	validation_0-mlogloss:0.28414
[13]	validation_0-mlogloss:0.27382
[14]	validation_0-mlogloss:0.26603
[15]	validation_0-mlogloss:0.25960
[16]	validation_0-mlogloss:0.25443
[17]	validation_0-mlogloss:0.24903
[18]	validation_0-mlogloss:0.24694
[19]	validation_0-mlogloss:0.24339
[20]	validation_0-mlogloss:0.24000
[21]	validation_0-mlogloss:0.23769
[22]	validation_0-mlogloss:0.23675
[23]	validation_0-mlogloss:0.23539
[24]	validation_0-mlogloss:0.23374
[25]	validation_0-mlogloss:0.23314
[26]	validation_0-mlogloss:0.23331
[27]	validation_0-mlogloss:0.23264
[28]	validation_0-mlogloss:0.2

In [14]:
#Making predictions on the test set
predictions = model.predict(xtest_tfidf)

In [15]:
le_name_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
le_name_mapping

{'acq': 0,
 'crude': 1,
 'earn': 2,
 'interest': 3,
 'money-fx': 4,
 'money-supply': 5,
 'ship': 6,
 'sugar': 7,
 'trade': 8,
 'wheat': 9}

In [16]:
decoded_labels = encoder.inverse_transform(test_y)

In [17]:
#Calculating accuracy
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.9381742738589212

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       699
           1       0.87      0.87      0.87       144
           2       0.97      0.99      0.98      1087
           3       0.83      0.83      0.83       112
           4       0.85      0.78      0.81       112
           5       0.85      0.77      0.81        30
           6       0.81      0.70      0.75        69
           7       0.93      0.87      0.90        31
           8       0.89      0.89      0.89        88
           9       0.97      0.95      0.96        38

    accuracy                           0.94      2410
   macro avg       0.89      0.86      0.87      2410
weighted avg       0.94      0.94      0.94      2410



Tf-Idf and Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
lgr_clf = LogisticRegression(multi_class='multinomial', verbose=2, random_state=0)

In [20]:
lgr_clf.fit(xtrain_tfidf, train_y)

In [21]:
lgr_predictions = lgr_clf.predict(xtest_tfidf)

In [22]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.9294605809128631

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       699
           1       0.84      0.84      0.84       144
           2       0.98      0.98      0.98      1087
           3       0.82      0.77      0.79       112
           4       0.79      0.72      0.75       112
           5       0.96      0.73      0.83        30
           6       0.81      0.57      0.67        69
           7       0.92      0.77      0.84        31
           8       0.91      0.92      0.92        88
           9       1.00      0.84      0.91        38

    accuracy                           0.93      2410
   macro avg       0.89      0.81      0.85      2410
weighted avg       0.93      0.93      0.93      2410

