In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the preprocessed data
train_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_training_data.csv")
test_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_test_data.csv")

In [4]:
train_df.head()

Unnamed: 0,Text,Category,WordCount,TokenCount
0,bahia cocoa review shower continu throughout w...,trade,486,526
1,dean food df see strong 4th qtr earn dean food...,acq,233,269
2,magma lower copper 075 cent 66 ct magma copper...,interest,27,31
3,januari hous sale drop realti group say sale p...,earn,92,104
4,asset money market mutual fund rose 7204 mln d...,earn,13,13


In [5]:
test_df["Category"].value_counts()

earn        1087
acq          699
crude        144
interest     112
money-fx     112
trade         88
Name: Category, dtype: int64

In [6]:
train_df["Category"].value_counts()

earn        2923
acq         1682
crude        356
interest     287
trade        241
money-fx     221
Name: Category, dtype: int64

In [7]:
# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = train_test_split(train_df["Text"], train_df['Category'], test_size=0.2)


In [8]:
test_x = test_df['Text']

In [9]:
test_y = test_df['Category']

In [10]:
# label encode the target variable
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)
valid_y = encoder.fit_transform(valid_y)

## Tf-Idf and XGBoost

In [11]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_df["Text"])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [12]:
#Creating an XGBoost classifier
model = xgb.XGBClassifier()

In [13]:
#Training the model on the training data
model.fit(xtrain_tfidf, train_y, eval_set=[(xvalid_tfidf, valid_y)], early_stopping_rounds=10, verbose = True)



[0]	validation_0-mlogloss:1.60569
[1]	validation_0-mlogloss:1.49838
[2]	validation_0-mlogloss:1.43092
[3]	validation_0-mlogloss:1.38891
[4]	validation_0-mlogloss:1.35763
[5]	validation_0-mlogloss:1.33535
[6]	validation_0-mlogloss:1.32172
[7]	validation_0-mlogloss:1.31398
[8]	validation_0-mlogloss:1.30740
[9]	validation_0-mlogloss:1.30468
[10]	validation_0-mlogloss:1.30300
[11]	validation_0-mlogloss:1.30261
[12]	validation_0-mlogloss:1.30361
[13]	validation_0-mlogloss:1.30522
[14]	validation_0-mlogloss:1.30388
[15]	validation_0-mlogloss:1.30324
[16]	validation_0-mlogloss:1.30505
[17]	validation_0-mlogloss:1.30469
[18]	validation_0-mlogloss:1.30495
[19]	validation_0-mlogloss:1.30612
[20]	validation_0-mlogloss:1.30694
[21]	validation_0-mlogloss:1.30902


In [14]:
#Making predictions on the test set
predictions = model.predict(xtest_tfidf)

In [15]:
le_name_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
le_name_mapping

{'acq': 0, 'crude': 1, 'earn': 2, 'interest': 3, 'money-fx': 4, 'trade': 5}

In [16]:
decoded_labels = encoder.inverse_transform(test_y)

In [17]:
#Calculating accuracy
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.47769848349687777

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.05      0.09       699
           1       0.00      0.00      0.00       144
           2       0.49      0.95      0.64      1087
           3       0.00      0.00      0.00       112
           4       0.00      0.00      0.00       112
           5       0.00      0.00      0.00        88

    accuracy                           0.48      2242
   macro avg       0.14      0.17      0.12      2242
weighted avg       0.34      0.48      0.34      2242



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
from sklearn.linear_model import LogisticRegression

In [24]:
lgr_clf = LogisticRegression(verbose=2, random_state=0)

In [25]:
lgr_clf.fit(xtrain_tfidf, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
lgr_predictions = lgr_clf.predict(xtest_tfidf)

In [27]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.45227475468331846

Classification Report:
              precision    recall  f1-score   support

           0       0.27      0.12      0.16       699
           1       0.00      0.00      0.00       144
           2       0.48      0.86      0.62      1087
           3       0.00      0.00      0.00       112
           4       0.00      0.00      0.00       112
           5       0.00      0.00      0.00        88

    accuracy                           0.45      2242
   macro avg       0.12      0.16      0.13      2242
weighted avg       0.32      0.45      0.35      2242



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
