In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the preprocessed data
train_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_training_data.csv")
test_df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_testing_data.csv")

In [4]:
train_df.head()

Unnamed: 0,Text,Category
0,argentin 198687 grainoilse registr argentin gr...,wheat
1,champion product ch approv stock split champio...,earn
2,comput termin system cpml complet sale comput ...,acq
3,cobanco inc cbco year net shr 34 ct v 119 dlr ...,earn
4,ohio mattress omt may lower 1st qtr net ohio m...,acq


In [5]:
test_df["Category"].value_counts()

earn            1087
acq              699
crude            144
interest         112
money-fx         112
trade             88
ship              69
wheat             38
sugar             31
money-supply      30
Name: Category, dtype: int64

In [6]:
train_df["Category"].value_counts().head(6)

earn        2850
acq         1613
interest     312
crude        298
trade        289
money-fx     274
Name: Category, dtype: int64

In [7]:
top6_cat = ['earn', 'acq', 'interest', 'crude', 'trade', 'money-fx']

In [8]:
train_df = train_df[train_df['Category'].isin(top6_cat)]
test_df = test_df[test_df['Category'].isin(top6_cat)]

In [9]:
# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = train_test_split(train_df["Text"], train_df['Category'], test_size=0.2)


In [10]:
test_x = test_df['Text']

In [11]:
test_y = test_df['Category']

In [12]:
# label encode the target variable
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)
valid_y = encoder.fit_transform(valid_y)

### Tf-Idf

In [13]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_df["Text"])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

## Tf-Idf and XGBoost


In [14]:
#Creating an XGBoost classifier
model = xgb.XGBClassifier(random_state = 42)

In [15]:
#Training the model on the training data
model.fit(xtrain_tfidf, train_y, eval_set=[(xvalid_tfidf, valid_y)], early_stopping_rounds=10, verbose = True)
# model.fit(xtrain_tfidf, train_y, verbose = True)



[0]	validation_0-mlogloss:1.16859
[1]	validation_0-mlogloss:0.88607
[2]	validation_0-mlogloss:0.71073
[3]	validation_0-mlogloss:0.58816
[4]	validation_0-mlogloss:0.49586
[5]	validation_0-mlogloss:0.42680
[6]	validation_0-mlogloss:0.37717
[7]	validation_0-mlogloss:0.33821
[8]	validation_0-mlogloss:0.30792
[9]	validation_0-mlogloss:0.28453
[10]	validation_0-mlogloss:0.26504
[11]	validation_0-mlogloss:0.24913
[12]	validation_0-mlogloss:0.23763
[13]	validation_0-mlogloss:0.22865
[14]	validation_0-mlogloss:0.22085
[15]	validation_0-mlogloss:0.21393
[16]	validation_0-mlogloss:0.20847
[17]	validation_0-mlogloss:0.20485
[18]	validation_0-mlogloss:0.20120
[19]	validation_0-mlogloss:0.19864
[20]	validation_0-mlogloss:0.19660
[21]	validation_0-mlogloss:0.19456
[22]	validation_0-mlogloss:0.19224
[23]	validation_0-mlogloss:0.19100
[24]	validation_0-mlogloss:0.18976
[25]	validation_0-mlogloss:0.18784
[26]	validation_0-mlogloss:0.18709
[27]	validation_0-mlogloss:0.18609
[28]	validation_0-mlogloss:0.1

In [16]:
#Making predictions on the test set
predictions = model.predict(xtest_tfidf)

In [17]:
le_name_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
le_name_mapping

{'acq': 0, 'crude': 1, 'earn': 2, 'interest': 3, 'money-fx': 4, 'trade': 5}

In [18]:
decoded_labels = encoder.inverse_transform(test_y)

In [19]:
#Calculating accuracy
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.9451382694023194

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       699
           1       0.93      0.92      0.93       144
           2       0.98      0.98      0.98      1087
           3       0.79      0.78      0.78       112
           4       0.75      0.73      0.74       112
           5       0.94      0.84      0.89        88

    accuracy                           0.95      2242
   macro avg       0.89      0.87      0.88      2242
weighted avg       0.94      0.95      0.94      2242



Tf-Idf and Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
lgr_clf = LogisticRegression(multi_class='multinomial', verbose=2, random_state=0)

In [22]:
lgr_clf.fit(xtrain_tfidf, train_y)

In [23]:
lgr_predictions = lgr_clf.predict(xtest_tfidf)

In [24]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.9513826940231935

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       699
           1       0.97      0.90      0.93       144
           2       0.99      0.98      0.98      1087
           3       0.85      0.78      0.81       112
           4       0.83      0.73      0.78       112
           5       0.94      0.92      0.93        88

    accuracy                           0.95      2242
   macro avg       0.92      0.88      0.90      2242
weighted avg       0.95      0.95      0.95      2242



## Tf-idf and Easy Ensemble

In [25]:
from imblearn.ensemble import EasyEnsembleClassifier

In [26]:
ensemble_clf = EasyEnsembleClassifier(random_state=42)

In [27]:
ensemble_clf.fit(xtrain_tfidf, train_y)

In [28]:
ensemble_predictions = ensemble_clf.predict(xtest_tfidf)

In [29]:
accuracy = accuracy_score(test_y, ensemble_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, ensemble_predictions))

Accuracy: 0.9099018733273863

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93       699
           1       0.91      0.90      0.91       144
           2       1.00      0.92      0.96      1087
           3       0.71      0.73      0.72       112
           4       0.57      0.77      0.65       112
           5       0.84      0.88      0.86        88

    accuracy                           0.91      2242
   macro avg       0.82      0.86      0.84      2242
weighted avg       0.92      0.91      0.91      2242



### Count Vectorization

In [30]:
# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_df['Text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xtest_count =  count_vect.transform(test_x)
xvalid_count =  count_vect.transform(valid_x)

### Count vectorizer and XGBoost

In [31]:
#Creating an XGBoost classifier
model = xgb.XGBClassifier(random_state = 42)

In [32]:
model.fit(xtrain_count, train_y, eval_set=[(xvalid_count, valid_y)], early_stopping_rounds=10, verbose = True)



[0]	validation_0-mlogloss:1.17887
[1]	validation_0-mlogloss:0.89636
[2]	validation_0-mlogloss:0.71797
[3]	validation_0-mlogloss:0.59478
[4]	validation_0-mlogloss:0.50138
[5]	validation_0-mlogloss:0.43455
[6]	validation_0-mlogloss:0.38296
[7]	validation_0-mlogloss:0.34291
[8]	validation_0-mlogloss:0.31298
[9]	validation_0-mlogloss:0.28968
[10]	validation_0-mlogloss:0.26893
[11]	validation_0-mlogloss:0.25316
[12]	validation_0-mlogloss:0.24249
[13]	validation_0-mlogloss:0.23057
[14]	validation_0-mlogloss:0.22217
[15]	validation_0-mlogloss:0.21685
[16]	validation_0-mlogloss:0.21045
[17]	validation_0-mlogloss:0.20682
[18]	validation_0-mlogloss:0.20401
[19]	validation_0-mlogloss:0.20174
[20]	validation_0-mlogloss:0.19957
[21]	validation_0-mlogloss:0.19748
[22]	validation_0-mlogloss:0.19592
[23]	validation_0-mlogloss:0.19599
[24]	validation_0-mlogloss:0.19442
[25]	validation_0-mlogloss:0.19299
[26]	validation_0-mlogloss:0.19163
[27]	validation_0-mlogloss:0.19048
[28]	validation_0-mlogloss:0.1

In [33]:
predictions = model.predict(xtest_count)

In [34]:
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.9451382694023194

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       699
           1       0.93      0.94      0.93       144
           2       0.98      0.99      0.98      1087
           3       0.78      0.73      0.76       112
           4       0.72      0.73      0.73       112
           5       0.91      0.84      0.88        88

    accuracy                           0.95      2242
   macro avg       0.88      0.87      0.87      2242
weighted avg       0.94      0.95      0.94      2242



### Count vectorization and logistic regression

In [35]:
lgr_clf = LogisticRegression(multi_class='multinomial', verbose=2, random_state=0)

In [36]:
lgr_clf.fit(xtrain_count, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
lgr_predictions = lgr_clf.predict(xtest_count)

In [38]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.9522747546833185

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       699
           1       0.98      0.92      0.95       144
           2       0.98      0.99      0.98      1087
           3       0.79      0.76      0.77       112
           4       0.73      0.73      0.73       112
           5       0.94      0.92      0.93        88

    accuracy                           0.95      2242
   macro avg       0.90      0.88      0.89      2242
weighted avg       0.95      0.95      0.95      2242



## Count vectorization and Naive Bayes

In [39]:
ensemble_clf = EasyEnsembleClassifier(random_state=42)

In [40]:
ensemble_clf.fit(xtrain_count, train_y)

In [41]:
ensemble_predictions = ensemble_clf.predict(xtest_count)

In [42]:
accuracy = accuracy_score(test_y, ensemble_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, ensemble_predictions))

Accuracy: 0.9085637823371989

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       699
           1       0.82      0.93      0.87       144
           2       0.99      0.93      0.96      1087
           3       0.72      0.68      0.70       112
           4       0.69      0.71      0.70       112
           5       0.77      0.93      0.85        88

    accuracy                           0.91      2242
   macro avg       0.81      0.85      0.83      2242
weighted avg       0.91      0.91      0.91      2242



### Word2Vec

In [43]:
from gensim.models import Word2Vec

In [44]:
def word2vec_vectorization(data, embedding_dim=100):

    sentences = [doc.split() for doc in data]

    model = Word2Vec(sentences=sentences, vector_size=embedding_dim, min_count=1)

    word_vectors = []
    for doc in sentences:
        doc_vector = np.zeros(embedding_dim)
        word_count = 0
        for word in doc:
            if word in model.wv:
                doc_vector += model.wv[word]
                word_count += 1
        if word_count > 0:
            doc_vector /= word_count
        word_vectors.append(doc_vector)

    return np.array(word_vectors)

In [45]:
X_word2vec_train = word2vec_vectorization(train_x)
X_word2vec_test = word2vec_vectorization(test_x)
X_word2vec_valid = word2vec_vectorization(valid_x)

### Word2Vec and XGBoost

In [46]:
#Creating an XGBoost classifier
model = xgb.XGBClassifier(random_state = 42)

In [47]:
model.fit(X_word2vec_train, train_y, eval_set=[(X_word2vec_valid, valid_y)], early_stopping_rounds=10, verbose = True)



[0]	validation_0-mlogloss:1.69707
[1]	validation_0-mlogloss:1.56428
[2]	validation_0-mlogloss:1.44636
[3]	validation_0-mlogloss:1.38549
[4]	validation_0-mlogloss:1.29303
[5]	validation_0-mlogloss:1.28203
[6]	validation_0-mlogloss:1.23458
[7]	validation_0-mlogloss:1.17799
[8]	validation_0-mlogloss:1.15811
[9]	validation_0-mlogloss:1.14904
[10]	validation_0-mlogloss:1.13416
[11]	validation_0-mlogloss:1.12306
[12]	validation_0-mlogloss:1.10905
[13]	validation_0-mlogloss:1.09499
[14]	validation_0-mlogloss:1.07179
[15]	validation_0-mlogloss:1.05173
[16]	validation_0-mlogloss:1.05339
[17]	validation_0-mlogloss:1.04996
[18]	validation_0-mlogloss:1.04985
[19]	validation_0-mlogloss:1.05011
[20]	validation_0-mlogloss:1.05637
[21]	validation_0-mlogloss:1.06076
[22]	validation_0-mlogloss:1.07588
[23]	validation_0-mlogloss:1.07990
[24]	validation_0-mlogloss:1.07953
[25]	validation_0-mlogloss:1.07865
[26]	validation_0-mlogloss:1.08612
[27]	validation_0-mlogloss:1.08383


In [48]:
predictions = model.predict(X_word2vec_test)

In [49]:
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.3144513826940232

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.99      0.69       699
           1       0.00      0.00      0.00       144
           2       0.30      0.01      0.02      1087
           3       0.10      0.01      0.02       112
           4       0.00      0.00      0.00       112
           5       0.00      0.03      0.01        88

    accuracy                           0.31      2242
   macro avg       0.16      0.17      0.12      2242
weighted avg       0.32      0.31      0.22      2242



### Word2Vec and Logistic Regression

In [50]:
lgr_clf = LogisticRegression(multi_class='multinomial', verbose=2, random_state=0)

In [51]:
lgr_clf.fit(X_word2vec_train, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
lgr_predictions = lgr_clf.predict(X_word2vec_test)

In [53]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.05798394290811775

Classification Report:
              precision    recall  f1-score   support

           0       0.01      0.02      0.01       699
           1       0.00      0.00      0.00       144
           2       0.09      0.11      0.10      1087
           3       0.00      0.00      0.00       112
           4       0.00      0.00      0.00       112
           5       0.00      0.00      0.00        88

    accuracy                           0.06      2242
   macro avg       0.02      0.02      0.02      2242
weighted avg       0.05      0.06      0.05      2242



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Word2Vec and Easy Ensemble

In [54]:
ensemble_clf = EasyEnsembleClassifier(random_state=42)

In [55]:
ensemble_clf.fit(X_word2vec_train, train_y)

In [56]:
ensemble_predictions = ensemble_clf.predict(X_word2vec_test)

In [57]:
accuracy = accuracy_score(test_y, ensemble_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, ensemble_predictions))

Accuracy: 0.6971454058876003

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.69      0.77       699
           1       0.04      0.01      0.01       144
           2       0.70      0.96      0.81      1087
           3       0.00      0.00      0.00       112
           4       0.21      0.28      0.24       112
           5       0.15      0.06      0.08        88

    accuracy                           0.70      2242
   macro avg       0.33      0.33      0.32      2242
weighted avg       0.64      0.70      0.65      2242



### Doc2Vec

In [58]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Word2Vec, KeyedVectors, Doc2Vec

In [59]:
# Doc2Vec Vectorization
def doc2vec_vectorization(data, vector_size=100):

    model = Doc2Vec(data, vector_size=vector_size, window=4, min_count=2, workers=4, epochs=40)

    doc_vectors = [model.dv[i] for i in range(len(data))]

    return np.array(doc_vectors)

In [60]:
documents_train = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(train_x)]
documents_valid = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(valid_x)]
documents_test = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(test_x)]

In [61]:
X_doc2vec_train = doc2vec_vectorization(documents_train)
X_doc2vec_valid = doc2vec_vectorization(documents_valid)
X_doc2vec_test = doc2vec_vectorization(documents_test)

### Doc2Vec and XGBoost

In [62]:
#Creating an XGBoost classifier
model = xgb.XGBClassifier(random_state = 42)

In [63]:
model.fit(X_doc2vec_train, train_y, eval_set=[(X_doc2vec_valid, valid_y)], early_stopping_rounds=10, verbose = True)



[0]	validation_0-mlogloss:1.48795
[1]	validation_0-mlogloss:1.36736
[2]	validation_0-mlogloss:1.28808
[3]	validation_0-mlogloss:1.23994
[4]	validation_0-mlogloss:1.22341
[5]	validation_0-mlogloss:1.21209
[6]	validation_0-mlogloss:1.21221
[7]	validation_0-mlogloss:1.22340
[8]	validation_0-mlogloss:1.21227
[9]	validation_0-mlogloss:1.22381
[10]	validation_0-mlogloss:1.24326
[11]	validation_0-mlogloss:1.25940
[12]	validation_0-mlogloss:1.27322
[13]	validation_0-mlogloss:1.29495
[14]	validation_0-mlogloss:1.30591
[15]	validation_0-mlogloss:1.33175


In [64]:
predictions = model.predict(X_doc2vec_test)

In [65]:
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.5927743086529884

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.40      0.50       699
           1       0.18      0.06      0.09       144
           2       0.61      0.96      0.74      1087
           3       0.00      0.00      0.00       112
           4       0.09      0.04      0.05       112
           5       0.00      0.00      0.00        88

    accuracy                           0.59      2242
   macro avg       0.26      0.24      0.23      2242
weighted avg       0.52      0.59      0.52      2242



### Doc2Vec and Logistic Regression

In [66]:
lgr_clf = LogisticRegression(multi_class='multinomial', verbose=2, random_state=0)

In [67]:
lgr_clf.fit(X_doc2vec_train, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
lgr_predictions = lgr_clf.predict(X_doc2vec_test)

In [69]:
accuracy = accuracy_score(test_y, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, lgr_predictions))

Accuracy: 0.567350579839429

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.35      0.47       699
           1       0.15      0.02      0.04       144
           2       0.57      0.94      0.71      1087
           3       0.00      0.00      0.00       112
           4       0.00      0.00      0.00       112
           5       0.00      0.00      0.00        88

    accuracy                           0.57      2242
   macro avg       0.24      0.22      0.20      2242
weighted avg       0.51      0.57      0.49      2242



### Doc2Vec and Easy Ensemble

In [70]:
ensemble_clf = EasyEnsembleClassifier(random_state=42)

In [71]:
ensemble_clf.fit(X_doc2vec_train, train_y)

In [72]:
ensemble_predictions = ensemble_clf.predict(X_doc2vec_test)

In [73]:
accuracy = accuracy_score(test_y, ensemble_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, ensemble_predictions))

Accuracy: 0.56021409455843

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.32      0.41       699
           1       0.19      0.12      0.14       144
           2       0.62      0.92      0.74      1087
           3       0.10      0.07      0.08       112
           4       0.00      0.00      0.00       112
           5       0.12      0.01      0.02        88

    accuracy                           0.56      2242
   macro avg       0.26      0.24      0.23      2242
weighted avg       0.49      0.56      0.50      2242

