In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from gensim.models import Word2Vec, KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tensorflow.keras import regularizers
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.neighbors import KNeighborsClassifier 

tf.random.set_seed(10)




In [2]:
train_x_np = np.load('./training_embeddings.npy', allow_pickle=True)
train_y_np = np.load('./training_labels.npy', allow_pickle=True)

test_x_np = np.load('./testing_embeddings.npy', allow_pickle=True)
test_y_np = np.load('./testing_labels.npy', allow_pickle=True)

In [3]:
test_x_np.shape

(2242, 49152)

In [4]:
len(test_y_np)

2242

In [5]:
train_y_np.shape

(5636,)

In [6]:
train_df = pd.read_csv('../../Task_2/preprocessed_training_data.csv')
test_df = pd.read_csv('../../Task_2/preprocessed_testing_data.csv')

In [7]:
labels = {}

for index, i in enumerate(set(train_y_np)):
    labels[i] = index

labels

{'crude': 0, 'acq': 1, 'money-fx': 2, 'earn': 3, 'trade': 4, 'interest': 5}

In [8]:
def label_encode(label):
    return labels[label]

In [9]:
train_y_np

array(['earn', 'acq', 'earn', ..., 'money-fx', 'money-fx', 'money-fx'],
      dtype=object)

In [10]:
train_y_np = [label_encode(i) for i in train_y_np]
test_y_np = [label_encode(i) for i in test_y_np]

In [11]:
x_train = train_x_np
y_train = train_y_np

x_test = test_x_np
y_test = test_y_np

In [12]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=42)

In [13]:
from tensorflow.keras.utils import to_categorical
enc_y_train = to_categorical(y_train)
enc_y_test = to_categorical(y_test)
enc_y_val = to_categorical(y_val)


In [14]:
decoded_y_train = np.argmax(enc_y_train, axis=1)
decoded_y_test = np.argmax(enc_y_test, axis=1)
decoded_y_val = np.argmax(enc_y_val, axis=1)

In [15]:
decoded_y_train

array([1, 3, 3, ..., 1, 1, 3], dtype=int64)

In [16]:
enc_y_train[0]

array([0., 1., 0., 0., 0., 0.], dtype=float32)

In [17]:
# Build the ANN model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(49152,), kernel_regularizer=regularizers.l2(0.01)))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# model.add(Dense(16, activation='relu'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))

# model.add(Dense(16, activation='relu'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(len(labels), activation='softmax'))  # 10 class for Reuters dataset

callback = EarlyStopping(monitor='val_loss', patience=5)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)


# Compile the model
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model with validation data
history = model.fit(x_train, enc_y_train,
                    epochs=100,
                    batch_size=16,
                    validation_data=(x_val, enc_y_val), callbacks=[callback, reduce_lr])





Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100


In [18]:
# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, enc_y_test)
print(f'Test accuracy: {test_acc:.4f}')

Test accuracy: 0.7748


In [19]:
predictions = model.predict(x_test)



In [20]:
predictions

array([[0.14271954, 0.3208833 , 0.11673217, 0.06585949, 0.2988992 ,
        0.05490634],
       [0.11779096, 0.4353109 , 0.20825936, 0.01944799, 0.13163508,
        0.08755577],
       [0.02110535, 0.12020917, 0.36543238, 0.0171214 , 0.06182318,
        0.41430855],
       ...,
       [0.12892854, 0.0389391 , 0.02580668, 0.5890643 , 0.16818547,
        0.04907589],
       [0.28337353, 0.44071782, 0.02488041, 0.12963276, 0.10956804,
        0.01182736],
       [0.04023114, 0.60073566, 0.03261947, 0.2626322 , 0.01922174,
        0.04455986]], dtype=float32)

In [21]:
predictions = np.argmax(predictions, axis=1)

In [22]:
accuracy = accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.7747546833184656

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.24      0.35       144
           1       0.71      0.81      0.75       699
           2       0.31      0.13      0.19       112
           3       0.88      0.96      0.92      1087
           4       0.51      0.38      0.43        88
           5       0.55      0.45      0.49       112

    accuracy                           0.77      2242
   macro avg       0.60      0.49      0.52      2242
weighted avg       0.75      0.77      0.75      2242



In [23]:
model = xgb.XGBClassifier(random_state = 42)

In [24]:
model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=10, verbose = True)



[0]	validation_0-mlogloss:1.33163
[1]	validation_0-mlogloss:1.11445
[2]	validation_0-mlogloss:0.97370
[3]	validation_0-mlogloss:0.87473
[4]	validation_0-mlogloss:0.80470
[5]	validation_0-mlogloss:0.74945
[6]	validation_0-mlogloss:0.70936
[7]	validation_0-mlogloss:0.68041
[8]	validation_0-mlogloss:0.65596
[9]	validation_0-mlogloss:0.63522
[10]	validation_0-mlogloss:0.62241
[11]	validation_0-mlogloss:0.60945
[12]	validation_0-mlogloss:0.59981
[13]	validation_0-mlogloss:0.59123
[14]	validation_0-mlogloss:0.58725
[15]	validation_0-mlogloss:0.58267
[16]	validation_0-mlogloss:0.57929
[17]	validation_0-mlogloss:0.57637
[18]	validation_0-mlogloss:0.57335
[19]	validation_0-mlogloss:0.57178
[20]	validation_0-mlogloss:0.57216
[21]	validation_0-mlogloss:0.57032
[22]	validation_0-mlogloss:0.56998
[23]	validation_0-mlogloss:0.56916
[24]	validation_0-mlogloss:0.56715
[25]	validation_0-mlogloss:0.56782
[26]	validation_0-mlogloss:0.56856
[27]	validation_0-mlogloss:0.56715
[28]	validation_0-mlogloss:0.5

In [25]:
predictions = model.predict(x_test)

In [26]:
accuracy = accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.8050847457627118

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.33      0.44       144
           1       0.75      0.85      0.80       699
           2       0.50      0.27      0.35       112
           3       0.88      0.96      0.92      1087
           4       0.62      0.39      0.48        88
           5       0.71      0.49      0.58       112

    accuracy                           0.81      2242
   macro avg       0.69      0.55      0.59      2242
weighted avg       0.79      0.81      0.79      2242



In [27]:
lgr_clf = LogisticRegression(multi_class='multinomial', verbose=2, random_state=0)

In [28]:
lgr_clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
lgr_predictions = lgr_clf.predict(x_test)

In [30]:
accuracy = accuracy_score(y_test, lgr_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, lgr_predictions))

Accuracy: 0.7261373773416593

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       144
           1       0.63      0.83      0.72       699
           2       0.40      0.04      0.07       112
           3       0.80      0.94      0.87      1087
           4       0.42      0.12      0.19        88
           5       0.62      0.14      0.23       112

    accuracy                           0.73      2242
   macro avg       0.48      0.34      0.35      2242
weighted avg       0.65      0.73      0.67      2242



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
knn = KNeighborsClassifier(n_neighbors=7) 

In [32]:
knn.fit(x_train, y_train) 

In [33]:
knn_predictions = knn.predict(x_test)

In [34]:
accuracy = accuracy_score(y_test, knn_predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, knn_predictions))

Accuracy: 0.7368421052631579

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.40      0.45       144
           1       0.74      0.65      0.69       699
           2       0.33      0.27      0.30       112
           3       0.82      0.95      0.88      1087
           4       0.39      0.48      0.43        88
           5       0.56      0.29      0.38       112

    accuracy                           0.74      2242
   macro avg       0.56      0.51      0.52      2242
weighted avg       0.72      0.74      0.72      2242

