In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [2]:
train_df = pd.read_csv("../Task_2/preprocessed_training_data.csv")
test_df = pd.read_csv("../Task_2/preprocessed_testing_data.csv")

In [3]:
test_df["Category"].value_counts()

Category
earn            1087
acq              699
crude            144
interest         112
money-fx         112
trade             88
ship              69
wheat             38
sugar             31
money-supply      30
Name: count, dtype: int64

In [4]:
top6_cat = train_df["Category"].value_counts().head(6).index.tolist()

In [5]:
top6_cat

['earn', 'acq', 'interest', 'crude', 'trade', 'money-fx']

In [6]:
train_df = train_df[train_df['Category'].isin(top6_cat)]
test_df = test_df[test_df['Category'].isin(top6_cat)]

In [7]:
train_x, valid_x, train_y, valid_y = train_test_split(train_df["Text"], train_df['Category'], test_size=0.2)

In [8]:
test_x = test_df['Text']

In [9]:
test_y = test_df['Category']

In [10]:

encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)
valid_y = encoder.fit_transform(valid_y)

In [11]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_df["Text"])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [12]:
model = xgb.XGBClassifier()

In [13]:
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 10),  
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

In [14]:
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=5,  # Number of parameter settings that are sampled
    scoring='accuracy',  # Choose the appropriate scoring metric
    cv=3,  # Number of cross-validation folds
    verbose=10, # Controls the verbosity: the higher, the more messages
)

In [None]:
random_search.fit(xtrain_tfidf, train_y, eval_set=[(xvalid_tfidf, valid_y)], early_stopping_rounds=10)


In [16]:
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Get the best estimator model
best_model = random_search.best_estimator_

Best Parameters: {'colsample_bytree': 0.5062764309468029, 'learning_rate': 0.10878711003509822, 'max_depth': 4, 'n_estimators': 853, 'subsample': 0.7994457831473215}


In [20]:
model = xgb.XGBClassifier(random_state=42, colsample_bytree=0.5062764309468029, learning_rate=0.10878711003509822, max_depth=4, n_estimators=853, subsample=0.7994457831473215)

In [21]:
model.fit(xtrain_tfidf, train_y, eval_set=[(xvalid_tfidf, valid_y)], early_stopping_rounds=10, verbose = True)

[0]	validation_0-mlogloss:1.58675




[1]	validation_0-mlogloss:1.42016
[2]	validation_0-mlogloss:1.28954
[3]	validation_0-mlogloss:1.18294
[4]	validation_0-mlogloss:1.08512
[5]	validation_0-mlogloss:1.00105
[6]	validation_0-mlogloss:0.93083
[7]	validation_0-mlogloss:0.86651
[8]	validation_0-mlogloss:0.81015
[9]	validation_0-mlogloss:0.75825
[10]	validation_0-mlogloss:0.71473
[11]	validation_0-mlogloss:0.67252
[12]	validation_0-mlogloss:0.63593
[13]	validation_0-mlogloss:0.60263
[14]	validation_0-mlogloss:0.57187
[15]	validation_0-mlogloss:0.54400
[16]	validation_0-mlogloss:0.51911
[17]	validation_0-mlogloss:0.49775
[18]	validation_0-mlogloss:0.47497
[19]	validation_0-mlogloss:0.45564
[20]	validation_0-mlogloss:0.43852
[21]	validation_0-mlogloss:0.42022
[22]	validation_0-mlogloss:0.40452
[23]	validation_0-mlogloss:0.39006
[24]	validation_0-mlogloss:0.37638
[25]	validation_0-mlogloss:0.36437
[26]	validation_0-mlogloss:0.35274
[27]	validation_0-mlogloss:0.34302
[28]	validation_0-mlogloss:0.33386
[29]	validation_0-mlogloss:0.

In [22]:
predictions = model.predict(xtest_tfidf)

In [23]:
le_name_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
le_name_mapping

{'acq': 0, 'crude': 1, 'earn': 2, 'interest': 3, 'money-fx': 4, 'trade': 5}

In [24]:
decoded_labels = encoder.inverse_transform(test_y)

In [25]:
accuracy = accuracy_score(test_y, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.951828724353256

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       699
           1       0.92      0.93      0.93       144
           2       0.99      0.98      0.98      1087
           3       0.82      0.77      0.79       112
           4       0.78      0.73      0.76       112
           5       0.94      0.94      0.94        88

    accuracy                           0.95      2242
   macro avg       0.90      0.89      0.89      2242
weighted avg       0.95      0.95      0.95      2242

