In [29]:
import numpy as np
import pandas as pd

In [30]:
df = pd.read_csv('train.csv',encoding = 'latin1')

In [31]:
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [32]:
df.isnull().sum()

Unnamed: 0,0
Class Index,0
Title,0
Description,0


In [33]:
df.duplicated().sum()

np.int64(0)

In [34]:
df['Class Index'].value_counts()

Unnamed: 0_level_0,count
Class Index,Unnamed: 1_level_1
1,20839
4,20532
2,20272
3,20210


In [35]:
df.loc[0]['Title']

'Wall St. Bears Claw Back Into the Black (Reuters)'

In [36]:
df.loc[0]['Description']

"Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [37]:
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [38]:
df['text'] = df['Description'] + " " + df['Title']

In [39]:
df.head()

Unnamed: 0,Class Index,Title,Description,text
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","AFP - Tearaway world oil prices, toppling reco..."


In [40]:
df.drop(columns=['Title','Description'],inplace=True)

In [41]:
df.rename(columns={"Class Index":'label'},inplace=True)

In [42]:
df['label'] = df['label'] -1

In [43]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,20839
3,20532
1,20272
2,20210


In [44]:
df.head()

Unnamed: 0,label,text
0,2,"Reuters - Short-sellers, Wall Street's dwindli..."
1,2,Reuters - Private investment firm Carlyle Grou...
2,2,Reuters - Soaring crude prices plus worries\ab...
3,2,Reuters - Authorities have halted oil export\f...
4,2,"AFP - Tearaway world oil prices, toppling reco..."


In [45]:
# Preprocessing

import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = stopwords.words('english')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text
df['text'] = df['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
df.head()

Unnamed: 0,label,text
0,2,reuters shortsellers wall streets dwindlingban...
1,2,reuters private investment firm carlyle groupw...
2,2,reuters soaring crude prices plus worriesabout...
3,2,reuters authorities halted oil exportflows mai...
4,2,afp tearaway world oil prices toppling records...


In [47]:
X = df['text']
y = df['label']

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_vectorizer = TfidfVectorizer()
X_train_tfidf = tf_vectorizer.fit_transform(X_train)
X_test_tfidf = tf_vectorizer.transform(X_test)

#### **Logistic Regression**

In [50]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=200)

log_model.fit(X_train_tfidf,y_train)

In [51]:
y_pred_logistic = log_model.predict(X_test_tfidf)

In [52]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

print('accuracy_score: ',accuracy_score(y_test,y_pred_logistic))
print('confusion_matrix: ',confusion_matrix(y_test,y_pred_logistic))
print('classification_report: ',classification_report(y_test,y_pred_logistic))

accuracy_score:  0.914910512491601
confusion_matrix:  [[3754  130  171  131]
 [  51 3998   18   15]
 [ 130   42 3512  308]
 [ 113   38  246 3714]]
classification_report:                precision    recall  f1-score   support

           0       0.93      0.90      0.91      4186
           1       0.95      0.98      0.96      4082
           2       0.89      0.88      0.88      3992
           3       0.89      0.90      0.90      4111

    accuracy                           0.91     16371
   macro avg       0.91      0.91      0.91     16371
weighted avg       0.91      0.91      0.91     16371



---

#### **Logistic Regression CV**

In [53]:
from sklearn.model_selection import cross_val_score

cv = cross_val_score(log_model, X_train_tfidf, y_train, scoring='accuracy', cv=5, n_jobs=-1)


In [54]:
print('Cross Validation' ,cv)
print('Cross Validation' ,cv.mean())

Cross Validation [0.90852867 0.90707796 0.91310324 0.91470678 0.91004887]
Cross Validation 0.9106931031364163


---

#### **Hyperparameter Tuning**

In [55]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}


grid_search = GridSearchCV(estimator=log_model, param_grid=param_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

best_log_model = grid_search.best_estimator_

test_accuracy = best_log_model.score(X_test_tfidf, y_test)
print(f"Test set accuracy: {test_accuracy:.4f}")


Best parameters: {'C': 10, 'solver': 'liblinear'}
Best CV accuracy: 0.9110138012552269
Test set accuracy: 0.9146


In [56]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_log_model.predict(X_test_tfidf)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.9146050943741982
[[3758  127  173  128]
 [  58 3988   18   18]
 [ 136   40 3507  309]
 [ 116   27  248 3720]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      4186
           1       0.95      0.98      0.97      4082
           2       0.89      0.88      0.88      3992
           3       0.89      0.90      0.90      4111

    accuracy                           0.91     16371
   macro avg       0.91      0.91      0.91     16371
weighted avg       0.91      0.91      0.91     16371



---

#### **Overfitting**

In [57]:
train_accuracy = best_log_model.score(X_train_tfidf, y_train)
test_accuracy = best_log_model.score(X_test_tfidf, y_test)


print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Difference: {train_accuracy - test_accuracy:.4f}")


Training Accuracy: 0.9736
Test Accuracy: 0.9146
Difference: 0.0590


---

In [58]:
from sklearn.naive_bayes import MultinomialNB

mnb_model = MultinomialNB()

mnb_model.fit(X_train_tfidf,y_train)

In [59]:
y_pred_mnb = mnb_model.predict(X_test_tfidf)

In [60]:
print('accuracy_score: ',accuracy_score(y_test,y_pred_mnb))
print('confusion_matrix: ',confusion_matrix(y_test,y_pred_mnb))
print('classification_report: ',classification_report(y_test,y_pred_mnb))

accuracy_score:  0.9077637285443773
confusion_matrix:  [[3745  148  196   97]
 [  41 4001   21   19]
 [ 135   41 3519  297]
 [ 152   38  325 3596]]
classification_report:                precision    recall  f1-score   support

           0       0.92      0.89      0.91      4186
           1       0.95      0.98      0.96      4082
           2       0.87      0.88      0.87      3992
           3       0.90      0.87      0.89      4111

    accuracy                           0.91     16371
   macro avg       0.91      0.91      0.91     16371
weighted avg       0.91      0.91      0.91     16371



---

#### **Cross validation MNB**

In [61]:
cv = cross_val_score(mnb_model, X_train_tfidf, y_train, scoring='accuracy', cv=5, n_jobs=-1)
print("CV scores:", cv)
print("Mean CV accuracy:", cv.mean())


CV scores: [0.90272582 0.89921356 0.90394013 0.9053146  0.90416921]
Mean CV accuracy: 0.903072664334523


#### **Hyperparameter Tuning on MNB**

In [62]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1, 5, 10]
}

grid_search_mnb = GridSearchCV(estimator=mnb_model, param_grid=param_grid,
                               cv=5, scoring='accuracy', n_jobs=-1)

grid_search_mnb.fit(X_train_tfidf, y_train)

print("Best parameters:", grid_search_mnb.best_params_)
print("Best CV accuracy:", grid_search_mnb.best_score_)

best_mnb_model = grid_search_mnb.best_estimator_

test_accuracy = best_mnb_model.score(X_test_tfidf, y_test)
print(f"Test set accuracy: {test_accuracy:.4f}")




y_pred_mnb = best_mnb_model.predict(X_test_tfidf)

print(accuracy_score(y_test, y_pred_mnb))
print(confusion_matrix(y_test, y_pred_mnb))
print(classification_report(y_test, y_pred_mnb))


Best parameters: {'alpha': 0.1}
Best CV accuracy: 0.9067835940456318
Test set accuracy: 0.9115
0.9114898295766904
[[3730  150  197  109]
 [  40 4001   22   19]
 [ 123   40 3519  310]
 [ 124   27  288 3672]]
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      4186
           1       0.95      0.98      0.96      4082
           2       0.87      0.88      0.88      3992
           3       0.89      0.89      0.89      4111

    accuracy                           0.91     16371
   macro avg       0.91      0.91      0.91     16371
weighted avg       0.91      0.91      0.91     16371



---

In [63]:
from sklearn.svm import LinearSVC

svc_model = LinearSVC(max_iter=10000)
svc_model.fit(X_train_tfidf, y_train)


In [64]:
y_pred_svc = svc_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svc))
print(confusion_matrix(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))

0.9141775090098344
[[3746  130  181  129]
 [  56 3988   18   20]
 [ 137   43 3509  303]
 [ 116   28  244 3723]]
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      4186
           1       0.95      0.98      0.96      4082
           2       0.89      0.88      0.88      3992
           3       0.89      0.91      0.90      4111

    accuracy                           0.91     16371
   macro avg       0.91      0.91      0.91     16371
weighted avg       0.91      0.91      0.91     16371



---

#### **Cross Validation on SVC**

In [65]:
cv3 = cross_val_score(svc_model, X_train_tfidf, y_train, scoring='accuracy', cv=5, n_jobs=-1)
print("CV scores:", cv3)
print("Mean CV accuracy:", cv3.mean())


CV scores: [0.91059021 0.90707796 0.91302688 0.91363775 0.9131796 ]
Mean CV accuracy: 0.9115024791055546


#### **Hyperparameter tuning on SVC**

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

svc_model = LinearSVC(max_iter=10000)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [10000, 20000]
}

grid_search_svc = GridSearchCV(estimator=svc_model, param_grid=param_grid,
                              scoring='accuracy', cv=5, n_jobs=-1)

grid_search_svc.fit(X_train_tfidf, y_train)

print("Best parameters:", grid_search_svc.best_params_)
print("Best CV accuracy:", grid_search_svc.best_score_)

best_svc_model = grid_search_svc.best_estimator_

test_accuracy = best_svc_model.score(X_test_tfidf, y_test)
print(f"Test set accuracy: {test_accuracy:.4f}")


Best parameters: {'C': 1, 'max_iter': 10000}
Best CV accuracy: 0.9115024791055546
Test set accuracy: 0.9142


---

In [None]:
#### **Stacking**

In [67]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators = [
    ('lr', best_log_model),
    ('mnb', best_mnb_model),
    ('svc', best_svc_model)
]

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

stacking_model.fit(X_train_tfidf, y_train)
y_pred_stack = stacking_model.predict(X_test_tfidf)

from sklearn.metrics import accuracy_score, classification_report
print("Stacking Accuracy:", accuracy_score(y_test, y_pred_stack))
print(classification_report(y_test, y_pred_stack))


Stacking Accuracy: 0.9199804532404863
              precision    recall  f1-score   support

           0       0.93      0.90      0.92      4186
           1       0.96      0.98      0.97      4082
           2       0.89      0.89      0.89      3992
           3       0.90      0.91      0.90      4111

    accuracy                           0.92     16371
   macro avg       0.92      0.92      0.92     16371
weighted avg       0.92      0.92      0.92     16371



In [68]:
import pickle

# Save the trained stacking model to a file
with open('stacking_model.pkl', 'wb') as file:
    pickle.dump(stacking_model, file)
