# Improving Accuracy And Evaluating Model Performance

I decided to choose the **Logistic Regression model** as my final model, because it has the shown the highest accuracy among the models tested with both CountVectorizer and TfidfVectorizer.

I wanted to increase the accuracy of the Logistic Regression model, further by incrementally increasing the number that max_feature parameter is set by 5000, starting from the max_features being set to 5000.

In [2]:
from lime import lime_text
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from datasets import load_dataset
from sklearn.ensemble import VotingClassifier
import pickle
import pandas as pd

In [3]:
train_df

Unnamed: 0,0,1,2
0,Class Index,Title,Description
1,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
2,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
3,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
4,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
...,...,...,...
119996,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...
119997,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...
119998,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...
119999,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


In [3]:
import pandas as pd

# Load training CSV
test_df2  = pd.read_csv("/kaggle/input/ag-news-topic-classification/test_data.csv", header=None)

# Load test CSV
train_df2 = pd.read_csv("/kaggle/input/ag-news-topic-classification/training_data.csv", header=None)

# Show dataset info
print("Training data shape:", train_df2.shape)
print("Testing data shape:", test_df2.shape)

# Preview first 5 rows
print(train_df2.head())

Training data shape: (120001, 2)
Testing data shape: (20, 2)
                                                   0      1
0                                               text  label
1  Wall St. Bears Claw Back Into the Black (Reute...      2
2  Carlyle Looks Toward Commercial Aerospace (Reu...      2
3  Oil and Economy Cloud Stocks' Outlook (Reuters...      2
4  Iraq Halts Oil Exports from Main Southern Pipe...      2


In [5]:
train_df2

Unnamed: 0,0,1
0,text,label
1,Wall St. Bears Claw Back Into the Black (Reute...,2
2,Carlyle Looks Toward Commercial Aerospace (Reu...,2
3,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
4,Iraq Halts Oil Exports from Main Southern Pipe...,2
...,...,...
119996,Pakistan's Musharraf Says Won't Quit as Army C...,0
119997,Renteria signing a top-shelf deal Red Sox gene...,1
119998,Saban not going to Dolphins yet The Miami Dolp...,1
119999,Today's NFL games PITTSBURGH at NY GIANTS Time...,1


In [7]:
# Remove the first row (which is the old header)
train_df2 = train_df2.drop(0).reset_index(drop=True)

# Rename columns
train_df2.columns = ["text", "label"]

# Optional: convert label to integer (just in case it's string)
train_df2["label"] = train_df2["label"].astype(int)

test_df2 = test_df2.drop(0).reset_index(drop=True)
# Rename columns
test_df2.columns = ["text", "label"]

# Optional: convert label to integer (just in case it's string)
test_df2["label"] = test_df2["label"].astype(int)


# Preview the cleaned dataset
train_df2

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2
...,...,...
119995,Pakistan's Musharraf Says Won't Quit as Army C...,0
119996,Renteria signing a top-shelf deal Red Sox gene...,1
119997,Saban not going to Dolphins yet The Miami Dolp...,1
119998,Today's NFL games PITTSBURGH at NY GIANTS Time...,1


In [8]:
test_df2

Unnamed: 0,text,label
0,European stocks rally on hope ECB may be done ...,2
1,"Gold prices have hit a new high, while the val...",2
2,Why food discount stickers may be a thing of t...,2
3,German cabinet tries to solve ‘no-debt’ crisis...,2
4,Indonesia's volcano eruption leaves 22 dead,0
5,"Adverts for Air France, Lufthansa and Etihad h...",0
6,Maori MP performs haka before swearing oath to...,0
7,Philippines earthquake: Video shows chaos in s...,0
8,Wakeskater plunges off Dubai skyscraper pool i...,0
9,Daniel Wiffen secures European Aquatic Champio...,1


In [12]:
train_df2.columns
for i in train_df2:
    print(i)

text
label


In [23]:
value = train_df2.loc[0, "text"]
value

'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'

In [9]:
from tqdm import tqdm

train_data = []
train_data_labels = []

# tqdm will show progress as you iterate
for index in tqdm(range(train_df2.shape[0]), desc="Processing rows"):
    train_data.append(train_df2.loc[index, "text"])
    train_data_labels.append(train_df2.loc[index, "label"])

    

Processing rows: 100%|██████████| 120000/120000 [00:01<00:00, 68822.24it/s]


In [13]:
len(train_data)

120000

In [27]:
train_data_labels[]

2

In [14]:
train_df2['label'].unique()

array([2, 3, 1, 0])

In [21]:
# Prepare testing data
testing_data = []
testing_data_labels = []

for index in tqdm(range(test_df2.shape[0]), desc="Processing rows"):
    testing_data.append(test_df2.loc[index, "text"])
    testing_data_labels.append(test_df2.loc[index, "label"])

# Vectorize testing data
test_features = vectorizer.transform(testing_data).toarray()

Processing rows: 100%|██████████| 19/19 [00:00<00:00, 26370.54it/s]


In [23]:
test_features.shape

(19, 5000)

### Max_features is set to 5000

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word',max_features=5000,lowercase=True,binary=True,ngram_range=(1,2),stop_words='english') 
features = vectorizer.fit_transform(train_data).toarray()

In [19]:
features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
features.shape

(120000, 5000)

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(features,train_data_labels,train_size=0.75,random_state=123)

In [10]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
model_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
model_lr = model_lr.fit(X=X_train,y=y_train)
y_pred_lr = model_lr.predict(X_val)
print("Logistic Regression", accuracy_score(y_val,y_pred_lr))

Logistic Regression 0.9028333333333334


In [30]:
test_features[0].shape

(5000,)

In [35]:
# Predict the first test sample
prediction = model_lr.predict(test_features[3].reshape(1, -1))
print("Predicted label:", prediction)


Predicted label: [2]


In [36]:
test_features[3]

array([0., 0., 0., ..., 0., 0., 0.])

In [37]:
testing_data_labels[3]

2

In [38]:
# Label mapping
label_mapping = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

# Example: User enters a new topic
user_input = ["NASA launches new space telescope"]

# Convert input using the SAME training vectorizer
user_features = vectorizer.transform(user_input).toarray()

# Predict label
prediction = model_lr.predict(user_features)

print("Predicted label:", label_mapping[prediction[0]])


Predicted label: Sci/Tech


In [39]:
label_mapping = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

# Example topic titles to classify
examples = [
    "Global leaders meet to discuss climate crisis",      # World
    "Lionel Messi joins new football club in Miami",      # Sports
    "Apple announces record profits in Q2 earnings",      # Business
    "Researchers develop breakthrough in quantum computing"  # Sci/Tech
]

for text in examples:
    # Convert with same vectorizer used in training
    features = vectorizer.transform([text]).toarray()
    
    # Predict
    prediction = model_lr.predict(features)
    
    # Show result
    print(f"Title: {text}")
    print(f"Predicted Label: {label_mapping[prediction[0]]}")
    print("-" * 50)


Title: Global leaders meet to discuss climate crisis
Predicted Label: World
--------------------------------------------------
Title: Lionel Messi joins new football club in Miami
Predicted Label: Sports
--------------------------------------------------
Title: Apple announces record profits in Q2 earnings
Predicted Label: Sci/Tech
--------------------------------------------------
Title: Researchers develop breakthrough in quantum computing
Predicted Label: Sci/Tech
--------------------------------------------------


In [46]:
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [41]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [42]:
# Save model
joblib.dump(model_lr, "logistic_regression_model2.pkl")


['logistic_regression_model2.pkl']

In [43]:
import sys
import tensorflow as tf
import sklearn

print("Python version:", sys.version)
print("TensorFlow version:", tf.__version__)
print("Scikit-learn version:", sklearn.__version__)


Python version: 3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]
TensorFlow version: 2.13.0
Scikit-learn version: 1.2.2


# Evaluating Model Accuracy

### Confusion Matrix, True positive, True negative values, Precision and Recall

In [44]:
cm = confusion_matrix(y_val,y_pred_lr)
print(cm)

[[6668  233  329  241]
 [  98 7294   47   55]
 [ 259   70 6516  671]
 [ 261   88  563 6607]]


In [45]:
#World
tp_1 = cm[0, 0]
fn_1 = cm[1,0] + cm[2,0] + cm[3,0]
fp_1 = cm[0,1] + cm[0,2] + cm[0,3]
tn_1 = cm[1,1] + cm[1,2] + cm[1,3] + cm[2,1] + cm[2,2] + cm[2,3] + cm[3,1] + cm[3,2] + cm[3,3]
print("True positive:", tp_1)
print("False negative:", fn_1)
print("False positive:", fp_1)
print("True negative:", tn_1)

True positive: 6668
False negative: 618
False positive: 803
True negative: 21911
