In [102]:
import pandas as pd
import numpy as np
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [103]:
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [104]:
train_df = pd.read_csv(r'D:\AG_News\ML-Lab-AGNews\train\train.csv',delimiter=',')
test_df  = pd.read_csv(r'D:\AG_News\ML-Lab-AGNews\test\test.csv',delimiter=',')

train_df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [105]:
train_df_balanced = (
    train_df
    .groupby('Class Index', group_keys=False)
    .apply(lambda x: x.sample(n=7500, random_state=42))
    .reset_index(drop=True)
)

test_df_balanced = (
    test_df
    .groupby('Class Index', group_keys=False)
    .apply(lambda x: x.sample(n=1625, random_state=42))
    .reset_index(drop=True)
)

  .apply(lambda x: x.sample(n=7500, random_state=42))
  .apply(lambda x: x.sample(n=1625, random_state=42))


In [106]:
train_df_balanced['Class Index'] = train_df_balanced['Class Index'] - 1
test_df_balanced['Class Index'] = test_df_balanced['Class Index'] - 1

In [107]:
train_df_balanced['text'] = train_df_balanced['Title'] + ' ' + train_df_balanced['Description']
test_df_balanced['text']  = test_df_balanced['Title']  + ' ' + test_df_balanced['Description']

train_df_balanced = train_df_balanced[["Class Index","text"]]
test_df_balanced = test_df_balanced[["Class Index","text"]]

rows, cols = train_df_balanced.shape
print(f"Training set:\n")
print(f"Number of rows: {rows}")
print(f"Number of columns: {cols}")
print("\nClass distribution in target column:")
print(train_df_balanced['Class Index'].value_counts())

rows2, cols2 = test_df_balanced.shape
print(f"Test set:\n")
print(f"Number of rows: {rows2}")
print(f"Number of columns: {cols2}")
print("\nClass distribution in target column:")
print(test_df_balanced['Class Index'].value_counts())


Training set:

Number of rows: 30000
Number of columns: 2

Class distribution in target column:
Class Index
0    7500
1    7500
2    7500
3    7500
Name: count, dtype: int64
Test set:

Number of rows: 6500
Number of columns: 2

Class distribution in target column:
Class Index
0    1625
1    1625
2    1625
3    1625
Name: count, dtype: int64


In [108]:
def clean_text(text: str) -> str:
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove punctuation (you can refine if you want to keep some symbols)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in STOPWORDS]
    # Join back to a single string
    cleaned = " ".join(tokens)
    return cleaned

In [109]:
tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        preprocessor=clean_text,
        max_features=25000,    # Reduced slightly but still substantial
        ngram_range=(1, 2),    # Keep bigrams for better accuracy
        min_df=3,              
        max_df=0.9,            
        use_idf=True,
        sublinear_tf=True
    ))
])

In [110]:
X_train = train_df_balanced['text']
y_train = train_df_balanced['Class Index']

X_test = test_df_balanced['text']
y_test = test_df_balanced['Class Index']


print("Unique class labels:", np.unique(y_train))
print("Min label:", np.min(y_train))
print("Max label:", np.max(y_train))


Unique class labels: [0 1 2 3]
Min label: 0
Max label: 3


In [111]:
tfidf_pipeline.fit(X_train)

In [112]:
X_train_tfidf = tfidf_pipeline.transform(X_train)
X_test_tfidf  = tfidf_pipeline.transform(X_test)

In [113]:
print("Training shape:", X_train_tfidf.shape)
print("Test shape:", X_test_tfidf.shape)

Training shape: (30000, 25000)
Test shape: (6500, 25000)


In [114]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset


In [115]:
X_train_tensor = torch.tensor(X_train_tfidf.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [116]:
class TextCNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(TextCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=100, kernel_size=5)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        
        # Calculate the size of the flattened features after conv and pooling
        # For a 1D CNN, we need to calculate how many features will be output
        # after convolution and pooling operations
        
        # After convolution: (input_dim - kernel_size + 1)
        # After pooling: ((input_dim - kernel_size + 1) // kernel_size_of_pool)
        flattened_features = ((input_dim - 5 + 1) // 2) * 100
        
        self.fc = nn.Linear(flattened_features, num_classes)

    def forward(self, x):
        # Change input shape from [batch_size, features] to [batch_size, channels, features]
        # For 1D convolution in PyTorch, input should be [batch_size, channels, sequence_length]
        x = x.unsqueeze(1)  # Add channel dimension for Conv1D
        
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        
        # Flatten all dimensions except the batch dimension
        x = x.view(x.size(0), -1)
        
        x = self.fc(x)
        return x

# Initialize Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextCNN(input_dim=X_train_tfidf.shape[1], num_classes=len(y_train.unique())).to(device)



In [117]:
# Define optimizer and criterion (which were missing in your code)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [118]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256'

# Set GPU memory limit
if torch.cuda.is_available():
    # Limit to 3.5GB
    torch.cuda.set_per_process_memory_fraction(3.5/4.0)

In [None]:
num_epochs = 8
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

torch.save(model.state_dict(), "text_cnn2_model.pth")
print("Model saved successfully!")

Epoch [1/6], Loss: 8438.2728
Epoch [2/6], Loss: 348.8088
Epoch [3/6], Loss: 231.4194
Epoch [4/6], Loss: 135.8773
Epoch [5/6], Loss: 91.3069
Epoch [6/6], Loss: 51.0065
Model saved successfully!


In [121]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch

def evaluate_model_with_confusion_matrix(model, train_loader, test_loader, device, class_names=None):
    model.eval()

    def get_predictions(loader, dataset_name="Dataset"):
        all_predictions = []
        all_true_labels = []
        
        with torch.no_grad():
            for X_batch, y_batch in loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                # Get model outputs
                outputs = model(X_batch)

                # Get predicted class indices
                _, predicted = torch.max(outputs, 1)

                # Append batch predictions and true labels to lists
                all_predictions.extend(predicted.cpu().numpy())
                all_true_labels.extend(y_batch.cpu().numpy())

        # Calculate accuracy
        accuracy = 100 * np.sum(np.array(all_predictions) == np.array(all_true_labels)) / len(all_true_labels)
        print(f"\n{dataset_name} Accuracy: {accuracy:.2f}%")

        # Compute confusion matrix
        cm = confusion_matrix(all_true_labels, all_predictions)

        # Print classification report
        if class_names is not None:
            print(f"\nClassification Report for {dataset_name}:")
            print(classification_report(all_true_labels, all_predictions, target_names=class_names))
        else:
            print(f"\nClassification Report for {dataset_name}:")
            print(classification_report(all_true_labels, all_predictions))

        return all_true_labels, all_predictions, cm

    # Evaluate on Training Set
    print("\n================ Training Set Evaluation ================")
    train_labels, train_preds, train_cm = get_predictions(train_loader, "Training Set")

    # Evaluate on Test Set
    print("\n================ Test Set Evaluation ================")
    test_labels, test_preds, test_cm = get_predictions(test_loader, "Test Set")

    # Plot Confusion Matrix for Test Set
    plt.figure(figsize=(10, 8))
    sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix - Test Set')
    plt.tight_layout()
    plt.savefig('test_confusion_matrix.png')
    plt.close()

    print("Confusion matrix for test set saved as 'test_confusion_matrix.png'")

    return train_labels, train_preds, train_cm, test_labels, test_preds, test_cm

# AG News class names
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

# Run the evaluation
train_labels, train_preds, train_cm, test_labels, test_preds, test_cm = evaluate_model_with_confusion_matrix(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    device=device,
    class_names=class_names
)




Training Set Accuracy: 98.53%

Classification Report for Training Set:
              precision    recall  f1-score   support

       World       1.00      0.97      0.98      7500
      Sports       0.99      1.00      0.99      7500
    Business       0.98      0.98      0.98      7500
    Sci/Tech       0.98      0.99      0.98      7500

    accuracy                           0.99     30000
   macro avg       0.99      0.99      0.99     30000
weighted avg       0.99      0.99      0.99     30000



Test Set Accuracy: 89.15%

Classification Report for Test Set:
              precision    recall  f1-score   support

       World       0.92      0.87      0.90      1625
      Sports       0.94      0.97      0.95      1625
    Business       0.84      0.87      0.86      1625
    Sci/Tech       0.86      0.86      0.86      1625

    accuracy                           0.89      6500
   macro avg       0.89      0.89      0.89      6500
weighted avg       0.89      0.89      0.89    