In [55]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import torch
from torch.nn.modules import Sequential
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from tqdm import tqdm
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [56]:
# Readable category name for each integer class id (used for the word-cloud titles).
labels = dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))

In [57]:
def generate_word_cloud(df, label):
    """Show a word cloud built from the `text` of every row of `df` whose `label` matches.

    Args:
        df: DataFrame exposing `label` and `text` columns.
        label: Class id to select; also used as a key into the module-level
            `labels` dict to build the figure title.

    The figure is displayed with `plt.show()`; nothing is returned.
    """
    # Concatenate all texts of the requested class into one space-separated corpus.
    matching_rows = df.label == label
    corpus = " ".join(df[matching_rows]['text'])

    cloud_builder = WordCloud(
        stopwords=set(stopwords.words('english')),  # English stopwords are left out of the cloud
        background_color="white",
        width=800,
        height=400,
        colormap='viridis'
    )
    cloud_image = cloud_builder.generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.title(f'Word Cloud for "{labels[label].title()}"', fontsize=16)
    plt.show()


In [58]:
def one_hot_encode(x, num_classes=None):
    """Convert a 1-D sequence of non-negative integer class ids into a one-hot matrix.

    Args:
        x: 1-D array-like of integer class ids.
        num_classes: Number of columns in the result. When omitted it is inferred
            as ``max(x) + 1`` so that every id present has a valid column, even if
            some smaller ids never occur in `x`.

    Returns:
        A float NumPy array of shape ``(len(x), num_classes)`` with a single 1 per row.

    Raises:
        ValueError: if `x` is not 1-D, contains a negative id, or contains an id
            that does not fit in `num_classes` columns.
    """
    labels = np.asarray(x)
    if labels.ndim != 1:
        raise ValueError(f"expected a 1-D sequence of class ids, got shape {labels.shape}")

    # Empty input: nothing to index, return an empty matrix of the requested width.
    if labels.size == 0:
        return np.zeros((0, num_classes or 0))

    # Negative ids would silently wrap around to the last columns when used as indices.
    if labels.min() < 0:
        raise ValueError("class ids must be non-negative")

    required = int(labels.max()) + 1
    if num_classes is None:
        num_classes = required
    elif num_classes < required:
        raise ValueError(
            f"num_classes={num_classes} is too small for class id {required - 1}"
        )

    onehot = np.zeros((labels.shape[0], num_classes))
    onehot[np.arange(labels.shape[0]), labels] = 1
    return onehot

In [59]:
def embed_with_sentence_transformers(sentences, model_name='all-MiniLM-L6-v2'):
    """Encode `sentences` with a SentenceTransformer model and return the result on the CPU.

    Args:
        sentences: The texts handed to `SentenceTransformer.encode`.
        model_name: Name of the model to instantiate.

    Returns:
        The tensor produced by `encode(..., convert_to_tensor=True)` after `.cpu()`.

    A new `SentenceTransformer` is constructed on every call, and progress
    messages are printed before and after encoding.
    """
    print(f"\n--- Generating embeddings with 'sentence-transformers' library ({model_name}) ---")
    encoder = SentenceTransformer(model_name)
    encoded = encoder.encode(sentences, convert_to_tensor=True)
    print("Embedding generation complete.")
    cpu_copy = encoded.cpu()
    return cpu_copy

In [60]:
def _english_stopword_set():
    """Return the NLTK English stopword list as a frozenset, loading it only on first use."""
    cached = getattr(_english_stopword_set, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopword_set._cached = cached
    return cached


def _word_tokenizer():
    """Return a shared `RegexpTokenizer` for `\\w+`, building it only on first use."""
    cached = getattr(_word_tokenizer, "_cached", None)
    if cached is None:
        # Matches one or more alphanumeric characters or underscores
        cached = RegexpTokenizer(r'\w+')
        _word_tokenizer._cached = cached
    return cached


def remove_stopwords(sentence):
    """Tokenize `sentence` on `\\w+` runs and drop English stopwords.

    Args:
        sentence: The text to clean.

    Returns:
        The remaining tokens joined by single spaces. Punctuation is dropped
        because only `\\w+` matches are kept.

    The membership test is case-sensitive; `preprocessing` lower-cases its
    sentences before calling this function. The stopword set and tokenizer are
    cached because this function runs once per sentence.
    """
    blocked = _english_stopword_set()
    tokens = _word_tokenizer().tokenize(sentence)
    return ' '.join(word for word in tokens if word not in blocked)

In [61]:
def preprocessing(data):
    """Turn the `text` column of `data` into an array of sentence embeddings.

    Steps: lower-case every text, strip stopwords with `remove_stopwords`, embed
    the cleaned texts with `embed_with_sentence_transformers`, and wrap the
    result in `np.array`.

    Args:
        data: Object indexable by `'text'` that yields the sentences to embed.

    Returns:
        `np.array` of the embeddings returned by `embed_with_sentence_transformers`.
    """
    raw_texts = list(data['text'])

    print("Lowering sentences...")
    lowercase_texts = [text.lower() for text in raw_texts]

    print("Cleaning sentences...")
    stripped_texts = [remove_stopwords(text) for text in lowercase_texts]

    # NOTE(review): the message says "Tokenizing" but this step produces embeddings.
    print("Tokenizing sentences...")
    vectors = embed_with_sentence_transformers(stripped_texts)
    # NOTE(review): the embedding helper already prints this same completion message.
    print("Embedding generation complete.")

    return np.array(vectors)

In [62]:
# Load the training CSV, print its column/dtype overview, and list the distinct
# values found in the `label` column.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where that exact Windows directory exists.
data = pd.read_csv("C:\\Users\\Kagero\\PycharmProjects\\elevvo\\AG News\\train.csv")
data.info()
print(set(data['label']))

In [63]:
# Preview the first rows of the training frame.
data.head()

In [64]:
# Load the test CSV and print its column/dtype overview.
# NOTE(review): same absolute, machine-specific directory as the training file.
test_data = pd.read_csv("C:\\Users\\Kagero\\PycharmProjects\\elevvo\\AG News\\test.csv")
test_data.info()

In [65]:
# Preview the first rows of the test frame.
test_data.head()

In [66]:
# Training targets: the `label` column copied into a NumPy array.
Y = np.array(data['label'])

In [67]:
# Training features: reuse the embeddings cached in X.npy when present, otherwise
# compute them once from `data` and write the cache so later runs can skip the
# expensive embedding step.
try:
    X = np.load("X.npy")
except FileNotFoundError:
    X = preprocessing(data)
    np.save("X.npy", X)

In [68]:
# One-off cache build: uncomment and run these two lines to regenerate X.npy from `data`.
#X= preprocessing(data)
#np.save("X",X)

In [69]:
# Features and targets must agree on their first dimension before splitting.
for array in (X, Y):
    print(array.shape)

In [70]:
# Hold out 30% of the training data for validation; the seed is fixed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [71]:
# Shapes of the train/validation features and targets, in that order.
for split in (x_train, y_train, x_val, y_val):
    print(split.shape)

In [72]:
# Embed the test texts with the same preprocessing pipeline. Unlike X, which is
# loaded from X.npy, this is recomputed on every run.
x_test = preprocessing(test_data)

In [73]:
# Test targets, kept as the `label` column of the test frame (Y, by contrast, is a NumPy array).
y_test = test_data['label']

In [74]:
# Test features and targets must agree on their first dimension.
for held_out in (x_test, y_test):
    print(held_out.shape)

In [75]:
# Baseline classifier: L2-penalised logistic regression fitted on the training embeddings.
log_reg = LogisticRegression(
    penalty="l2",
    solver='newton-cg',
    max_iter=500,
    random_state=42,
)
log_reg.fit(x_train, y_train)

In [76]:
# Logistic-regression metrics on the validation split.
# classification_report expects (y_true, y_pred); passing them the other way round
# swaps precision with recall and reports support from the predictions.
print(classification_report(y_val, log_reg.predict(x_val)))

In [77]:
# Logistic-regression metrics on the test split.
# classification_report expects (y_true, y_pred); passing them the other way round
# swaps precision with recall and reports support from the predictions.
print(classification_report(y_test, log_reg.predict(x_test)))

In [78]:
class CustomDataset(Dataset):
    """Dataset pairing feature rows with target rows, both stored as float tensors."""

    def __init__(self, x, y):
        """Copy `x` and `y` into float32 tensors kept on `self.x` and `self.y`."""
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the length of the targets."""
        sample_count = len(self.y)
        return sample_count

    def __getitem__(self, idx):
        """Return the `(features, target)` pair stored at position `idx`."""
        features = self.x[idx]
        target = self.y[idx]
        return features, target

In [79]:
# Fully connected classifier: 384 input features narrowed through four hidden layers
# (256 -> 128 -> 64 -> 32, each followed by ReLU) to 4 outputs, normalised with a
# softmax over dim 1.
model = Sequential(
    nn.Linear(in_features=384, out_features=256),
    nn.ReLU(),
    nn.Linear(in_features=256, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=32),
    nn.ReLU(),
    nn.Linear(in_features=32, out_features=4),
    nn.Softmax(dim=1),
)

In [80]:
# One-hot encode the training targets for the network's loss.
# The cell overwrites `y_train`, so only encode while it is still a 1-D vector of
# class ids; this keeps the cell safe to re-run.
if np.ndim(y_train) == 1:
    y_train = one_hot_encode(y_train)

In [81]:
# Shuffled mini-batches of 256 training samples.
train_dataset = CustomDataset(x_train, y_train)
data_loader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
)

In [82]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [83]:
# Training setup: number of passes over the data, the loss, and the optimiser.
epochs = 20
criterion = nn.MSELoss()  # compared against the one-hot targets
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-5,
)

In [84]:
# Move the network's parameters to the selected device.
model.to(device)

In [85]:
# Layer-by-layer summary of the network.
# torchsummary's input_size is the shape of ONE sample (the batch dimension is added
# by the library), so the 384-feature input is given as (384,). Passing (64, 384)
# would push a 3-D tensor through the model and apply Softmax(dim=1) over the wrong axis.
summary(model, (384,), device=device)

In [86]:
# Mini-batch training: one tqdm bar per epoch, showing the loss of the latest batch.
for epoch_idx in range(epochs):
    batches = tqdm(
        data_loader,
        desc=f'Epoch {epoch_idx + 1}/{epochs}',
        leave=False,
        ncols=100,
    )

    for features, targets in batches:
        features, targets = features.to(device), targets.to(device)

        # Forward pass and loss against the one-hot targets.
        predictions = model(features)
        batch_loss = criterion(predictions, targets)

        # Clear old gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batches.set_postfix({'Loss': f'{batch_loss.item():.4f}'})
        

In [87]:
# Switch the network to evaluation mode before running inference.
model.eval()

In [88]:
# Network metrics on the validation split.
# Inference needs no gradients; without no_grad the forward pass over the whole
# split would keep the autograd graph alive for nothing.
with torch.no_grad():
    val_predictions = model(torch.tensor(x_val, dtype=torch.float).to(device))

# The predicted class is the index of the largest output per row.
# classification_report expects (y_true, y_pred) in that order.
print(classification_report(y_val, val_predictions.argmax(dim=1).cpu().numpy()))

In [89]:
# Network metrics on the test split.
# Inference needs no gradients; without no_grad the forward pass over the whole
# split would keep the autograd graph alive for nothing.
with torch.no_grad():
    test_predictions = model(torch.tensor(x_test, dtype=torch.float).to(device))

# The predicted class is the index of the largest output per row.
# classification_report expects (y_true, y_pred) in that order.
print(classification_report(y_test, test_predictions.argmax(dim=1).cpu().numpy()))

In [90]:
# One word cloud per class present in the training data.
# Sorting the distinct ids makes the figures appear in a fixed, ascending order
# instead of relying on set iteration order.
for label_id in sorted(set(data['label'])):
    generate_word_cloud(data, label_id)