In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import re
import torch as t 
from torch.utils.data import TensorDataset , DataLoader 
import numpy as np
from pprint import pprint
from sklearn.metrics import classification_report , confusion_matrix


class Pipeline:
    """Bag-of-n-grams text classifier: clean CSV text, vectorize, train an MLP.

    The training CSV must provide a ``text`` column and an integer ``label``
    column.  After :meth:`load_clean_csv` runs, ``self.data`` holds the
    cleaned DataFrame (``label`` and ``preprocessed_text`` columns) instead
    of the original path.
    """

    def __init__(self, training_data, testing_data):
        """Store the data sources and build the text-cleaning helpers.

        Args:
            training_data: Path of the training CSV (a raw DataFrame with
                ``text`` and ``label`` columns is also accepted).
            testing_data: Path of the test CSV (or a DataFrame with a
                ``text`` column); only read by :meth:`test_vectorize`.
        """
        self.stop_words = stopwords.words("english")
        self.data = training_data
        self.testing_data = testing_data
        self.model = None
        # Fitted by train_vectorize(); None means "not trained yet".
        self.cv = None
        self.lemmatizer = WordNetLemmatizer()

    def _preprocess(self, texts):
        """Lower-case, whitespace-split, drop stop words and lemmatize each text."""
        # A set gives O(1) membership tests; it is rebuilt per call so that
        # later changes to self.stop_words are still honoured.
        stop_words = set(self.stop_words)
        lemmatize = self.lemmatizer.lemmatize
        cleaned = []
        for text in texts:
            words = text.lower().split()
            cleaned.append(
                " ".join(lemmatize(word) for word in words if word not in stop_words)
            )
        return cleaned

    def load_clean_csv(self, samples_per_class=1300, excluded_labels=(4, 5),
                       random_state=42):
        """Load the training data, rebalance the classes and clean the text.

        Rows whose label is in ``excluded_labels`` are dropped, then every
        remaining class is resampled *with replacement* to exactly
        ``samples_per_class`` rows.  The ``text`` column is replaced by
        ``preprocessed_text`` and the result is stored in ``self.data``.

        Calling this again on already-cleaned data is a no-op, so
        :meth:`train` can safely be invoked more than once.

        Args:
            samples_per_class: Number of rows drawn for each label.
            excluded_labels: Label values removed before sampling.
            random_state: Seed passed to the per-class sampling.

        Raises:
            ValueError: If no training data source was supplied.
        """
        if isinstance(self.data, pd.DataFrame):
            if "preprocessed_text" in self.data.columns:
                return  # already cleaned by a previous call
            df = self.data
        else:
            if self.data is None or len(self.data) == 0:
                raise ValueError("training_data must be a non-empty CSV path")
            df = pd.read_csv(self.data)

        df = df[~df["label"].isin(list(excluded_labels))]
        df = df.groupby("label", group_keys=False).sample(
            n=samples_per_class,
            random_state=random_state,
            replace=True,
        )

        # The list is assigned positionally, which matters because sampling
        # with replacement leaves duplicate index labels.
        df = df.assign(preprocessed_text=self._preprocess(df["text"]))
        self.data = df.drop(columns=["text"])

    def train_vectorize(self):
        """Fit the uni/bi-gram vectorizer on the cleaned training text.

        Returns:
            Tuple ``(X, y)``: the sparse count matrix and the label array.
        """
        self.cv = CountVectorizer(ngram_range=(1, 2))
        X = self.cv.fit_transform(self.data["preprocessed_text"])
        y = self.data["label"].values
        print(X.shape)
        return X, y

    def test_vectorize(self):
        """Vectorize the test set with the vocabulary learned during training.

        The test texts get the same cleaning as the training texts and are
        transformed with the already-fitted vectorizer, so the columns line
        up with the features the model was trained on.  No rows are filtered
        or resampled; the output rows follow the test file's row order.

        Returns:
            Sparse count matrix with one row per test text.

        Raises:
            RuntimeError: If the vectorizer has not been fitted yet.
        """
        if getattr(self, "cv", None) is None:
            raise RuntimeError(
                "vectorizer is not fitted; call train() or train_vectorize() first"
            )
        if isinstance(self.testing_data, pd.DataFrame):
            test_df = self.testing_data
        else:
            test_df = pd.read_csv(self.testing_data)
        return self.cv.transform(self._preprocess(test_df["text"]))

    def create_model(self, in_features, out_features):
        """Build the MLP classifier (in_features -> 256 -> 128 -> out_features).

        Args:
            in_features: Size of the input feature vector.
            out_features: Number of output classes (logits).

        Returns:
            A new, untrained ``torch.nn.Module`` that returns raw logits.
        """
        class EmotionClassifier(t.nn.Module):
            def __init__(self, infeatures, out_features):
                super().__init__()
                self.network = t.nn.Sequential(
                    t.nn.Linear(infeatures, 256),
                    t.nn.ReLU(),
                    t.nn.Dropout(0.3),
                    t.nn.Linear(256, 128),
                    t.nn.ReLU(),
                    t.nn.Dropout(0.3),
                    t.nn.Linear(128, out_features),
                )

            def forward(self, x):
                return self.network(x)

        return EmotionClassifier(in_features, out_features)

    def train(self, epochs, batch_size, lr=0.0001):
        """Clean the data, fit the vectorizer and train the classifier.

        After the last epoch the model is switched to eval mode and the
        whole training set is scored once; the printed classification report
        and confusion matrix describe that final model (not a mix of
        predictions made while the weights were still changing).

        Args:
            epochs: Number of passes over the training set.
            batch_size: Mini-batch size.
            lr: Adam learning rate.

        Returns:
            The trained model, left in eval mode.
        """
        self.load_clean_csv()
        xtrain, ytrain = self.train_vectorize()
        # Cast while still sparse so the dense matrix is created directly as
        # float32 instead of going through a twice-as-large int64 copy.
        xtrain = xtrain.astype(np.float32).toarray()
        device = "cuda" if t.cuda.is_available() else "cpu"
        print("training on :" + device)

        # The network needs float inputs; CrossEntropyLoss needs long targets.
        # NOTE(review): the loss assumes labels are 0..num_classes-1 — confirm
        # this holds for the labels left after filtering.
        X_tensor = t.from_numpy(xtrain).to(device)
        Y_tensor = t.as_tensor(ytrain, dtype=t.long).to(device)

        dataset = TensorDataset(X_tensor, Y_tensor)
        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        num_classes = len(np.unique(ytrain))
        self.model = self.create_model(xtrain.shape[1], num_classes).to(device)

        criterion = t.nn.CrossEntropyLoss()
        optim = t.optim.Adam(self.model.parameters(), lr=lr)

        self.model.train()
        for epoch in range(epochs):
            for idx, (batch_x, batch_y) in enumerate(train_loader):
                loss = criterion(self.model(batch_x), batch_y)

                optim.zero_grad()
                loss.backward()
                optim.step()

                if idx % 50 == 0:
                    print(f"Epoch {epoch}, Batch {idx}, Loss: {loss.item():.4f}")

        # Score the finished model once, with dropout disabled.
        self.model.eval()
        batch_preds = []
        with t.no_grad():
            for batch_x, _ in DataLoader(dataset, batch_size=batch_size):
                batch_preds.append(self.model(batch_x).argmax(dim=1).cpu())
        if batch_preds:
            all_preds = t.cat(batch_preds)
        else:
            all_preds = t.empty(0, dtype=t.long)
        all_labels = Y_tensor.cpu()

        cr = classification_report(all_labels.numpy(), all_preds.numpy())
        cm = confusion_matrix(all_labels.numpy(), all_preds.numpy())
        print(cr)
        print(cm)
        return self.model

    def predict(self, text):
        """Classify a single raw text.

        Args:
            text: The raw input string.

        Returns:
            Tuple ``(pred, probabilities)``: the index of the highest-scoring
            class and a ``(1, num_classes)`` numpy array of softmax
            probabilities.

        Raises:
            RuntimeError: If the pipeline has not been trained yet.
        """
        if getattr(self, "model", None) is None or getattr(self, "cv", None) is None:
            raise RuntimeError("pipeline is not trained; call train() first")

        clean_text = self._preprocess([text])[0]
        X = self.cv.transform([clean_text]).astype(np.float32).toarray()

        # Follow the model's own device instead of re-deriving it, so a model
        # that was moved after training still receives matching inputs.
        device = next(self.model.parameters()).device
        X_tensor = t.from_numpy(X).to(device)

        self.model.eval()
        with t.no_grad():
            output = self.model(X_tensor)
            pred = output.argmax(dim=1).item()
            probabilities = t.nn.functional.softmax(output, dim=1)

        return pred, probabilities.cpu().numpy()

        

In [None]:
# Build the pipeline from the two CSV files and train it for five epochs
# with mini-batches of 64 samples.
pipeline = Pipeline(training_data="training.csv", testing_data="test.csv")
pipeline.train(5, 64)

# Classify one sample sentence and show the predicted class index together
# with the per-class softmax probabilities.
text = "I am so angry about this!"
prediction = pipeline.predict(text)
pred_class, probs = prediction
print(f"Predicted class: {pred_class}")
print(f"Probabilities: {probs}")

(5200, 34103)
training on :cuda
Epoch 0, Batch 0, Loss: 1.3855
Epoch 0, Batch 50, Loss: 1.3819
Epoch 1, Batch 0, Loss: 1.3714
Epoch 1, Batch 50, Loss: 1.3434
Epoch 2, Batch 0, Loss: 1.2656
Epoch 2, Batch 50, Loss: 1.1063
Epoch 3, Batch 0, Loss: 0.9062
Epoch 3, Batch 50, Loss: 0.6483
Epoch 4, Batch 0, Loss: 0.4721
Epoch 4, Batch 50, Loss: 0.3277
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      6500
           1       0.63      0.89      0.74      6500
           2       0.89      0.80      0.84      6500
           3       0.99      0.64      0.78      6500

    accuracy                           0.78     26000
   macro avg       0.82      0.78      0.79     26000
weighted avg       0.82      0.78      0.79     26000

[[5200 1202   80   18]
 [ 411 5795  285    9]
 [ 264 1031 5205    0]
 [ 830 1208  277 4185]]
Predicted class: 3
Probabilities: [[0.22413659 0.13713335 0.15775715 0.480973  ]]
