# MLOps Assignment 1
# Text Classification - **Training**

## 1. Packages Import

In [None]:
from datasets import load_dataset
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score, f1_score
# from transformers import AutoTokenizer
# import torch
# from torch.utils.data import DataLoader
# import pickle
# import nltk
# import re
# import string
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer


## 2. Data Loading

### Load the Dataset from Hugging Face using the `datasets` library

In [None]:
# Load the 'emotion' dataset and bind the result to `dataset`.
# NOTE(review): the recorded output of this cell warns that `trust_remote_code=True`
# will become mandatory for this dataset in the next major `datasets` release —
# confirm against the installed `datasets` version before re-running.
dataset = load_dataset('emotion')


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Display the loaded object: its splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Peek at the first two examples of the train split.
dataset['train'][:2]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'],
 'label': [0, 0]}

In [None]:
# Peek at the test examples at positions 1996-1999 (the test split has 2000 rows).
dataset['test'][1996:2000]

{'text': ['im feeling a little cranky negative after this doctors appointment',
  'i feel that i am useful to my people and that gives me a great feeling of achievement',
  'im feeling more comfortable with derby i feel as though i can start to step out my shell',
  'i feel all weird when i have to meet w people i text but like dont talk face to face w'],
 'label': [3, 1, 1, 4]}

### Split the dataset into training, validation, and test sets

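The dataset already ships with `train`, `validation`, and `test` splits (see the `DatasetDict` output above). Here we ignore the provided `validation` split and instead carve a new 80/20 `train`/`validation` split out of the original `train` set; the provided `test` split is used as-is.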

In [None]:
# Build the train / validation / test splits used by the rest of the notebook
from sklearn.model_selection import train_test_split

# The provided test split is taken over unchanged
X_test = dataset['test']['text']
y_test = dataset['test']['label']

# Hold out 20% of the provided train split for validation; the fixed
# random_state makes the partition repeatable
X = dataset['train']['text']
y = dataset['train']['label']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Print sizes of the splits
print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Bundle texts and labels of each split under the 'text' / 'labels' keys
train = dict(text=X_train, labels=y_train)
val = dict(text=X_val, labels=y_val)
test = dict(text=X_test, labels=y_test)

Train set size: 12800
Validation set size: 3200
Test set size: 2000


## 2. Data Preprocessing

In [None]:
import nltk
# Fetch the NLTK data packages this notebook asks for. The call on the last line
# is also the cell's displayed value (the recorded output shows `True`).
# NOTE(review): presumably 'punkt' backs word_tokenize and 'wordnet'/'omw-1.4'
# back WordNetLemmatizer — confirm. The stopwords corpus is imported in the next
# cell but no visible cell uses it; verify it is still needed.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Function to remove punctuations from text
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Translation table that deletes every character of string.punctuation.
# Built once so that applying the function row by row does not rebuild it.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string inputs (e.g. numbers) are accepted.

    Returns:
        str: The input with all characters of ``string.punctuation`` deleted.
    """
    # Each punctuation character is deleted literally. The previous regex class
    # '[' + string.punctuation + ']' read the '\]' pair inside string.punctuation
    # as an escaped ']', so backslashes were silently left in the text.
    return str(text).translate(_PUNCTUATION_TABLE)

# Function to remove URLs from text
def remove_urls(text):
    """Strip ``http://`` and ``https://`` URLs from ``text``.

    A URL is matched from its scheme up to (not including) the next whitespace
    character; the whitespace around it is left in place.
    """
    url_pattern = re.compile(r'http[s]?://\S+')
    return url_pattern.sub('', text)

# Function to convert the text into lower case
def lower_case(text):
    """Return a lower-cased copy of ``text`` (delegates to ``text.lower()``)."""
    lowered = text.lower()
    return lowered

# Function to lemmatize text
def lemmatize(text):
    """Lemmatize every token of ``text`` with NLTK's WordNet lemmatizer.

    The text is split with ``nltk.word_tokenize`` and each token is passed
    through ``WordNetLemmatizer.lemmatize``. The lemmas are returned as one
    string in which every lemma is followed by a single space, so a non-empty
    result ends with a trailing space.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    return ''.join(lemma + ' ' for lemma in lemmas)

In [None]:
import pandas as pd

# Clean the training texts: strip URLs, drop punctuation, lower-case, lemmatize
series = (
    pd.Series(train['text'])
    .apply(remove_urls)
    .apply(remove_punctuation)
    .apply(lower_case)
    .apply(lemmatize)
)

# Store the cleaned texts back on the train split as a plain list
train['text'] = series.to_list()

In [None]:
# Show the first two training texts after preprocessing.
train['text'][:2]

['i refers of course though i cant help feeling somehow ironically in retrospect to loudons son with kate mcgarrigle the rather talented himself rufus wainwright ',
 'im starting to feel that im suffering from fatigue ']

In [None]:
# Clean the validation texts: strip URLs, drop punctuation, lower-case, lemmatize
series = (
    pd.Series(val['text'])
    .apply(remove_urls)
    .apply(remove_punctuation)
    .apply(lower_case)
    .apply(lemmatize)
)

# Store the cleaned texts back on the validation split as a plain list
val['text'] = series.to_list()

In [None]:
# Show the first two validation texts after preprocessing.
val['text'][:2]

['ive made it through a week i just feel beaten down ',
 'i feel this strategy is worthwhile ']

In [None]:
# Clean the test texts: strip URLs, drop punctuation, lower-case, lemmatize
series = (
    pd.Series(test['text'])
    .apply(remove_urls)
    .apply(remove_punctuation)
    .apply(lower_case)
    .apply(lemmatize)
)

# Store the cleaned texts back on the test split as a plain list
test['text'] = series.to_list()

In [None]:
# Show the first two test texts after preprocessing.
test['text'][:2]

['im feeling rather rotten so im not very ambitious right now ',
 'im updating my blog because i feel shitty ']

### Tokenize the text data

In [None]:
from transformers import AutoTokenizer

# Load the BERT tokenizer
# NOTE(review): this is the *cased* checkpoint, while the preprocessing cells
# lower-case every text before tokenization — confirm the cased vocabulary is
# what is intended here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize the text data
def tokenize_data(data):
    """Encode ``data['text']`` with the notebook-level ``tokenizer``.

    The tokenizer is called with ``padding=True``, ``truncation=True`` and
    ``return_tensors='pt'``. Every call pads on its own, so different splits can
    come out with different sequence lengths (the recorded output shows 94, 76
    and 68 tokens for train, val and test).

    Args:
        data: Mapping whose ``'text'`` entry holds the texts to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch of texts.
    """
    texts = data['text']
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')


# Encode each split separately, in train / val / test order
train_tokenized, val_tokenized, test_tokenized = (
    tokenize_data(split) for split in (train, val, test)
)

In [None]:
# Print the first encoded example of each split. Integer indexing yields an
# Encoding object whose num_tokens shows that split's padded length (see output).
print("train_tokenized[0] =", train_tokenized[0])
print("val_tokenized[0] =", val_tokenized[0])
print("test_tokenized[0] =", test_tokenized[0])

train_tokenized[0] = Encoding(num_tokens=94, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
val_tokenized[0] = Encoding(num_tokens=76, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
test_tokenized[0] = Encoding(num_tokens=68, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


### Convert the tokenized data into a format suitable for training

In [None]:
import torch
# Convert tokenized data into tensors
def convert_to_tensors(data):
    """Return a plain dict mapping every key of ``data`` to a tensor copy of its value.

    Args:
        data: Mapping-like object exposing ``items()``; values are either
            existing ``torch.Tensor`` objects or data ``torch.tensor`` accepts
            (e.g. nested lists of ints).

    Returns:
        dict: Same keys as ``data``; each value is a new tensor that does not
        share storage with the input.
    """
    def _tensor_copy(value):
        # Calling torch.tensor() on an existing tensor emits a UserWarning;
        # detach().clone() is the copy form PyTorch recommends instead and
        # yields the same detached copy.
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    return {key: _tensor_copy(value) for key, value in data.items()}

# Apply the conversion to each tokenized split; each result is a plain dict of tensors.
train_tensors = convert_to_tensors(train_tokenized)
val_tensors = convert_to_tensors(val_tokenized)
test_tensors = convert_to_tensors(test_tokenized)

  return {key: torch.tensor(val) for key, val in data.items()}


In [None]:
# Display the converted training tensors (long tensors are elided in the repr).
train_tensors

{'input_ids': tensor([[  101,   178,  4431,  ...,     0,     0,     0],
         [  101, 13280,  2547,  ...,     0,     0,     0],
         [  101,   178,  1631,  ...,     0,     0,     0],
         ...,
         [  101,   178, 20049,  ...,     0,     0,     0],
         [  101,   178,  1631,  ...,     0,     0,     0],
         [  101,   178,  1631,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# Display the converted validation tensors (long tensors are elided in the repr).
val_tensors

{'input_ids': tensor([[ 101,  178, 2707,  ...,    0,    0,    0],
         [ 101,  178, 1631,  ...,    0,    0,    0],
         [ 101,  178, 1631,  ...,    0,    0,    0],
         ...,
         [ 101,  178, 1631,  ...,    0,    0,    0],
         [ 101,  178, 7994,  ...,    0,    0,    0],
         [ 101,  178, 1631,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# Display the converted test tensors (long tensors are elided in the repr).
test_tensors

{'input_ids': tensor([[  101, 13280,  2296,  ...,     0,     0,     0],
         [  101, 13280,  1146,  ...,     0,     0,     0],
         [  101,   178,  1309,  ...,     0,     0,     0],
         ...,
         [  101,   178,  1631,  ...,     0,     0,     0],
         [  101, 13280,  2296,  ...,     0,     0,     0],
         [  101,   178,  1631,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

### Create data loaders for the training, validation, and test sets

In [None]:
from torch.utils.data import DataLoader, TensorDataset


def _build_dataset(features, labels):
    """Combine one split's token tensors with its labels into an indexable dataset.

    Args:
        features: Dict of tensors holding at least 'input_ids' and
            'attention_mask', each with one row per example.
        labels: Sequence of integer class labels, one per example.

    Returns:
        TensorDataset: Item ``i`` is the tuple
        ``(input_ids[i], attention_mask[i], label[i])``.
    """
    return TensorDataset(
        features['input_ids'],
        features['attention_mask'],
        torch.tensor(list(labels)),
    )


# Create data loaders
# A DataLoader indexes its dataset by integer position. Given the raw dict of
# tensors it would see a 3-item "dataset" (one per key) and fail with a KeyError
# on iteration, and the labels would be missing from the batches. Each split is
# therefore wrapped in a TensorDataset first.
train_loader = DataLoader(_build_dataset(train_tensors, train['labels']), batch_size=32, shuffle=True)
val_loader = DataLoader(_build_dataset(val_tensors, val['labels']), batch_size=32)
test_loader = DataLoader(_build_dataset(test_tensors, test['labels']), batch_size=32)

## 3. Model Training

### Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Convert PyTorch tensors to NumPy arrays
X_train_numpy = train_tensors['input_ids'].numpy()
y_train = train['labels']

X_val_numpy = val_tensors['input_ids'].numpy()
y_val = val['labels']

# Reshape the data to 2-D (samples x features) for the classifier
X_train_flattened = X_train_numpy.reshape(X_train_numpy.shape[0], -1)
X_val_flattened = X_val_numpy.reshape(X_val_numpy.shape[0], -1)

# Give the validation matrix the same number of columns as the training matrix:
# surplus columns are cut off, missing ones are filled with 0. Truncating first
# keeps the pad width from going negative (which np.pad rejects) when the
# validation split happens to be wider than the training split.
n_train_features = X_train_flattened.shape[1]
X_val_flattened_resized = np.pad(
    X_val_flattened[:, :n_train_features],
    ((0, 0), (0, max(0, n_train_features - X_val_flattened.shape[1]))),
    mode='constant',
)

# Report the shapes only once the arrays exist; printing them at the top of the
# cell raised NameError on a fresh kernel.
print("X_train_flattened shape:", X_train_flattened.shape)
print("X_val_flattened_resized shape:", X_val_flattened_resized.shape)

# Initialize and train the Naive Bayes classifier
# NOTE(review): each feature column holds the token id found at that sequence
# position, not a word count. MultinomialNB is documented for count-like
# features, which may explain the low scores recorded below — consider
# bag-of-words counts instead; confirm before relying on this model.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_flattened, y_train)

# Predictions
y_pred_train = naive_bayes_classifier.predict(X_train_flattened)
y_pred_val = naive_bayes_classifier.predict(X_val_flattened_resized)

# Calculate accuracies and class-weighted F1 scores
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
train_f1_score = f1_score(y_train, y_pred_train, average='weighted')
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')

print(f"Train Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {val_accuracy}")
print(f"Train F1 Score: {train_f1_score}")
print(f"Validation F1 Score: {val_f1_score}")


X_train_flattened shape: (12800, 94)
X_val_flattened_resized shape: (3200, 76)
Train Accuracy: 0.204921875
Validation Accuracy: 0.181875
Train F1 Score: 0.20738226518793815
Validation F1 Score: 0.1846568962783432


## 4. Model Evaluation

### Evaluate the trained model on the test set

In [None]:
# Convert test data to NumPy arrays
X_test_numpy = test_tensors['input_ids'].numpy()
y_test = test['labels']

# Bring the test matrix to the number of features the classifier was trained on:
# surplus columns are cut off, missing ones are filled with 0. Truncating first
# keeps the pad width from going negative (which np.pad rejects) when the test
# split happens to be wider than the training split.
n_train_features = X_train_flattened.shape[1]
X_test_flattened_resized = np.pad(
    X_test_numpy[:, :n_train_features],
    ((0, 0), (0, max(0, n_train_features - X_test_numpy.shape[1]))),
    mode='constant',
)

# Predictions on the test set
y_pred_test = naive_bayes_classifier.predict(X_test_flattened_resized)

# Calculate accuracy and class-weighted F1 score on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1_score}")


Test Accuracy: 0.174
Test F1 Score: 0.17808307651934932


## 5. Model Deployment

### Save the trained model (Pickle)

In [None]:
import pickle

# Destination of the serialized classifier, relative to the working directory
model_file_path = "naive_bayes_emotion_model.pkl"

# Serialize the fitted Naive Bayes classifier; pickle needs a binary-mode handle
with open(model_file_path, mode='wb') as file:
    pickle.dump(naive_bayes_classifier, file)

print(f"Trained model saved to {model_file_path}")

Trained model saved to naive_bayes_emotion_model.pkl
