In [82]:
!pip install gensim==4.3.2
!pip install mlflow==2.11.1



In [83]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.10.14


In [84]:
import pandas as pd
import torch as torch

In [80]:
# List the S3 buckets visible to the current AWS credentials.
# NOTE(review): the captured output contains bucket names; consider clearing it before sharing.
!aws s3 ls

2024-03-08 16:36:55 aws-glue-assets-499751399505-ca-central-1
2023-07-16 19:22:33 madhav-krishna-demo
2024-01-31 04:07:29 mini-pipeline-project
2024-04-11 00:49:21 sagemaker-ca-central-1-499751399505
2024-04-11 00:49:18 sagemaker-studio-499751399505-l6sukv6beie
2024-03-04 03:48:55 wcd-project-twitter-datasets


In [59]:
import os

import boto3

# The bucket name is read from the environment; fail fast with a clear message
# instead of handing None to boto3.
S3_BUCKET = os.environ.get("S3_BUCKET")
if not S3_BUCKET:
    raise RuntimeError("Set the S3_BUCKET environment variable to the bucket that holds the Twitter datasets.")

# Fetch both CSV files into the working directory.
s3 = boto3.client('s3')
s3.download_file(S3_BUCKET, 'data/training/twitter_training.csv', 'twitter_training.csv')
s3.download_file(S3_BUCKET, 'data/test/test-training.csv', 'twitter_validation.csv')

# The files are read without a header row, so column names are supplied here.
COLUMN_NAMES = ["id", "account", "label", "text"]
training_df = pd.read_csv('twitter_training.csv', names=COLUMN_NAMES)
test_df = pd.read_csv('twitter_validation.csv', names=COLUMN_NAMES)

# 'account' is dropped from the training frame only; the later groupby keeps
# just id/label/text for both frames.
training_df.pop('account')
training_df.head()

Unnamed: 0,id,label,text
0,2401,Positive,im getting on borderlands and i will murder yo...
1,2401,Positive,I am coming to the borders and I will kill you...
2,2401,Positive,im getting on borderlands and i will kill you ...
3,2401,Positive,im coming on borderlands and i will murder you...
4,2401,Positive,im getting on borderlands 2 and i will murder ...


In [60]:
#Data Cleaning
# Report the number of missing texts and labels in each split
# (training text, training label, test text, test label).
for frame in (training_df, test_df):
    for column in ("text", "label"):
        print(frame[column].isnull().sum())

# Fill missing values by reassigning the frame. `df[col].fillna(..., inplace=True)`
# operates on a temporary column object, which pandas 2.x warns about and which
# has no effect once copy-on-write is enabled.
FILL_VALUES = {'text': '', 'label': 'Neutral'}
training_df = training_df.fillna(FILL_VALUES)
test_df = test_df.fillna(FILL_VALUES)

# Merge all texts sharing the same (id, label) into one space-separated document.
training_df = training_df.groupby(['id', 'label']).agg({'text': ' '.join}).reset_index()
test_df = test_df.groupby(['id', 'label']).agg({'text': ' '.join}).reset_index()

# Class balance after aggregation: Positive, Negative, Neutral.
print(len(training_df[training_df["label"] == "Positive"]), \
      len(training_df[training_df["label"] == "Negative"]), \
      len(training_df[training_df["label"] == "Neutral"]))

686
0
0
0
3472 3757 3053


In [61]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fixed seed so the train/validation split (and therefore the reported metrics)
# is the same on every Restart & Run All.
SEED = 42

# Boolean masks per sentiment; index 0 is the training frame, index 1 the test frame.
Positive = [training_df["label"] == "Positive", test_df["label"] == "Positive"]
Negative = [training_df["label"] == "Negative", test_df["label"] == "Negative"]
Neutral = [training_df["label"] == "Neutral", test_df["label"] == "Neutral"]

# Store the masks as indicator columns; Texts reads them to build the one-hot label.
training_df["Positive"], training_df["Negative"], training_df["Neutral"] = \
Positive[0], Negative[0], Neutral[0]

test_df["Positive"], test_df["Negative"], test_df["Neutral"] = \
Positive[1], Negative[1], Neutral[1]

# 80/20 split, stratified on the label so class proportions are preserved.
training_train_df, training_val_df = train_test_split(
    training_df,
    train_size=0.8,
    stratify=training_df["label"],
    random_state=SEED,
)
print(len(training_df))
print(len(training_train_df))
print(len(training_train_df[training_train_df['label'] == 'Positive']), \
      len(training_train_df[training_train_df['label'] == 'Negative']), \
      len(training_train_df[training_train_df['label'] == 'Neutral']))

class Texts(Dataset):
  """Dataset over a DataFrame with 'text', 'Positive', 'Negative' and 'Neutral' columns.

  Each item is a (text, label) pair where label is a 3-element integer tensor
  built from the Positive/Negative/Neutral indicator columns, in that order.
  """

  # Column order defines the position of each class in the label tensor.
  _CLASS_COLUMNS = ('Positive', 'Negative', 'Neutral')

  def __init__(self, df):
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    # Positional lookup of the row, then label-based access to its fields.
    row = self.df.iloc[index]
    flags = [int(row.at[column]) for column in self._CLASS_COLUMNS]
    return (row.at['text'], torch.tensor(flags))

training_dataset = Texts(training_train_df)
validation_dataset = Texts(training_val_df)
test_dataset = Texts(test_df)

BATCH_SIZE = 8

# Reshuffle the training data every epoch so SGD does not see the batches in the
# same order each time; evaluation loaders keep a fixed order.
train_dataloaders = {'train': DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True),
                     'val': DataLoader(validation_dataset, batch_size=BATCH_SIZE)}
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


12447
9957
2778 3005 2442


In [62]:

from numpy import mean
import numpy as np
from gensim.utils import simple_preprocess
import torch
import torch.nn as nn
from torch import Tensor
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score


def training_loop(dataloader, model1, model2, epochs, loss_function, optimizer, scheduler = None):
  """Train the classifier on Doc2Vec embeddings and evaluate it after every epoch.

  model1 is the (already trained) Doc2Vec model and is only used to embed the
  texts; model2 is the classification layer that is optimised here.

  Args:
    dataloader: dict with 'train' and 'val' loaders yielding (texts, one-hot labels).
    model1: embedding model exposing infer_vector(tokens).
    model2: torch module mapping a batch of embeddings to class scores.
    epochs: number of passes over the training data.
    loss_function: callable taking (output, label) and returning a scalar loss.
    optimizer: optimizer stepped once per training batch.
    scheduler: optional scheduler, stepped once per epoch with the mean validation loss.

  Returns:
    A list with one 3x3 confusion matrix (validation phase) per epoch.
  """
  epoch_c_matrices = []
  epoch_losses = []
  # Moving the module once is enough; nn.Module.to works in place.
  model2.to(device)
  for epoch in range(epochs):
    print("Epoch: " + str(epoch))
    epoch_c_matrix = np.zeros((3,3))
    batch_losses = []
    for x in ['train', 'val']:
      is_training = x == 'train'
      if is_training:
        model2.train()
      else:
        model2.eval()
      for (data, label) in dataloader[x]:
        label = label.to(torch.float32).to(device)
        vectors = np.asarray([model1.infer_vector(simple_preprocess(text)) for text in data])
        # Tensor.to returns a new tensor, so the result must be kept; otherwise
        # the batch stays on the CPU while the model may live on the GPU.
        data = Tensor(vectors).to(device)
        # Only build the autograd graph when gradients are actually needed.
        with torch.set_grad_enabled(is_training):
          output = model2(data)
          loss = loss_function(output, label)

        if is_training:
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
        else:
          predictions = torch.argmax(output.detach().to('cpu'), 1)
          targets = torch.argmax(label.to('cpu'), 1)
          epoch_c_matrix += confusion_matrix(targets, predictions, labels=[0,1,2])
          batch_losses.append(loss.item())

    epoch_loss = mean(batch_losses)
    epoch_losses.append(epoch_loss)

    epoch_c_matrices.append(epoch_c_matrix)
    print("The accuracy is: ")
    # Correct predictions sit on the diagonal of the confusion matrix.
    print(np.trace(epoch_c_matrix)/np.sum(epoch_c_matrix))

    if scheduler is not None:
        scheduler.step(epoch_loss)

  return epoch_c_matrices


In [85]:
import gensim
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument

def create_documents(corpus, tokens_only= False):
  """Tokenise every text in corpus with simple_preprocess.

  Returns a list of token lists when tokens_only is true; otherwise a list of
  TaggedDocument objects, each tagged with the text's position in corpus.
  """
  tokenised = (simple_preprocess(text) for text in corpus)
  if tokens_only:
    return list(tokenised)
  return [TaggedDocument(tokens, [position]) for position, tokens in enumerate(tokenised)]

# Tagged documents for fitting Doc2Vec; plain token lists for the test split.
train_corpus = create_documents(training_train_df['text'])
test_corpus = create_documents(test_df['text'], tokens_only = True)
# Show the first entry of each corpus as a sanity check.
for first_entry in (train_corpus[0], test_corpus[0]):
  print(first_entry)

TaggedDocument<['the', 'most', 'boring', 'game', 've', 'ever', 'played', 'the', 'most', 'boring', 'game', 've', 'ever', 'played', 'the', 'most', 'boring', 'game', 've', 'ever', 'played', 'the', 'most', 'violent', 'game', 've', 'ever', 'played', 'the', 'most', 'boring', 'single', 'game', 've', 'ever', 'played', 'the', 'most', 'boring', 'game', 've', 'yet', 'played'], [0]>
['gamespot', 'borderlands', 'dlc', 'dismisses', 'major', 'problems', 'with', 'lovecraft', 'work', 'ift', 'tt', 'qpeis', 'gamespot', 'borderlands', 'dlc', 'rejects', 'major', 'issues', 'with', 'lovecraft', 'work', 'ift', 'tt', 'qpeis', 'gamespot', 'dlc', 'borderlands', 'fires', 'protagonists', 'over', 'lovecraft', 'tt', 'qs', 'gamespot', 'borderlands', 'dlc', 'dismisses', 'major', 'problems', 'lovecraft', 'dream', 'ift', 'tt', 'qpeis', 'official', 'gamespot', 'lost', 'borderlands', 'dlc', 'dismisses', 'major', 'problems', 'running', 'with', 'lovecraft', 'underground', 'work', 'as', 'ift', 'tt', 'qpeis', 'gamespot', 'fan

In [64]:

def validate(test_dataloader, model1, model2):
    """Evaluate the classifier on a dataloader and return the summed confusion matrix.

    Args:
        test_dataloader: loader yielding (texts, one-hot labels) batches.
        model1: embedding model exposing infer_vector(tokens).
        model2: torch module mapping a batch of embeddings to class scores.

    Returns:
        A 3x3 numpy array: the confusion matrix accumulated over all batches.
    """
    batch_c_matrix = np.zeros((3,3))
    # Evaluate in eval mode, then restore whatever mode the caller had set.
    was_training = model2.training
    model2.to(device)
    model2.eval()
    try:
        # No optimizer is involved here and no gradients are needed.
        with torch.no_grad():
            for (data, label) in test_dataloader:
                vectors = np.asarray([model1.infer_vector(simple_preprocess(text)) for text in data])
                # Tensor.to returns a new tensor, so the result must be kept.
                data = Tensor(vectors).to(device)
                output = model2(data)
                predictions = torch.argmax(output.to('cpu'), 1)
                targets = torch.argmax(label.to('cpu'), 1)
                batch_c_matrix += confusion_matrix(targets, predictions, labels=[0,1,2])
    finally:
        model2.train(was_training)
    return batch_c_matrix

def accuracy(batch_matrix):
    """Return the fraction of correct predictions in a square confusion matrix.

    Correct predictions are the diagonal entries, so this works for any number
    of classes. An all-zero matrix (no samples) yields 0.0 instead of dividing
    by zero.
    """
    total = np.sum(batch_matrix)
    if total == 0:
        return 0.0
    return float(np.trace(batch_matrix) / total)
        

In [65]:
import mlflow

class Doc2VecWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a saved gensim Doc2Vec model.

    The model is loaded from the 'doc2vec_file_path' artifact in load_context,
    and predict returns the vector inferred for the given input.
    """

    def __init__(self):
        super().__init__()
        # Populated by load_context.
        self.model = None

    def load_context(self, context):
        # Imported here rather than at module level.
        from gensim.models.doc2vec import Doc2Vec
        self.model = Doc2Vec.load(context.artifacts['doc2vec_file_path'])

    def predict(self, context, model_input):
        # model_input is passed straight to infer_vector.
        return self.model.infer_vector(model_input)

In [86]:
#Below is subject to change
import os

# The MLflow tracking server host comes from the environment; stop early rather
# than pointing the client at "http://None:5000".
TRACKING_SERVER_HOST = os.environ.get("TRACKING_SERVER_HOST")
if not TRACKING_SERVER_HOST:
    raise RuntimeError("Set the TRACKING_SERVER_HOST environment variable to the MLflow tracking server host.")
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")
mlflow.search_experiments()

tracking URI: 'http://ec2-99-79-55-202.ca-central-1.compute.amazonaws.com:5000'


[<Experiment: artifact_location='s3://wcd-project-twitter-datasets/artifacts/2', creation_time=1714524225208, experiment_id='2', last_update_time=1714524225208, lifecycle_stage='active', name='doc2vec experiment 2', tags={}>,
 <Experiment: artifact_location='s3://wcd-project-twitter-datasets/artifacts/1', creation_time=1713313446007, experiment_id='1', last_update_time=1713313446007, lifecycle_stage='active', name='doc2vec experiment', tags={}>,
 <Experiment: artifact_location='s3://wcd-project-twitter-datasets/artifacts/0', creation_time=1712889391161, experiment_id='0', last_update_time=1712889391161, lifecycle_stage='active', name='Default', tags={}>]

In [67]:

import os
import torch.optim as optim

# Hyperparameters for this run.
VECTOR_SIZE = 10
NUM_CLASSES = 3

lr = 5e-4
dropout = 0.3
epochs = 1

mlflow.set_experiment("doc2vec experiment 2")

with mlflow.start_run() as run:

    run_id = run.info.run_id
    params = {'VECTOR_SIZE':VECTOR_SIZE, 'NUM_CLASSES':NUM_CLASSES, 'lr':lr, 'dropout':dropout,\
          'BATCH_SIZE':BATCH_SIZE, 'epochs':epochs}
    # Attach the hyperparameters to the run so runs can be compared in the MLflow UI.
    # NOTE(review): 'dropout' is recorded but model2 below contains no dropout layer.
    mlflow.log_params(params)

    # Fit the Doc2Vec embedding model on the tagged training corpus.
    model1 = gensim.models.doc2vec.Doc2Vec(vector_size=VECTOR_SIZE, min_count=2, epochs=40)
    model1.build_vocab(train_corpus)

    model1.train(train_corpus, total_examples=model1.corpus_count, epochs=model1.epochs)

    # Linear layer + softmax on top of the document vectors.
    model2 = nn.Sequential(nn.Linear(VECTOR_SIZE, NUM_CLASSES), nn.Softmax(dim = 1))

    loss_function = nn.BCELoss()
    optimizer = optim.SGD(model2.parameters(), lr = lr, weight_decay=5e-4, momentum=0.9)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.6)
    args = training_loop(train_dataloaders, model1, model2, epochs, loss_function, optimizer, scheduler)

    # Persist both models as artifacts of this run.
    model1.save('doc2vec.pkl')
    mlflow.pyfunc.log_model(artifact_path = "doc2vec_doc_embedding",python_model= Doc2VecWrapper(), \
                            artifacts = {'doc2vec_file_path': 'doc2vec.pkl'}, pip_requirements=["gensim==4.3.2"])
    mlflow.pytorch.log_model(model2, artifact_path="doc2vec_classification")
    #logging the model accuracy
    validation = validate(train_dataloaders['val'], model1, model2)
    mlflow.log_metric("accuracy", accuracy(validation))

# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.

Epoch: 0
The accuracy is: 
0.5413654618473895


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

 - mlflow (current: 2.12.1, required: mlflow==2.11.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - mlflow (current: 2.12.1, required: mlflow==2.11.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [74]:


#Register the best model
# Ask the tracking server for the single run with the highest logged accuracy.
best_runs = mlflow.search_runs(
    experiment_names=["doc2vec experiment 2"],
    max_results=1,
    order_by=["metrics.accuracy DESC"],
)
# Give a clear error instead of an opaque KeyError when the experiment has no runs.
if best_runs.empty:
    raise RuntimeError("No runs found in 'doc2vec experiment 2'; run the training cell before registering a model.")
highest_accuracy_run_id = best_runs['run_id'].iloc[0]

print(highest_accuracy_run_id)

# Register both artifacts of that run under their fixed registry names.
embedding = mlflow.register_model(f"runs:/{highest_accuracy_run_id}/doc2vec_doc_embedding", "doc2vec_doc_embedding")

classification = mlflow.register_model(f"runs:/{highest_accuracy_run_id}/doc2vec_classification", "doc2vec_classification")

Registered model 'doc2vec_doc_embedding' already exists. Creating a new version of this model...
2024/05/01 01:08:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: doc2vec_doc_embedding, version 3
Created version '3' of model 'doc2vec_doc_embedding'.
Registered model 'doc2vec_classification' already exists. Creating a new version of this model...
2024/05/01 01:08:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: doc2vec_classification, version 3


b9036f41898546d5830181307ad80978


Created version '3' of model 'doc2vec_classification'.


In [87]:
#Simple test of model registering: load both registered models back by version
model_version = 3
embedding_uri = f"models:/doc2vec_doc_embedding/{model_version}"
classification_uri = f"models:/doc2vec_classification/{model_version}"
embedding_model = mlflow.pyfunc.load_model(model_uri=embedding_uri)
classification_model = mlflow.pytorch.load_model(model_uri=classification_uri)
print(classification_model)

Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2024/05/05 23:41:42 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
 - mlflow (current: 2.12.1, required: mlflow==2.11.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/05 23:41:44 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Sequential(
  (0): Linear(in_features=10, out_features=3, bias=True)
  (1): Softmax(dim=1)
)
tensor([[-0.2221, -0.5706,  0.2081, -0.1777,  0.1867, -0.0440,  0.2745, -0.0332,
         -0.5207,  0.1626]])
tensor([[0.3635, 0.3695, 0.2670]], grad_fn=<SoftmaxBackward0>)


In [98]:
# Embed one tweet with the registered Doc2Vec model and classify it.
tweet = "this is a random tweet"
tokens = simple_preprocess(tweet)
# unsqueeze(0) adds the leading batch dimension the classifier is called with.
data = Tensor(embedding_model.predict(data=tokens)).unsqueeze(0)
print(data)
classification_model.eval()
with torch.no_grad():
    output = classification_model(data)
print(output)

tensor([[-0.1331, -0.4910,  0.2949, -0.1147,  0.1475, -0.0035,  0.2693, -0.0062,
         -0.3912,  0.1886]])
tensor([[0.3652, 0.3627, 0.2721]])
