# MT5 Model Training and Testing

In [4]:
# !python.exe -m pip install --upgrade pip --user
# !pip install transformers --user
# !pip install datasets --user
# !pip install torch --user
# !pip install scikit-learn --user
# !pip install sentencepiece --user
# !pip install transformers[torch] --user

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 1.8/1.8 MB 4.3 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.2
    Uninstalling pip-24.2:
      Successfully uninstalled pip-24.2
Successfully installed pip-24.3.1








# Setting up the Data

In [2]:
import pandas as pd
from transformers import Trainer, TrainingArguments, MT5Tokenizer, MT5ForConditionalGeneration
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch

In [3]:
train_df = pd.read_csv('C:\PycharmProjects\CS 534\mt5Model\\tweet-sentiment-extraction\\train.csv')
test_df = pd.read_csv('C:\PycharmProjects\CS 534\mt5Model\\tweet-sentiment-extraction\\test.csv')

In [4]:
from sklearn.model_selection import train_test_split

# Define the input and output
train_df['input_text'] = train_df['text']
train_df['output_text'] = train_df['sentiment']

# Split the training data into sections for training the model and sections for validating the model
# The current setup has 90% of the data being used as training data and 10% of the data being used to validate the model
# X_train is the training data input values and x_val is the validation data input values
# y_train is the training data output values and y_val is the validation data output values
X_train, X_val, y_train, y_val = train_test_split(
    train_df['input_text'],
    train_df['output_text'],
    test_size=0.1,
    random_state=42
)

In [5]:


tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

def encode_data(texts):
    if isinstance(texts, pd.Series):  # If input is a Pandas Series
        texts = texts.tolist()  # Convert to list
    elif isinstance(texts, str):  # If input is a single string
        texts = [texts]  # Make it a list
    else:
        # If it gets here, then there is a massive error, but just want to make sure
        print("\n\n\nHuuuuuge Error Here\n\n\n")
        print(type(texts))

    # Ensure all elements are strings and handle missing values
    # set text to a string, and if the string has values then do nothing, otherwise, set the string spots 
    #to "NA" for every spot in the string
    for i in range(len(texts)):
        if pd.notna(texts[i]):
            texts[i] = str(texts[i])
        else:
            texts[i] = "NA"
    # texts = [str(text) if pd.notna(text) else "NA" for text in texts]

    # This code converts the whole text thing into tensors (torch lists of same size) which are 
    #number the number conversions of each word in a string
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=50,
        return_tensors='pt'
    )

#tokenize the training data and the validating data
train_encodings = encode_data(X_train)
val_encodings = encode_data(X_val)

print(type(X_train))  # Should be a Pandas Series
print(X_train)  # Should show some strings

print("\n\n")

print(y_train)

#test_encodings = encode_data(test_df['text'])

<class 'pandas.core.series.Series'>
14619    WTF facebook just cleared out my whole survey ...
25779    Back from LAAANDAN.  Miss it already   check o...
6138      i feel like tweeting you for no reason. so um...
17428    Bank Holiday Brunch.  With all the fixin`s.  A...
18638           Tired with a headache  me no like sunshine
                               ...                        
21575    STAR TREK WAS PURE AWESOME! LOVE IT!!! <3333  ...
5390     Will be going to Indiana Baptist Sunday, Pray ...
860      is sitting thru the boring bits in Titanic wai...
15795                                      Missed the play
23654    Oh I`m really tired of these migraines! #Endom...
Name: input_text, Length: 24732, dtype: object



14619    positive
25779    negative
6138     negative
17428    positive
18638    negative
           ...   
21575    positive
5390      neutral
860       neutral
15795    negative
23654    negative
Name: output_text, Length: 24732, dtype: object


In [6]:


# class for processing tweet datasets and turning it into usable data (Tokenize)
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, index):
        item = {key: val[index] for key, val in self.encodings.items()}

        # Tokenize labels separately and ensure they are padded
        label_encoding = tokenizer(
            self.labels[index],
            padding='max_length',  # Pad to max length
            truncation=True,
            max_length=50,  # You can set this to the maximum length you expect
            return_tensors='pt'
        )

        item['labels'] = label_encoding['input_ids'].squeeze()  # Squeeze to remove unnecessary dimensions
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetDataset(train_encodings, y_train.values)
val_dataset = TweetDataset(val_encodings, y_val.values)


# print("")
# print(train_dataset[390].get(0))
# print("")
# print(train_dataset[390].get(1))

# for idx in range(len(train_dataset)):
#     item = train_dataset[idx]  # Get item at index idx
#     input_ids = item['input_ids']
#     attention_mask = item['attention_mask']
#     label = item['labels']

#     print(f"Input IDs: {input_ids}")
#     print(f"Attention Mask: {attention_mask}")
#     print(f"Label: {label}")

# print("")        #check some of the values manually to make sure they are all right
# print(train_dataset[0])  # Check the first item
# print(train_dataset[1])  # Check the second item
# print("\n\n\n\nThe 390s start here\n\n\n\n")
# print(train_dataset[390])
# print("\n\n")
# print(train_dataset[391]) 
# print("\n\n")
# print(train_dataset[392]) 
# print("\n\n")
# print(train_dataset[393])
# print("\n\n")
# print(train_dataset[394]) 
# print("\n\n")
# print(train_dataset[395])
# print("\n\n")
# print(train_dataset[396]) 
# print("\n\n")
# print(train_dataset[397])
# print("\n\n")
# print(train_dataset[398]) 
# print("\n\n")
# print(train_dataset[399])
# print("\n\n")
# print(train_dataset[400])

In [8]:



model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


# Trains the data and then tests the trained model to get the best results
# train_dataset is the list/tensor data which contains 90% of the training data to train the model
# val_dataset is the list/tensor data whic contains th remaining 10% of the training data to train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# import time

# print("here")

# counter = 0
# for text, string in zip(train_dataset, texts):
#     if(counter == 9):
#         counter = 0
#         time.sleep(10)
#     print(text)
#     print(string)
#     counter += 1

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,67.0925
20,66.4206
30,65.8179
40,65.1951
50,66.0505
60,64.3055
70,64.8993
80,62.4708
90,63.182
100,59.5988


TrainOutput(global_step=9276, training_loss=2.275316172183434, metrics={'train_runtime': 26965.3442, 'train_samples_per_second': 2.752, 'train_steps_per_second': 0.344, 'total_flos': 3831163287552000.0, 'train_loss': 2.275316172183434, 'epoch': 3.0})

# Save Model and Tokenizer

In [10]:
trainer.save_model('./justTrainedModel')
tokenizer.save_pretrained('./JustTrainedTokenizer')

('./JustTrainedTokenizer\\tokenizer_config.json',
 './JustTrainedTokenizer\\special_tokens_map.json',
 './JustTrainedTokenizer\\spiece.model',
 './JustTrainedTokenizer\\added_tokens.json')

# Generate Predictions for Test Set and Examine Accuracy

In [12]:
# Compare predictions with true labels
true_labels = test_df['sentiment'].tolist()
print(true_labels)

['neutral', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'neutral', 'negative', 'negative', 'negative', 'neutral', 'negative', 'positive', 'neutral', 'neutral', 'positive', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'negative', 'negative', 'negative', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'negative', 'negative', 'neutral', 'neutral', 'negative', 'positive', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'positive', 'neutral', 'negative', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'positive', 'negative', 'negative', 'negative', 'positive', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neg

In [34]:
#maybe this version will work better

test_encodings = encode_data(test_df['text'])
test_dataset = TweetDataset(test_encodings, test_df['sentiment'])  # Tokenizing data

In [11]:
# Make predictions
predictions = trainer.predict(test_dataset)

# Decode the predictions
decoded_predictions = tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)

In [None]:
# Print the inputs and predictions
for text, predicted in zip(test_df['text'], decoded_predictions):
    print(f"Input: {text}\nPredicted: {predicted}\n")

In [None]:
#see how right/wrong it was
for text, actual, predicted in zip(test_df['text'], test_df['sentiment'], predictions):
    print(f"Input: {text}\nExpected: {actual}\nPredicted: {predicted}\n")

In [None]:
#Convert the predictions to actual langauge
predicted_texts = tokenizer.batch_decode(predictedSentiment.predictions, skip_special_tokens=True)

# Print the predictions
for text, pred in zip(test_texts, predictions):
    print(f"Input: {text}\nPrediction: {pred}\n")

# Code to Run Before Tweet Analysis

In [15]:
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Trainer, TrainingArguments
import torch
import pandas as pd
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load Previously Pretrained Models (If Model was Unloaded or Before Election Analysis)

In [18]:
# Load the model
trained_model = MT5ForConditionalGeneration.from_pretrained('./justTrainedModel')

# Load the tokenizer
trained_tokenizer = MT5Tokenizer.from_pretrained('./JustTrainedTokenizer')

# Running Model on Trump Election Tweets

In [19]:
election_tweets_df = pd.read_csv('C:\\Users\\rhunt\\Downloads\\trump_harris_tweets.csv')

In [20]:
trump_results = list()

for cell in election_tweets_df['trump']:
    sample_tweet = cell
    # Example: Let's generate text for a few sample tweets
    # sample_tweet = election_tweets_df['trump'][0]

    # Tokenize the input text
    input_ids = trained_tokenizer.encode(sample_tweet, return_tensors="pt")

    # Generate sentiments (in an encoded form)
    generated_ids = trained_model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

    # Decode the sentiments
    generated_text = trained_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # add sentiment results to list of sentiments
    trump_results.append(generated_text)

    print("Generated Text: ", generated_text)
print("\n All...\n ", trump_results)

Generated Text:  neutral
Generated Text:  neutral
Generated Text:  neutral
Generated Text:  neutral
Generated Text:  neutral
Generated Text:  neutral
Generated Text:  neutral
Generated Text:  neutral
Generated Text:  negative
Generated Text:  positive

 All...
  ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'positive']


# Print Trump Tweet Results 

In [49]:
# Print the inputs and predictions
for text, predicted in zip(election_tweets_df['trump'], trump_results):
    print(f"Input: {text}\nPredicted: {predicted}\n")

Input: #Trump is literally asleep at the RNC event. Asleep. #TrumpAssasinationAttempt #Trump2024 #Trump
Predicted: neutral

Input: So... You didn't know what 45-47 meant? Now you do! #Trump
Predicted: neutral

Input: Rep. Marjorie Taylor Greene (R-GA.) calls out Rep. Jamie Raskin (D-MD.) and the Democrats after a subcommittee hearing in which Dr. Anthony Fauci testified about the COVID response. Do you agree with Rep. Greene’s statement?(Video via @RepMTG) #OAN #Hearing #MTG #Trump #Floyd
Predicted: neutral

Input: Didn't like the way this @JDVance1 bloke (newly picked as #Trump's running mate) slagged off the UK lately. Looked into him. Not great. Made a parody song inspired by @ABBA called "Vance VP" Shout out #americast (@BBCsarahsmith/@awzurcher) & @RestIsPolitics for profiles
Predicted: neutral

Input: “I’m a fast healer…” Also, Trump explains that it only hit the “lobe” as he touches the top of his ear. #Trump #TrumpPressConference
Predicted: neutral

Input: Apology accepted in a

# Running Model on Harris Election Tweets

In [50]:
harris_results = list()

for cell in election_tweets_df['harris']:
    sample_tweet = cell
    # Example: Let's generate text for a few sample tweets
    # sample_tweet = election_tweets_df['trump'][0]

    # Step 1: Tokenize the input text
    input_ids = trained_tokenizer.encode(sample_tweet, return_tensors="pt")

    # Step 2: Use the model's generate function to generate text
    # You can set parameters such as max_length, num_beams, do_sample, etc., for controlling the output
    generated_ids = trained_model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

    # Step 3: Decode the generated tokens back into text
    generated_text = trained_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    harris_results.append(generated_text)

    print("Generated Text: ", generated_text)
print("\n All...\n ", harris_results)

Generated Text:  negative
Generated Text:  neutral
Generated Text:  negative
Generated Text:  positive
Generated Text:  positive
Generated Text:  neutral
Generated Text:  neutral
Generated Text:  positive
Generated Text:  neutral
Generated Text:  positive

 All...
  ['negative', 'neutral', 'negative', 'positive', 'positive', 'neutral', 'neutral', 'positive', 'neutral', 'positive']


# Print Harris Tweet Results

In [51]:
# Print the inputs and predictions
for text, predicted in zip(election_tweets_df['harris'], harris_results):
    print(f"Input: {text}\nPredicted: {predicted}\n")

Input: As Donald Trump gets attacked today for his comments at the National Association of Black Journalists meeting in Chicago over his confusion about whether or not Kamala Harris is black or Indian, keep in mind this evidence showing she has identified as Indian. #KamalaHarris
Predicted: negative

Input: I'm the real deal, and I'm done with the Democrats. #DNCConvention2024 #DNC2024 #DNC #JoeBiden #Trump #KamalaHarris #RFKJr #politics #Democrat #Republican
Predicted: neutral

Input: #MeghanMarkle pulled a #KamalaHarris and said I'm not your Black boogeywoman punching bag for y'all to clout chase and clickbait off me for sport! Nothing is more powerful than refusing to accept abuse in the name of status. Put some respect in her STILL royal name! #Megexit
Predicted: negative

Input: An update on my voice, and talking about Kamala. Exciting stuff! #KamalaHarris #2024election #HarrisForPresident #Harris #KamalaHarris2024 #Harris2024
Predicted: positive

Input: So this happened Kamala Ha

# Not run stuff/Useless Code

In [8]:
# clear up garbage (This is here bc I was trying to clean up
# garbage to train the model on my GPU, but it did not work
# since there was too much data for my GPU apparently)
# import gc
# gc.collect()

In [33]:
#test_encodings = encode_data(test_df['text'])
#test_dataset = TweetDataset(test_encodings, [""] * len(test_df))  # Placeholder for labels
# torch.cuda.is_available()
# print(test_df['sentiment'])
# print(test_dataset[0])
# print(test_dataset[0]['labels'])
# predictedSentiment = trainer.predict(test_dataset)

In [7]:
#Save the models together for ease of use?
# trainer.save_model('./modelMetadataAndStuff')
# tokenizer.save_pretrained('./modelMetadataAndStuff')

In [None]:
# Evaluaion mode does not work!!!!
# Set the model to evaluation mode
# trainer.eval()

# predictions = []
# with torch.no_grad():
#     for item in test_dataset:
#         input_ids = item['input_ids'].unsqueeze(0)  # Add batch dimension
#         attention_mask = item['attention_mask'].unsqueeze(0)  # Add batch dimension
        
#         outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
#         decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         predictions.append(decoded_output)

In [29]:
#print("Now predicting....")
#predictions = trainer.predict(test_dataset)

In [None]:
#eval_results = trainer.evaluate()

In [None]:
#trainer.save_model('./savedModel')
#tokenizer.save_pretrained('./savedPretrainedTokenizer')

In [None]:
#model.save_pretrained('./savedPretrainedModel')

In [None]:
#my modeling instance is trainer
#torch.save(model.state_dict(), './saved_model.pth')
#torch.save(model, './saved_model_full.pth')

In [None]:
#AutoTokenizer.from_pretrained("path/to/save/model")