<a href="https://colab.research.google.com/github/Kamaljit12/HuggingFace/blob/main/Emothion_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [115]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

## load the csv file
df = pd.read_csv("/content/combined_emotion.csv")

In [116]:
# shape of the data
df.shape

(422746, 2)

In [117]:
# info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422746 entries, 0 to 422745
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   sentence  422746 non-null  object
 1   emotion   422746 non-null  object
dtypes: object(2)
memory usage: 6.5+ MB


In [118]:
# check duplicates
df.duplicated().sum()

6623

In [119]:
# drop duplicated
df.drop_duplicates(inplace=True)

In [120]:
# check duplicates
df.duplicated().sum()

0

In [121]:
df = df.head(5000)

In [122]:
## clean the text: Remvoe special characters and lowercsting

def clean_text(text):
    # lower case of the text
    text = text.lower()
    # regex to remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text


In [123]:
# applied the above cleaning fucntion to clean data
df["sentence"] = df['sentence'].apply(clean_text)

In [124]:
df.head()

Unnamed: 0,sentence,emotion
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sad
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sad
4,i am a kindergarten teacher and i am thoroughl...,fear


In [125]:
# function to count word of the sentences
def text_counts(text):
  words = text.split(" ")
  word_count = len(words)
  return word_count

In [126]:
df["total_words"] = df["sentence"].apply(text_counts)

In [127]:
## maximum number of the sentences
df['total_words'].max()

65

In [128]:
## counts of emotion category
df["emotion"].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
joy,1704
sad,1434
anger,696
fear,596
love,382
suprise,188


In [129]:
# split the data for the training and for testing
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [179]:
train_data.loc[4193]['sentence']

'i feel about the monsters who did that to us i know that fanatics no matter how hateful amp murderous don t make an army'

In [105]:
# !pip install datasets

In [133]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

In [134]:
## initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Tokenize the input prompts and generated recipes
train_encodings = tokenizer(list(train_data['sentence']), truncation=True, padding=True, max_length=67)
train_labels = tokenizer(list(train_data['emotion']), truncation=True, padding=True, max_length=67)

test_encodings = tokenizer(list(test_data['sentence']), truncation=True, padding=True, max_length=67)
test_labels = tokenizer(list(test_data['emotion']), truncation=True, padding=True, max_length=67)



In [171]:
train_encodings

2

In [136]:

# Create a custom dataset for use in the Trainer
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels['input_ids']
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels['input_ids']
})


In [139]:
# Set up training arguments with validation logging
training_args = TrainingArguments(
    output_dir='./results',  # Output directory
    num_train_epochs=5,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Warmup steps
    weight_decay=0.01,  # Weight decay
    logging_dir='./logs',  # Logging directory
    logging_steps=10,
    evaluation_strategy="steps",  # Evaluate during training
    eval_steps=500,  # Evaluate every 500 steps
    save_steps=1000,  # Save model checkpoints every 1000 steps
    load_best_model_at_end=True,  # Load the best model when finished
)

# Initialize Trainer
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
)

# Fine-tune the model
trainer.train()



Step,Training Loss,Validation Loss
500,0.2115,0.168125
1000,0.1381,0.140139
1500,0.1399,0.114296
2000,0.1108,0.104147
2500,0.0837,0.101532


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2500, training_loss=0.15541006350517272, metrics={'train_runtime': 259.4703, 'train_samples_per_second': 77.08, 'train_steps_per_second': 9.635, 'total_flos': 354214871040000.0, 'train_loss': 0.15541006350517272, 'epoch': 5.0})

In [140]:

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_t5_emotion_classifier')
tokenizer.save_pretrained('./fine_tuned_t5_emotion_classifier')

('./fine_tuned_t5_emotion_classifier/tokenizer_config.json',
 './fine_tuned_t5_emotion_classifier/special_tokens_map.json',
 './fine_tuned_t5_emotion_classifier/spiece.model',
 './fine_tuned_t5_emotion_classifier/added_tokens.json')

## Emotion Classifier

In [141]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

## laod the model and tokernizer

mdoel = T5ForConditionalGeneration.from_pretrained("/content/fine_tuned_t5_emotion_classifier")
tokenizer = T5Tokenizer.from_pretrained("/content/fine_tuned_t5_emotion_classifier")

In [144]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [145]:
device

device(type='cuda')

In [147]:
# model.to(device)

In [165]:
# Function to generate a recipe from a prompt
def generate_emotion(input, model, tokenizer, max_length):
    prompt = clean_text(input)  # Clean the input prompt
    inputs = tokenizer(input, return_tensors="pt", padding=True, truncation=True, max_length=67)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate the recipe
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)

    return tokenizer.decode(output[0], skip_special_tokens=True)





In [180]:
# Test the recipe generation
sentence = "i feel about the monsters who did that to us i know that fanatics no matter how hateful amp murderous don t make an army"
result = generate_emotion(input=sentence, model=model, tokenizer=tokenizer, max_length=67)
print(result)


anger
