In [None]:
!pip install pandas transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Load datasets
train_data = pd.read_csv('twitter_training.csv')
valid_data = pd.read_csv('twitter_validation.csv')

# Map text labels to the integer class ids used as training targets
label_mapping = {'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}


def _prepare_tweets(frame, split_name):
    """Return a cleaned copy of ``frame`` with an integer ``labels`` column.

    Args:
        frame: DataFrame holding at least the ``tweet`` and ``true_label`` columns.
        split_name: Human-readable name of the split, used only in error messages.

    Returns:
        A new DataFrame without rows missing ``tweet`` or ``true_label`` and
        with ``labels`` holding the ``label_mapping`` id of each row.

    Raises:
        ValueError: If a ``true_label`` value is not a key of ``label_mapping``.
    """
    # .copy() so the column assignments below never write into a view of the raw frame.
    prepared = frame.dropna(subset=['tweet', 'true_label']).copy()
    prepared['labels'] = prepared['true_label'].map(label_mapping)

    # Series.map yields NaN for unknown keys; fail loudly instead of training on NaN targets.
    unmapped = prepared['labels'].isna()
    if unmapped.any():
        unknown = sorted(prepared.loc[unmapped, 'true_label'].astype(str).unique())
        raise ValueError(f"{split_name}: labels missing from label_mapping: {unknown}")

    prepared['labels'] = prepared['labels'].astype('int64')
    return prepared


train_data = _prepare_tweets(train_data, 'twitter_training.csv')
valid_data = _prepare_tweets(valid_data, 'twitter_validation.csv')

class TweetDataset(Dataset):
    """Map-style dataset that pairs raw texts with integer class labels.

    Tokenization happens lazily, one example at a time, when an item is
    requested. Every item is padded/truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One example per stored text.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for ``idx``."""
        text, label = self.texts[idx], self.labels[idx]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)
        # The tokenizer output is flattened to 1-D so the DataLoader can stack items.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Setup tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create datasets
train_dataset = TweetDataset(train_data['tweet'].tolist(), train_data['labels'].tolist(), tokenizer)
valid_dataset = TweetDataset(valid_data['tweet'].tolist(), valid_data['labels'].tolist(), tokenizer)

# Create data loaders (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

# Load model; the classification head is sized from the label mapping
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_mapping))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Optimizer and scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values transformers.AdamW used by default so
# the optimisation hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, eps=1e-6, weight_decay=0.0)
num_epochs = 3
# The linear schedule decays the learning rate over every optimizer step of the run.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train()
for epoch in range(num_epochs):
    # Accumulate the loss over the whole epoch: the last batch alone is a
    # noisy (and, for an empty loader, undefined) summary of the epoch.
    epoch_loss_sum = 0.0
    batches_seen = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        # detach() keeps the running sum out of the autograd graph.
        epoch_loss_sum = epoch_loss_sum + loss.detach()
        batches_seen += 1
    # The reported value is the mean training loss of the epoch.
    mean_loss = float(epoch_loss_sum) / batches_seen if batches_seen else float('nan')
    print(f'Epoch {epoch+1}, Loss: {mean_loss}')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 1.179345965385437
Epoch 2, Loss: 0.5069732666015625
Epoch 3, Loss: 0.44438958168029785


In [None]:
import torch

In [None]:
# Persist only the learned weights (the state dict) of the trained model
model_save_path = "twitter_sentiment_model.pth"
torch.save(obj=model.state_dict(), f=model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to twitter_sentiment_model.pth


In [None]:
# Validation function
def evaluate(model, data_loader):
    """Print a per-class classification report and the accuracy of ``model``.

    Args:
        model: Sequence-classification model; called as ``model(**batch)`` and
            expected to expose ``.logits`` on its output.
        data_loader: Iterable of dict batches of tensors that include a
            ``labels`` entry.

    Side effects:
        Switches ``model`` to eval mode and prints the metrics to stdout.
    """
    # Follow the model's own device rather than a notebook-global variable, so
    # the function keeps working after the model is reloaded on another device.
    model_device = next(model.parameters()).device
    class_names = list(label_mapping.keys())
    class_ids = list(label_mapping.values())

    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(model_device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            true_labels.extend(batch['labels'].cpu().tolist())

    # Explicit `labels` keeps the report well-defined even when a class does
    # not occur in this split.
    print(classification_report(true_labels, predictions, labels=class_ids,
                                target_names=class_names, digits=4))
    print("Accuracy:", accuracy_score(true_labels, predictions))

# Score the fine-tuned model on the held-out validation tweets
evaluate(model=model, data_loader=valid_loader)

              precision    recall  f1-score   support

  Irrelevant     0.8790    0.8023    0.8389       172
    Negative     0.9318    0.9248    0.9283       266
     Neutral     0.8702    0.8000    0.8336       285
    Positive     0.8423    0.9639    0.8990       277

    accuracy                         0.8790      1000
   macro avg     0.8808    0.8728    0.8750      1000
weighted avg     0.8804    0.8790    0.8778      1000

Accuracy: 0.879


In [None]:
import pickle
import torch
from transformers import DistilBertForSequenceClassification, RobertaForSequenceClassification


In [None]:
# Save the fine-tuned RoBERTa tweet model weights as a pickle
tweet_model_path = 'tweet_model.pkl'
# Copy every tensor to CPU first: a plain pickle of CUDA tensors can only be
# unpickled on a machine that has a GPU.
cpu_state_dict = {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()}
with open(tweet_model_path, 'wb') as f:
    pickle.dump(cpu_state_dict, f)

In [None]:
# Choose the GPU when one is visible, then put the model in inference mode on it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.eval()
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
from torch.nn.functional import softmax


In [None]:
# Function to load the model
def load_model(model_path, model_class, tokenizer_class, pretrained_model_name, num_labels, device):
    """Rebuild a fine-tuned classifier from a saved state dict.

    Args:
        model_path: Path of the file written with ``torch.save(state_dict)``.
        model_class: Class exposing ``from_pretrained(name, num_labels=...)``.
        tokenizer_class: Class exposing ``from_pretrained(name)``.
        pretrained_model_name: Name of the base checkpoint.
        num_labels: Size of the classification head.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode on ``device``.
    """
    restored = model_class.from_pretrained(pretrained_model_name, num_labels=num_labels)
    saved_weights = torch.load(model_path, map_location=device)
    restored.load_state_dict(saved_weights)
    restored.to(device)
    restored.eval()
    return restored, tokenizer_class.from_pretrained(pretrained_model_name)

# Restore the fine-tuned tweet classifier from the saved state dict
model_path = "twitter_sentiment_model.pth"
num_labels = 4
model, tokenizer = load_model(
    model_path=model_path,
    model_class=RobertaForSequenceClassification,
    tokenizer_class=RobertaTokenizer,
    pretrained_model_name='roberta-base',
    num_labels=len(label_mapping),
    device=device,
)

# Function to predict sentiment for a given sample text
def predict_sentiment(sample_text, model, tokenizer, max_len=128):
    """Classify one text and report the winning class with its probability.

    Args:
        sample_text: Text to classify.
        model: Classifier exposing ``.device`` and returning an object with ``.logits``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length the text is padded/truncated to.

    Returns:
        ``(label, confidence)`` where ``label`` is one of 'Irrelevant',
        'Negative', 'Neutral', 'Positive' and ``confidence`` is the softmax
        probability of that class.
    """
    encoder_options = {
        'add_special_tokens': True,
        'max_length': max_len,
        'return_token_type_ids': False,
        'padding': 'max_length',
        'return_attention_mask': True,
        'truncation': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(sample_text, **encoder_options)
    ids = encoded['input_ids'].to(model.device)
    mask = encoded['attention_mask'].to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(ids, attention_mask=mask).logits

    class_probs = torch.nn.functional.softmax(logits, dim=-1)
    best_class = torch.argmax(class_probs, dim=-1).cpu().numpy()[0]
    best_prob = torch.max(class_probs, dim=-1).values.cpu().numpy()[0]

    id_to_sentiment = {0: 'Irrelevant', 1: 'Negative', 2: 'Neutral', 3: 'Positive'}
    return id_to_sentiment[best_class], best_prob

# Try the restored tweet classifier on a single hand-written sentence
sample_text = "I really enjoy using this new product. It works wonders!"
predicted_sentiment, confidence_score = predict_sentiment(
    sample_text=sample_text, model=model, tokenizer=tokenizer
)
print(
    f"The predicted sentiment for the text is: {predicted_sentiment} with a confidence score of {confidence_score:.2f}"
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The predicted sentiment for the text is: Positive with a confidence score of 0.84


In [None]:
import pandas as pd

# Load the data from the Excel file
file_path = 'Reviews(3).xlsx'
data = pd.read_excel(file_path)

# Print the column summary, then leave the preview as the cell's last
# expression. DataFrame.info() prints and returns None, so putting both in a
# tuple showed a stray None and lost the rich table display.
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20978 entries, 0 to 20977
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Id                      20978 non-null  int64 
 1   ProductId               20978 non-null  object
 2   UserId                  20978 non-null  object
 3   ProfileName             20973 non-null  object
 4   HelpfulnessNumerator    20978 non-null  int64 
 5   HelpfulnessDenominator  20978 non-null  int64 
 6   Score                   20978 non-null  int64 
 7   Time                    20978 non-null  int64 
 8   Summary                 20978 non-null  object
 9   Text                    20978 non-null  object
dtypes: int64(5), object(5)
memory usage: 1.6+ MB


(   Id   ProductId          UserId                      ProfileName  \
 0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
 1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
 2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
 3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
 4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   
 
    HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
 0                     1                       1      5  1303862400   
 1                     0                       0      1  1346976000   
 2                     1                       1      4  1219017600   
 3                     3                       3      2  1307923200   
 4                     0                       0      5  1350777600   
 
                  Summary                                               Text  
 0  Good Quality Dog Food  I have bought several of the Vitality 

In [None]:
# Keep only the rows in which every column has a value
data_cleaned = data.dropna(axis=0, how='any')

# Categorize the 'Score' field
def categorize_score(score):
    """Translate a review score into one of the four sentiment class names.

    4 or 5 -> 'Positive', 3 -> 'Neutral', 2 -> 'Negative', and every other
    value (including a score of 1) -> 'Irrelevant'.
    """
    if score in (4, 5):
        return 'Positive'
    if score == 3:
        return 'Neutral'
    return 'Negative' if score == 2 else 'Irrelevant'

# Derive the label column with .assign(), which returns a new DataFrame.
# Writing the column into the dropna() result in place is what raised
# SettingWithCopyWarning. Only the model input and its label are kept.
data_cleaned = data_cleaned.assign(
    Sentiment=data_cleaned['Score'].apply(categorize_score)
)[['Text', 'Sentiment']]

# Display the updated data and check the distribution of the categories
data_cleaned.head(), data_cleaned['Sentiment'].value_counts()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['Sentiment'] = data_cleaned['Score'].apply(categorize_score)


(                                                Text   Sentiment
 0  I have bought several of the Vitality canned d...    Positive
 1  Product arrived labeled as Jumbo Salted Peanut...  Irrelevant
 2  This is a confection that has been around a fe...    Positive
 3  If you are looking for the secret ingredient i...    Negative
 4  Great taffy at a great price.  There was a wid...    Positive,
 Sentiment
 Positive      16135
 Irrelevant     1914
 Neutral        1750
 Negative       1174
 Name: count, dtype: int64)

In [None]:
!pip install datasets




In [None]:
!pip install accelerate==0.30.0
!pip install transformers[torch]



In [None]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'Text' column of a batch, padded/truncated to 128 tokens."""
    return tokenizer(
        examples['Text'],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Wrap the cleaned frame as a Hugging Face dataset and tokenize it in batches
data_set = Dataset.from_pandas(data_cleaned).map(tokenize_function, batched=True)

# 80/10/10 split: hold out 20% first, then halve the held-out part
train_val_test = data_set.train_test_split(test_size=0.2, seed=42)
holdout_halves = train_val_test['test'].train_test_split(test_size=0.5, seed=42)
train_set = train_val_test['train']
val_set = holdout_halves['train']
test_set = holdout_halves['test']

# Integer class id for every sentiment name
label_dict = {'Positive': 0, 'Neutral': 1, 'Negative': 2, 'Irrelevant': 3}

# Batched mapper used on all three splits
def convert_to_labels(batch):
    """Add a 'labels' list with the ``label_dict`` id of each 'Sentiment' entry.

    The batch is modified in place and returned. A sentiment name that is not
    a key of ``label_dict`` raises ``KeyError``.
    """
    batch['labels'] = list(map(label_dict.__getitem__, batch['Sentiment']))
    return batch

# Attach the integer 'labels' column to every split
train_set = train_set.map(convert_to_labels, batched=True)
val_set = val_set.map(convert_to_labels, batched=True)
test_set = test_set.map(convert_to_labels, batched=True)

# Initialize the DistilBERT model for sequence classification; the head is
# sized from label_dict so the two cannot drift apart
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_dict))





Map:   0%|          | 0/20973 [00:00<?, ? examples/s]

Map:   0%|          | 0/16778 [00:00<?, ? examples/s]

Map:   0%|          | 0/2097 [00:00<?, ? examples/s]

Map:   0%|          | 0/2098 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import dataclasses

# Define training arguments
# Newer transformers releases call the evaluation schedule `eval_strategy`
# (the older `evaluation_strategy` spelling is deprecated). Use whichever
# field the installed TrainingArguments dataclass actually defines.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate on the validation split at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)


In [None]:
import numpy as np

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import EvalPrediction

# Metric callback handed to the Trainer
def compute_metrics(p: EvalPrediction):
    """Return accuracy and support-weighted precision/recall/F1 for an evaluation run.

    ``p.predictions`` is reduced to class ids with an argmax over axis 1 and
    compared with ``p.label_ids``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    gold_ids = p.label_ids
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        gold_ids, predicted_ids, average='weighted'
    )
    metrics = {'accuracy': accuracy_score(gold_ids, predicted_ids)}
    metrics['f1'] = weighted_f1
    metrics['precision'] = weighted_precision
    metrics['recall'] = weighted_recall
    return metrics

# Build the Trainer with the metric callback attached
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

# Run fine-tuning; the returned TrainOutput is the cell's displayed value
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.431,0.502982,0.824988,0.800985,0.800077,0.824988
2,0.4156,0.443517,0.837387,0.83448,0.832244,0.837387
3,0.2149,0.54088,0.839294,0.835626,0.83278,0.839294


TrainOutput(global_step=3147, training_loss=0.3818666279259128, metrics={'train_runtime': 609.7105, 'train_samples_per_second': 82.554, 'train_steps_per_second': 5.161, 'total_flos': 1666962964703232.0, 'train_loss': 0.3818666279259128, 'epoch': 3.0})

In [None]:
!pip install torch

In [None]:
from torch.nn.functional import softmax
import torch

In [None]:

# Persist only the learned weights (the state dict) of the review classifier
model_save_path = "Prod_sentiment_analysis_model.pth"
torch.save(obj=model.state_dict(), f=model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to Prod_sentiment_analysis_model.pth


In [None]:
import pickle
import torch
from transformers import DistilBertForSequenceClassification, RobertaForSequenceClassification


In [None]:
# Save the DistilBERT product-review model weights as a pickle
Prod_model_path = 'Prod_model.pkl'
# Copy every tensor to CPU first: a plain pickle of CUDA tensors can only be
# unpickled on a machine that has a GPU.
cpu_state_dict = {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()}
with open(Prod_model_path, 'wb') as f:
    pickle.dump(cpu_state_dict, f)

In [None]:

# Score the fine-tuned model on the held-out test split
test_results = trainer.evaluate(eval_dataset=test_set)

# Show the metric dictionary returned by the Trainer
print(test_results)

{'eval_loss': 0.49358344078063965, 'eval_accuracy': 0.8503336510962822, 'eval_f1': 0.847066261313686, 'eval_precision': 0.8445302921743463, 'eval_recall': 0.8503336510962822, 'eval_runtime': 7.8172, 'eval_samples_per_second': 268.384, 'eval_steps_per_second': 16.886, 'epoch': 3.0}


In [None]:
# Function to load the model
def load_model(model_path, model_class, tokenizer_class, pretrained_model_name, num_labels, device=None):
    """Rebuild a fine-tuned classifier from a saved state dict.

    Args:
        model_path: Path of the file written with ``torch.save(state_dict)``.
        model_class: Class exposing ``from_pretrained(name, num_labels=...)``.
        tokenizer_class: Class exposing ``from_pretrained(name)``.
        pretrained_model_name: Name of the base checkpoint.
        num_labels: Size of the classification head.
        device: Optional device to place the model on. When omitted the model
            stays where ``from_pretrained`` created it.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode.
    """
    model = model_class.from_pretrained(pretrained_model_name, num_labels=num_labels)
    # map_location lets a checkpoint written on a GPU machine load on a
    # CPU-only host; without it torch.load tries to restore onto the saving device.
    state_dict = torch.load(model_path, map_location=device if device is not None else 'cpu')
    model.load_state_dict(state_dict)
    if device is not None:
        model.to(device)
    tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
    model.eval()
    return model, tokenizer

# Restore the review classifier; the head size must match the one used in training
model_path = "Prod_sentiment_analysis_model.pth"
num_labels = 4  # Ensure this matches the number of labels used during training
model, tokenizer = load_model(
    model_path,
    model_class=DistilBertForSequenceClassification,
    tokenizer_class=DistilBertTokenizerFast,
    pretrained_model_name='distilbert-base-uncased',
    num_labels=num_labels,
)

# Function to predict sentiment for a given sample text
def predict_sentiment(sample_text, model, tokenizer, max_len=128):
    """Classify one review text and report the winning class with its probability.

    Args:
        sample_text: Text to classify.
        model: Classifier exposing ``.device`` and returning an object with ``.logits``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length the text is padded/truncated to.

    Returns:
        ``(label, confidence)`` where ``label`` is one of 'Positive',
        'Neutral', 'Negative', 'Irrelevant' and ``confidence`` is the softmax
        probability of that class.
    """
    features = tokenizer.encode_plus(
        sample_text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt',
    )
    token_ids = features['input_ids'].to(model.device)
    token_mask = features['attention_mask'].to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        raw_scores = model(token_ids, attention_mask=token_mask).logits

    distribution = torch.nn.functional.softmax(raw_scores, dim=-1)
    winner = torch.argmax(distribution, dim=-1).cpu().numpy()[0]
    winner_prob = torch.max(distribution, dim=-1).values.cpu().numpy()[0]

    names_by_id = {0: 'Positive', 1: 'Neutral', 2: 'Negative', 3: 'Irrelevant'}
    return names_by_id[winner], winner_prob

# Try the restored review classifier on a single hand-written sentence
sample_text = "I really enjoy using this new product. It works wonders!"
predicted_sentiment, confidence_score = predict_sentiment(
    sample_text=sample_text, model=model, tokenizer=tokenizer
)
print(
    f"The predicted sentiment for the text is: {predicted_sentiment} with a confidence score of {confidence_score:.2f}"
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The predicted sentiment for the text is: Positive with a confidence score of 1.00


In [None]:
!pip install transformers
!pip install scikit-learn
!pip install -U transformers




In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset

# Fetch the "emotion" dataset from the Hugging Face hub
dataset = load_dataset('emotion')

# Class names come from the dataset's own label feature; each name is paired
# with its position in that list
label_names = dataset['train'].features['label'].names
label_mapping = dict(zip(label_names, range(len(label_names))))

# Show the mapping so it can be checked by eye
print("Label Mapping:", label_mapping)

class EmotionDataset(Dataset):
    """Map-style dataset yielding tokenized texts together with their class ids.

    Texts are tokenized on access and padded/truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # Dataset size equals the number of stored texts.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        # Flatten the tokenizer tensors to 1-D so the DataLoader can stack items.
        token_ids = encoded['input_ids'].flatten()
        token_mask = encoded['attention_mask'].flatten()
        target = torch.tensor(sample_label, dtype=torch.long)
        return dict(input_ids=token_ids, attention_mask=token_mask, labels=target)

# Initialize the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Create datasets for training, validation, and testing
train_texts = dataset['train']['text']
train_labels = [int(label) for label in dataset['train']['label']]
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)

val_texts = dataset['validation']['text']
val_labels = [int(label) for label in dataset['validation']['label']]
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)

test_texts = dataset['test']['text']
test_labels = [int(label) for label in dataset['test']['label']]
test_dataset = EmotionDataset(test_texts, test_labels, tokenizer)

# DataLoaders for training, validation, and testing (only training is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load the DistilBERT model with one output per emotion class
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_names))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Optimizer and learning rate scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values transformers.AdamW used by default so
# the optimisation hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
# The linear schedule decays the learning rate over every optimizer step of the run.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train()
for epoch in range(num_epochs):
    # Accumulate the loss over the whole epoch: the last batch alone is a
    # noisy (and, for an empty loader, undefined) summary of the epoch.
    epoch_loss_sum = 0.0
    batches_seen = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        # detach() keeps the running sum out of the autograd graph.
        epoch_loss_sum = epoch_loss_sum + loss.detach()
        batches_seen += 1
    # The reported value is the mean training loss of the epoch.
    mean_loss = float(epoch_loss_sum) / batches_seen if batches_seen else float('nan')
    print(f'Epoch {epoch+1}, Loss: {mean_loss}')

# Save the trained model (weights only)
model_save_path = "emotion_detection_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Label Mapping: {'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.10715717822313309
Epoch 2, Loss: 0.07152421027421951
Epoch 3, Loss: 0.03477538377046585
Model saved to emotion_detection_model.pth


In [None]:
# Save the DistilBERT emotion model weights as a pickle
emotion_model_path = 'emotion_model.pkl'
# Copy every tensor to CPU first: a plain pickle of CUDA tensors can only be
# unpickled on a machine that has a GPU.
cpu_state_dict = {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()}
with open(emotion_model_path, 'wb') as f:
    pickle.dump(cpu_state_dict, f)

In [None]:
# Function to evaluate the model
def evaluate(model, data_loader):
    """Print a per-emotion classification report and the accuracy of ``model``.

    Args:
        model: Sequence-classification model; called as ``model(**batch)`` and
            expected to expose ``.logits`` on its output.
        data_loader: Iterable of dict batches of tensors that include a
            ``labels`` entry.

    Side effects:
        Switches ``model`` to eval mode and prints the metrics to stdout.
    """
    # Follow the model's own device rather than a notebook-global variable, so
    # the function keeps working after the model is reloaded on another device.
    model_device = next(model.parameters()).device
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(model_device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(batch['labels'].cpu().tolist())
    # Explicit `labels` keeps the report well-defined even when a class does
    # not occur in this split.
    print(classification_report(true_labels, predictions, labels=list(range(len(label_names))),
                                target_names=label_names, digits=4))
    print("Accuracy:", accuracy_score(true_labels, predictions))

# Report metrics for the validation split first, then for the test split
for split_title, split_loader in (("Validation Results:", val_loader), ("Test Results:", test_loader)):
    print(split_title)
    evaluate(model, split_loader)

Validation Results:
              precision    recall  f1-score   support

     sadness     0.9623    0.9745    0.9684       550
         joy     0.9680    0.9460    0.9569       704
        love     0.8469    0.9326    0.8877       178
       anger     0.9624    0.9309    0.9464       275
        fear     0.9190    0.9104    0.9147       212
    surprise     0.8434    0.8642    0.8537        81

    accuracy                         0.9435      2000
   macro avg     0.9170    0.9264    0.9213      2000
weighted avg     0.9447    0.9435    0.9438      2000

Accuracy: 0.9435
Test Results:
              precision    recall  f1-score   support

     sadness     0.9610    0.9759    0.9684       581
         joy     0.9701    0.9338    0.9516       695
        love     0.7935    0.9182    0.8513       159
       anger     0.9542    0.9091    0.9311       275
        fear     0.8973    0.8973    0.8973       224
    surprise     0.7183    0.7727    0.7445        66

    accuracy              

In [None]:
# Function to load the model
def load_model(model_path, model_class, tokenizer_class, pretrained_model_name, num_labels, device=None):
    """Rebuild a fine-tuned classifier from a saved state dict.

    Args:
        model_path: Path of the file written with ``torch.save(state_dict)``.
        model_class: Class exposing ``from_pretrained(name, num_labels=...)``.
        tokenizer_class: Class exposing ``from_pretrained(name)``.
        pretrained_model_name: Name of the base checkpoint.
        num_labels: Size of the classification head.
        device: Optional device to place the model on. When omitted the model
            stays where ``from_pretrained`` created it.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode.
    """
    model = model_class.from_pretrained(pretrained_model_name, num_labels=num_labels)
    # map_location lets a checkpoint written on a GPU machine load on a
    # CPU-only host; without it torch.load tries to restore onto the saving device.
    state_dict = torch.load(model_path, map_location=device if device is not None else 'cpu')
    model.load_state_dict(state_dict)
    if device is not None:
        model.to(device)
    tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
    model.eval()
    return model, tokenizer

# Restore the emotion classifier from the saved state dict
model_path = "emotion_detection_model.pth"
num_labels=6
model, tokenizer = load_model(
    model_path,
    model_class=DistilBertForSequenceClassification,
    tokenizer_class=DistilBertTokenizerFast,
    pretrained_model_name='distilbert-base-uncased',
    num_labels=num_labels,
)

# Function to predict emotion for a given sample text
def predict_emotion(sample_text, model, tokenizer, max_len=128):
    """Classify one text and report the winning emotion with its probability.

    Args:
        sample_text: Text to classify.
        model: Classifier exposing ``.device`` and returning an object with ``.logits``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length the text is padded/truncated to.

    Returns:
        ``(label, confidence)`` where ``label`` is looked up in the
        notebook-level ``label_names`` list and ``confidence`` is the softmax
        probability of that class.
    """
    encoder_options = {
        'add_special_tokens': True,
        'max_length': max_len,
        'return_token_type_ids': False,
        'padding': 'max_length',
        'return_attention_mask': True,
        'truncation': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(sample_text, **encoder_options)
    ids = encoded['input_ids'].to(model.device)
    mask = encoded['attention_mask'].to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(ids, attention_mask=mask).logits

    class_probs = torch.nn.functional.softmax(logits, dim=-1)
    best_class = torch.argmax(class_probs, dim=-1).cpu().numpy()[0]
    best_prob = torch.max(class_probs, dim=-1).values.cpu().numpy()[0]
    return label_names[best_class], best_prob

# Try the restored emotion classifier on a single hand-written sentence
sample_text = "I'm feeling great today!"
predicted_emotion, confidence_score = predict_emotion(
    sample_text=sample_text, model=model, tokenizer=tokenizer
)
print(
    f"The predicted emotion for the text is: {predicted_emotion} with a confidence score of {confidence_score:.2f}"
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The predicted emotion for the text is: joy with a confidence score of 1.00


In [None]:
# Evaluation function
def evaluate(model, data_loader):
    """Print a per-emotion classification report and the accuracy of ``model``.

    Args:
        model: Sequence-classification model; called as ``model(**batch)`` and
            expected to expose ``.logits`` on its output.
        data_loader: Iterable of dict batches of tensors that include a
            ``labels`` entry.

    Side effects:
        Switches ``model`` to eval mode and prints the metrics to stdout.
    """
    # Move batches to wherever the model actually lives. The model reloaded by
    # load_model() is not moved to the notebook-global `device`, so sending the
    # batches there would mix CPU weights with GPU inputs on a CUDA machine.
    model_device = next(model.parameters()).device
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(model_device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            true_labels.extend(batch['labels'].cpu().tolist())

    # Explicit `labels` keeps the report well-defined even when a class does
    # not occur in this split.
    print(classification_report(true_labels, predictions, labels=list(range(len(label_names))),
                                target_names=label_names, digits=4))
    print("Accuracy:", accuracy_score(true_labels, predictions))

# Report metrics for the validation split first, then for the test split
for split_title, split_loader in (
    ("Validation Results:", val_loader),
    ("Test Results:", test_loader),
):
    print(split_title)
    evaluate(model, split_loader)



In [None]:
!pip install flask-ngrok
!pip install transformers
!pip install torch
!pip install pyngrok


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [None]:
from flask import Flask, request, jsonify, render_template_string
from flask_ngrok import run_with_ngrok
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, RobertaTokenizerFast, RobertaForSequenceClassification
from torch.nn.functional import softmax
import pickle

app = Flask(__name__)
run_with_ngrok(app)

# Prefer the GPU when one is present, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_distilbert_classifier(weights_path, num_labels):
    """Build a DistilBERT classifier, restore pickled weights, and ready it for inference.

    Args:
        weights_path: Path of a pickle file holding the model's state dict.
        num_labels: Number of output classes of the classification head.

    Returns:
        ``(model, tokenizer)`` with the model moved to ``device`` and in eval mode.
    """
    classifier = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files here.
    with open(weights_path, 'rb') as weights_file:
        classifier.load_state_dict(pickle.load(weights_file))
    paired_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    classifier.to(device)
    classifier.eval()
    return classifier, paired_tokenizer


# Emotion model and tokenizer (6 classes)
emotion_model, emotion_tokenizer = _load_distilbert_classifier('emotion_model.pkl', 6)

# Review model and tokenizer (4 classes)
review_model, review_tokenizer = _load_distilbert_classifier('Prod_model.pkl', 4)

# Tweet model and tokenizer (4 classes) -- currently disabled
# tweet_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
# with open('tweet_model.pkl', 'rb') as f:
#     tweet_model.load_state_dict(pickle.load(f))
# tweet_model_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
# tweet_model.to(device)
# tweet_model.eval()

# Class index -> sentiment label (4 classes)
# NOTE(review): the tweet training cell at the top of the notebook maps
# Irrelevant=0, Negative=1, Neutral=2, Positive=3, i.e. the reverse of this
# table -- confirm this order matches how Prod_model.pkl was trained.
sentiment_labels = {
    0: 'Positive',
    1: 'Neutral',
    2: 'Negative',
    3: 'Irrelevant',
}

# Class index -> emotion label (6 classes)
emotion_labels = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise',
}

def predict_sentiment(model, tokenizer, text, labels_mapping):
    """Run one text through a classifier and return ``(label, confidence)``.

    Args:
        model: Sequence-classification model to query.
        tokenizer: Tokenizer exposing ``encode_plus``.
        text: Text to classify.
        labels_mapping: Lookup from predicted class index to label.

    Returns:
        The label mapped from the highest-probability class, and the largest
        softmax probability as a Python float.
    """
    # Encode to a fixed length of 128 tokens (padded / truncated)
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt',
    )

    # Inputs go to the module-level `device`; no gradients are needed for inference
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        ).logits

    # Logits -> probabilities -> winning class index and its probability
    probs = softmax(logits, dim=-1)
    class_index = torch.argmax(probs, dim=-1).item()
    top_prob = torch.max(probs).item()

    return labels_mapping[class_index], top_prob

@app.route('/')
def home():
    """Serve the single-page UI: a text form whose script posts to /predict and shows the result."""
    page_source = '''
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Sentiment Analysis</title>
            <style>
                body { font-family: Arial, sans-serif; margin: 20px; }
                h1 { color: #333; }
                form { margin-bottom: 20px; }
                textarea { width: 100%; height: 100px; padding: 10px; font-size: 16px; }
                input[type="submit"] { padding: 10px 20px; font-size: 16px; }
                #result { margin-top: 20px; padding: 10px; border: 1px solid #ccc; }
            </style>
        </head>
        <body>
            <h1>Sentiment Analysis</h1>
            <form action="/predict" method="post">
                <textarea name="text" placeholder="Enter your text here..."></textarea><br>
                <input type="submit" value="Analyze">
            </form>
            <div id="result"></div>
            <script>
                document.querySelector('form').addEventListener('submit', function(event) {
                    event.preventDefault();
                    var text = document.querySelector('textarea').value;
                    fetch('/predict', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
                        body: 'text=' + encodeURIComponent(text)
                    }).then(response => response.json())
                      .then(data => {
                          document.getElementById('result').innerText = 'Sentiment: ' + data.sentiment + ', Score: ' + data.score;
                      });
                });
            </script>
        </body>
        </html>
    '''
    return render_template_string(page_source)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the submitted text and return the most confident prediction as JSON.

    Reads the ``text`` form field, runs it through the emotion model and the
    review model, and responds with ``{'sentiment': <label>, 'score': <confidence>}``
    taken from whichever model reported the higher confidence. The emotion
    result is used only when its score is strictly higher; otherwise the
    review result is returned.
    """
    text = request.form['text']

    emotion_pred, emotion_score = predict_sentiment(emotion_model, emotion_tokenizer, text, emotion_labels)
    review_pred, review_score = predict_sentiment(review_model, review_tokenizer, text, sentiment_labels)

    # The tweet model is disabled (its loading code is commented out), so it must
    # not take part in the comparison: referencing `tweet_score` here raised a
    # NameError on every request. If the tweet model is re-enabled, compute its
    # prediction with predict_sentiment and add it to this comparison.
    if emotion_score > review_score:
        final_pred, final_score = emotion_pred, emotion_score
    else:
        final_pred, final_score = review_pred, review_score

    return jsonify({'sentiment': final_pred, 'score': final_score})

# Start serving the Flask app when this code runs as the main module
# (which is also the case when the cell is executed in a notebook kernel).
if __name__ == "__main__":
    app.run()


In [None]:
# NOTE(review): the output recorded for this cell shows /content/app.py was missing.
# The Flask code above is defined in a notebook cell and is not shown being written
# to app.py in this part of the notebook -- confirm whether the file should be
# created first (e.g. with %%writefile) or whether this cell is obsolete.
!python app.py


python3: can't open file '/content/app.py': [Errno 2] No such file or directory
