In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/final-dataset/Datathon_TrainData_org.csv


In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re



In [3]:
# Load your dataset
data = pd.read_csv('/kaggle/input/final-dataset/Datathon_TrainData_org.csv')

In [4]:
def clean_text(text):
    if isinstance(text, str):
        # Remove HTML tags and URLs
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+', '', text)
        # Remove non-alphanumeric characters and extra spaces
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
# Handle missing values
data.fillna('', inplace=True)

In [6]:
# Convert the 'Emotion' column to strings
data['Emotion'] = data['Emotion'].astype(str)

In [7]:
# Encode categorical variables (Country, Speaker, Designation, Running President/PM)
data = pd.get_dummies(data, columns=['Country', 'Speaker', 'Designation', 'Running President/PM'], drop_first=True)

In [8]:
# Prepare the data
X = data['Text_of_Speech']
y = data['Emotion']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Create a label encoder
label_encoder = LabelEncoder()

In [13]:
# Fit the label encoder on the emotion labels and transform them to numerical values
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [18]:
num_labels=len(label_encoder.classes_)
num_labels

5

In [14]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Tokenize and encode the training data
X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)

In [20]:
# Convert the encoded inputs to PyTorch tensors
X_train_input_ids = X_train_encoded['input_ids']
X_train_attention_mask = X_train_encoded['attention_mask']
X_test_input_ids = X_test_encoded['input_ids']
X_test_attention_mask = X_test_encoded['attention_mask']

In [21]:
# Convert labels to PyTorch tensors
y_train_tensor = torch.tensor(y_train_encoded)
y_test_tensor = torch.tensor(y_test_encoded)

In [22]:
# Create DataLoader for training data
train_data = TensorDataset(X_train_input_ids, X_train_attention_mask, y_train_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)

In [23]:
# Create DataLoader for testing data
test_data = TensorDataset(X_test_input_ids, X_test_attention_mask, y_test_tensor)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)

In [24]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)



In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [26]:
# Define the number of training epochs
num_epochs = 3  # You can change this number to the desired number of epochs

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        model.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Print progress after each epoch
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {total_loss / len(train_dataloader)}")

print("Training complete.")

Epoch 1/3 - Average Loss: 1.3490500773912595
Epoch 2/3 - Average Loss: 1.1372075029361395
Epoch 3/3 - Average Loss: 1.0171110872869138
Training complete.


In [27]:
# Evaluation
model.eval()
y_true = []
y_pred = []

In [28]:
for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None
        )

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

    y_true.extend(labels.cpu().numpy())
    y_pred.extend(predictions.cpu().numpy())

In [29]:
# Decode numerical labels back to original emotions
y_true = label_encoder.inverse_transform(y_true)
y_pred = label_encoder.inverse_transform(y_pred)

In [30]:
accuracy = accuracy_score(y_true, y_pred)
classification_rep = classification_report(y_true, y_pred)
confusion_mat = confusion_matrix(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
print(f"Accuracy: {accuracy}")
print(classification_rep)
print("Confusion Matrix:")
print(confusion_mat)

Accuracy: 0.5339506172839507
              precision    recall  f1-score   support

                   0.00      0.00      0.00         5
         JOY       0.61      0.64      0.63        87
     NEUTRAL       0.52      0.54      0.53       106
    OPTIMISM       0.51      0.38      0.44        76
       UPSET       0.47      0.62      0.53        50

    accuracy                           0.53       324
   macro avg       0.42      0.44      0.43       324
weighted avg       0.53      0.53      0.53       324

Confusion Matrix:
[[ 0  5  0  0  0]
 [ 0 56 14 11  6]
 [ 0 15 57 15 19]
 [ 0 11 26 29 10]
 [ 0  5 12  2 31]]


In [32]:
import joblib

In [33]:
# Save the trained model
torch.save(model.state_dict(), 'bert_emotion_model.pth')

# Save the label encoder
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

In [34]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import joblib

# Load the saved label encoder
label_encoder = joblib.load('/kaggle/working/label_encoder.pkl')

# Create a new instance of the model architecture
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))

# Load the trained model weights
model.load_state_dict(torch.load('/kaggle/working/bert_emotion_model.pth'))
model.eval()

# Tokenizer setup
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to make predictions
def predict_emotion(input_text):
    inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors='pt', max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    
    # Convert the predicted class back to emotion label
    predicted_emotion = label_encoder.inverse_transform([predicted_class])
    
    return predicted_emotion[0]

# Example usage
user_input = "This is a test speech"
predicted_emotion = predict_emotion(user_input)
print(f"Predicted Emotion: {predicted_emotion}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Emotion: JOY


In [36]:
testdf=pd.read_csv('/kaggle/input/sample/TestData_org.csv')

In [39]:
testdf=testdf[['Text_of_Speech']]

In [40]:
testdf

Unnamed: 0,Text_of_Speech
0,"Good evening, everybody. Please have a seat. ..."
1,"Colleagues, we are meeting out of schedule tod..."
2,"Now, one other thing I want to point out -- so..."
3,"Good afternoon, everyone. I just had the oppo..."
4,"Well, I’ve got my team here to talk about Ebo..."
...,...
397,THE PRESIDENT: Thank you. Thank you very muc...
398,"Good afternoon, dear colleagues,\n\nToday we a..."
399,"Colleagues, welcome to this traditional meetin..."
400,"Hi everybody. Right now, the United States S..."


In [48]:
output_csv_path = '/kaggle/working/outputtttt.csv'

# Convert the DataFrame to a CSV file
testdf.to_csv(output_csv_path, index=False)

In [51]:
import pandas as pd

# Load your input CSV file
input_csv_path = '/kaggle/working/outputtttt.csv'  # Update with the path to your CSV file
output_csv_path = '/kaggle/working/sampleoutput.csv'  # Update with the desired path for the output CSV file

# Read the input CSV file into a DataFrame
input_df = pd.read_csv(input_csv_path)

# Create an empty list to store the predicted emotions
predicted_emotions = []

# Iterate through each row in the input DataFrame
for index, row in input_df.iterrows():
    input_text = row['Text_of_Speech']  # Replace 'Text_of_Speech' with the actual name of the text column
    predicted_emotion = predict_emotion(input_text)
    predicted_emotions.append(predicted_emotion)

# Add the predicted emotions to the input DataFrame
input_df['predicted_emotion'] = predicted_emotions

# Save the updated DataFrame to the output CSV file
input_df.to_csv(output_csv_path, index=False)


KeyboardInterrupt: 

In [54]:
out=pd.read_csv('/kaggle/working/outputtttt.csv')

In [56]:
a=out["Text_of_Speech"]

In [57]:
a

0      Good evening, everybody.  Please have a seat. ...
1      Colleagues, we are meeting out of schedule tod...
2      Now, one other thing I want to point out -- so...
3      Good afternoon, everyone.  I just had the oppo...
4       Well, I’ve got my team here to talk about Ebo...
                             ...                        
397    THE PRESIDENT:  Thank you.  Thank you very muc...
398    Good afternoon, dear colleagues,\n\nToday we a...
399    Colleagues, welcome to this traditional meetin...
400     Hi everybody.  Right now, the United States S...
401    THE PRESIDENT:  Jocelyn, thank you.  Please.  ...
Name: Text_of_Speech, Length: 402, dtype: object

In [58]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import joblib

# Load the saved label encoder
label_encoder = joblib.load('/kaggle/working/label_encoder.pkl')

# Create a new instance of the model architecture
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))

# Load the trained model weights
model.load_state_dict(torch.load('/kaggle/working/bert_emotion_model.pth'))
model.eval()

# Tokenizer setup
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to make predictions
def predict_emotion(input_text):
    inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors='pt', max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    
    # Convert the predicted class back to emotion label
    predicted_emotion = label_encoder.inverse_transform([predicted_class])
    
    return predicted_emotion[0]

# Example usage
result=[]
for i in a:
    user_input = i
    predicted_emotion = predict_emotion(user_input)
    result.append(predicted_emotion)
#print(f"Predicted Emotion: {predicted_emotion}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['JOY',
 'UPSET',
 'OPTIMISM',
 'UPSET',
 'UPSET',
 'OPTIMISM',
 'NEUTRAL',
 'UPSET',
 'OPTIMISM',
 'NEUTRAL',
 'JOY',
 'NEUTRAL',
 'NEUTRAL',
 'UPSET',
 'NEUTRAL',
 'JOY',
 'JOY',
 'UPSET',
 'JOY',
 'NEUTRAL',
 'NEUTRAL',
 'OPTIMISM',
 'JOY',
 'NEUTRAL',
 'OPTIMISM',
 'NEUTRAL',
 'JOY',
 'UPSET',
 'NEUTRAL',
 'OPTIMISM',
 'OPTIMISM',
 'NEUTRAL',
 'JOY',
 'UPSET',
 'UPSET',
 'NEUTRAL',
 'NEUTRAL',
 'UPSET',
 'OPTIMISM',
 'OPTIMISM',
 'OPTIMISM',
 'JOY',
 'OPTIMISM',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'JOY',
 'NEUTRAL',
 'NEUTRAL',
 'UPSET',
 'OPTIMISM',
 'UPSET',
 'UPSET',
 'OPTIMISM',
 'JOY',
 'JOY',
 'OPTIMISM',
 'NEUTRAL',
 'UPSET',
 'UPSET',
 'JOY',
 'NEUTRAL',
 'OPTIMISM',
 'NEUTRAL',
 'OPTIMISM',
 'UPSET',
 'JOY',
 'OPTIMISM',
 'JOY',
 'OPTIMISM',
 'OPTIMISM',
 'UPSET',
 'OPTIMISM',
 'NEUTRAL',
 'JOY',
 'JOY',
 'NEUTRAL',
 'OPTIMISM',
 'NEUTRAL',
 'NEUTRAL',
 'JOY',
 'NEUTRAL',
 'OPTIMISM',
 'NEUTRAL',
 'UPSET',
 'JOY',
 'OPTIMISM',
 'OPTIMISM',
 'NEUTRAL',
 'UPSET',
 'NEUTRAL'

In [69]:
output_csv_path = '/kaggle/working/Rizwan1.csv'

In [70]:
import csv

In [71]:
with open(output_csv_path,mode='w',newline='') as file:
    writer=csv.writer(file)
    writer.writerow(result)

In [62]:
result.to_csv(output_csv_path, index=False)

AttributeError: 'list' object has no attribute 'to_csv'

In [72]:
horr=pd.read_csv('Rizwan1.csv')

In [74]:


# Use the stack() method to convert the row elements into a single column
df_stacked = horr.stack().reset_index(drop=True).to_frame(name='result')

# Save the resulting DataFrame to a CSV file
df_stacked.to_csv('output.csv', index=False)

In [76]:
res=pd.read_csv('/kaggle/working/output.csv')
res

Unnamed: 0,result


In [77]:
with open("output.csv", newline = "",mode = "w") as file:
    writer = csv.writer(file)
    writer.writerows([item] for item in result)