# Imports


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split, KFold
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import itertools

from torch import nn
from transformers import BertModel, BertPreTrainedModel
from transformers import BertConfig
import torch.nn.functional as F
from torch.optim import Adam

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

import re
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from unicodedata import normalize
from nltk.stem import SnowballStemmer

import random
# Small random value drawn uniformly from [0, 0.05].
# NOTE(review): this is drawn before set_seed(22) is called further down, so
# it differs from run to run; where `rn` is consumed is not visible here —
# confirm whether it is meant to be reproducible.
rn = random.uniform(0, 0.05)


def set_seed(seed, *, seed_all=False):
    """
    Seed the random number generators used by this notebook.

    By default only Python's built-in ``random`` module is seeded. Passing
    ``seed_all=True`` additionally seeds NumPy and PyTorch and, when CUDA is
    available, seeds every GPU and switches cuDNN to deterministic mode.

    Args:
    - seed (int): Seed value handed to each generator.
    - seed_all (bool): Also seed NumPy / PyTorch (keyword-only, default False).

    Returns:
    - None
    """
    random.seed(seed)
    if not seed_all:
        return

    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Deterministic kernels with autotuning disabled: repeatable results,
        # typically at some cost in speed.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Seed the RNG via set_seed so that random draws made after this point repeat
# across runs
set_seed(22)

# Progress marker for the end of the imports/setup cell
print("Imported Packages...")

# Helper Functions

In [None]:
# Whole words built only from j/h and vowels that contain at least one j or h
# ("haha", "jaja" — but also short words such as "he" or "oh").  Written with
# a lookahead instead of nested quantifiers so that a long near-miss such as
# "hahahahaha...s" cannot trigger exponential backtracking.
_LAUGHTER_RE = re.compile(r'\b(?=[aeiou]*[jh])[jhaeiou]+\b')
# Any character repeated three or more times in a row.
_REPEATED_CHAR_RE = re.compile(r'(.)\1{2,}')
# Combining diacritical marks following a base character (applied to NFD
# text); an "n" followed by a lone combining tilde is left untouched.
_COMBINING_MARK_RE = re.compile(
    r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
    flags=re.I)
_NON_LETTER_RE = re.compile("[^a-zA-Z]")


def _clean_tweet(text):
    """Lower-case *text* and strip links, markers, accents and non-letters."""
    kept_words = " ".join(
        word for word in text.lower().split()
        if 'http://' not in word
        and 'https://' not in word
        and '.com' not in word
        and '.es' not in word
        and word != 'rt'
    )

    # Remove only the # and @ characters, keeping the word they prefix
    cleaned = re.sub(r'[@#]', '', kept_words)

    cleaned = _LAUGHTER_RE.sub("", cleaned)
    cleaned = _REPEATED_CHAR_RE.sub(r'\1', cleaned)
    cleaned = _COMBINING_MARK_RE.sub(r"\1", normalize("NFD", cleaned))
    cleaned = _NON_LETTER_RE.sub(" ", cleaned)

    # Only ASCII letters and spaces remain here, so this collapses runs of
    # spaces and trims both ends.
    return " ".join(cleaned.split())


def preprocessing(data):
    """
    Preprocess tweets by cleaning, removing stopwords, and stemming.

    Each value of the ``content`` column is lower-cased, stripped of URL-like
    words, "rt", ``@``/``#`` characters, laughter-like words, long character
    repeats, diacritics and every non-letter character, then tokenized,
    filtered against the NLTK English stop-word list and stemmed with the
    English Snowball stemmer.

    Args:
    - data (DataFrame): DataFrame with ``content`` (text) and ``sentiment``
      columns. NOTE: it is modified in place — ``Content`` and ``Sentiment``
      columns are added to it.

    Returns:
    - data (DataFrame): The ``Sentiment`` and ``Content`` columns of the
      (modified) input, in that order.
    """
    # Loop invariants: build the stop-word set and the stemmer once rather
    # than once per row.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')

    tweets = []
    for content in data['content']:
        tokens = word_tokenize(_clean_tweet(content))
        tweets.append(" ".join(
            stemmer.stem(token) for token in tokens if token not in stop_words
        ))

    data['Content'] = tweets
    data['Sentiment'] = data['sentiment'].tolist()

    return data[['Sentiment', 'Content']]

# Progress marker: printed once the helper definitions in this cell have run
print("Helper Functions loaded.")

# Reading Data

In [None]:
# Load the dataset and discard its ID column, then preview the first rows
data = pd.read_csv("/kaggle/input/isear-dataset/eng_dataset.csv").drop(columns=["ID"])
data.head()


In [None]:
# Bar chart of how many rows carry each sentiment label
sentiment_counts = data['sentiment'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(sentiment_counts.index, sentiment_counts.values, color='skyblue')
ax.set(xlabel='Sentiment', ylabel='Count', title='Sentiment Distribution')
plt.show()

In [None]:
# Character and word counts for every text, in row order.  Iterating the
# column directly avoids building a Series per row as iterrows() does.
letters = [len(text) for text in data['content']]
words = [len(text.split()) for text in data['content']]

data['Words'] = words
data['Letters'] = letters

# A single groupby serves both the per-class averages and the boxplot
grouped_data = data.groupby('sentiment')
avg_words = grouped_data['Words'].mean()
avg_letters = grouped_data['Letters'].mean()

# Boxplot of the number of characters per sentiment class
fig, ax = plt.subplots(figsize=(6, 5))
ax.boxplot([group['Letters'] for name, group in grouped_data], labels=grouped_data.groups.keys())
ax.set_title("Number of Characters per Sentiment Class")
ax.set_xlabel('Sentiment Class')
ax.set_ylabel('Number of Characters')
plt.show()

In [None]:
# Boxplot of the number of words per sentiment class
words_by_class = [class_rows['Words'] for _, class_rows in grouped_data]

fig, ax = plt.subplots(figsize=(6, 5))
ax.boxplot(words_by_class, labels=grouped_data.groups.keys())
ax.set(
    title="Number of Words per Sentiment Class",
    xlabel='Sentiment Class',
    ylabel='Number of Words',
)
plt.show()


In [None]:
# Single boxplot of character counts over the whole dataset
fig, ax = plt.subplots(figsize=(6, 5))
ax.boxplot(data['Letters'])
ax.set(title="Overall Characters Distribution", ylabel='Number of Characters')
plt.show()


In [None]:
# Single boxplot of word counts over the whole dataset
fig, ax = plt.subplots(figsize=(6, 5))
ax.boxplot(data['Words'])
ax.set(title="Overall Words Distribution", ylabel='Number of Words')
plt.show()


# Data Cleaning and Transformation

In [None]:
# Clean, stop-word-filter and stem every text. preprocessing() also adds
# 'Content' and 'Sentiment' columns to `data` itself.
data_cleaned = preprocessing(data)
# Re-select the same two columns preprocessing() already returned.
# NOTE(review): this looks redundant, but it presumably yields an independent
# frame so the later column assignment does not act on a slice of `data` —
# confirm before removing.
data_cleaned = data_cleaned.loc[:,['Sentiment','Content']]
# Snapshot of the cleaned frame taken at this point in the notebook
data_copy = data_cleaned.copy()
data_cleaned.head()

In [None]:
# Integer class id for each sentiment label
label_mapping = {"anger": 0, "fear": 1, "joy": 2, "sadness": 3}

encoded_labels = data_cleaned['Sentiment'].map(label_mapping)

# Series.map turns every value missing from the mapping into NaN.  That
# happens for unexpected labels and also when this cell is re-run on labels
# that are already encoded, so fail loudly instead of passing NaN labels on.
unmapped_labels = data_cleaned.loc[encoded_labels.isna(), 'Sentiment'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Sentiment values without an entry in label_mapping: {list(unmapped_labels)}"
    )

data_cleaned['Sentiment'] = encoded_labels
data_cleaned.head()

In [None]:
# Hold out 20% of the rows for testing, keeping the class proportions of
# 'Sentiment' the same in both parts (fixed random_state for a stable split)
df_train, df_test = train_test_split(
    data_cleaned,
    test_size=0.2,
    random_state=22,
    stratify=data_cleaned['Sentiment'],
)

# Separate copy of the complete cleaned dataset
df_train1 = data_cleaned.copy()

print(f'Training set size: {df_train.shape[0]}')
print(f'Testing set size: {df_test.shape[0]}')

In [None]:
class SentimentDataset(Dataset):
    """
    Map-style dataset that tokenizes one text per item on access.

    Args:
    - texts: Indexable collection of texts.
    - labels: Indexable collection of integer class labels, aligned with
      ``texts`` by position.
    - tokenizer: Callable tokenizer accepting the Hugging Face keyword
      arguments used in ``__getitem__`` (e.g. a ``BertTokenizer``).
    - max_len (int): Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """
        Return the encoded sample at ``index``.

        Returns:
        - dict with keys ``text`` (the raw text), ``input_ids`` and
          ``attention_mask`` (1-D tensors) and ``label`` (``torch.long``
          scalar tensor).
        """
        text = self.texts[index]
        label = self.labels[index]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which takes the same arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # flatten() makes each tensor 1-D (presumably dropping the
            # leading batch axis added by return_tensors='pt')
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """
    Wrap a DataFrame in a SentimentDataset and return a DataLoader over it.

    Args:
    - df (DataFrame): Frame with ``Content`` (text) and ``Sentiment`` (label)
      columns.
    - tokenizer: Tokenizer handed to SentimentDataset.
    - max_len (int): Padded / truncated sequence length.
    - batch_size (int): Number of samples per batch.
    - shuffle (bool): Reshuffle the samples every epoch. Defaults to False,
      which keeps the rows in DataFrame order.
    - num_workers (int): Worker processes used for loading. Defaults to 4;
      pass 0 to load in the main process.

    Returns:
    - DataLoader yielding the dict batches produced by SentimentDataset.
    """
    dataset = SentimentDataset(
        texts=df.Content.to_numpy(),
        labels=df.Sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )