# Basic

In [1]:
import pandas as pd
import numpy as np

# Notebook-wide pandas display settings.
# Render floats with zero decimal places.
pd.set_option('display.float_format', lambda x: '%.0f' % x)
# None disables column-width truncation, so long text cells are shown in full.
pd.options.display.max_colwidth=None
# Show at most 100 rows before pandas truncates the output.
pd.options.display.max_rows=100

from AugmentedSocialScientist.models import Camembert

In [2]:
import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lvshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the labelled tweets; later cells read the `text` column and the `abusetag2` label column.
df_with_emoji = pd.read_csv('matched_texts.csv')

In [6]:
# French spaCy pipeline; its tokens drive the text cleaning below.
nlp = spacy.load('fr_core_news_sm')
# Raise spaCy's document-length limit; tune this to the size of the data.
nlp.max_length = 10_000_000
# NLTK French stop words, plus a few Twitter/punctuation tokens, so that
# they do not influence frequent-word counts.
stopWords = set(stopwords.words('french'))
other_stop_words = ['RT', ':', '«', '»', '#']
extended_stop_words = stopWords | set(other_stop_words)
# Define the function to preprocess texts
def preprocess_text_paragraph(text):
    """Clean one tweet and return it as a lower-cased, space-joined token string.

    Steps: strip @mentions, tokenise with the module-level spaCy pipeline
    ``nlp``, then drop stop words, whitespace-only tokens and any token
    containing "http".

    Args:
        text: Raw tweet text. Non-string values (e.g. the float NaN pandas
            uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text; an empty string when nothing survives or when
        ``text`` is not a string.
    """
    # Missing cells arrive as float NaN, which would make re.sub raise TypeError.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'@\w+[\w\.-]*', '', text)

    clean_tokens = []
    for token in nlp(text):
        raw = str(token)
        lowered = raw.lower()
        # French stop words are matched case-insensitively; the extra tokens
        # ('RT', ':', ...) are matched on the original casing.
        if lowered in stopWords or raw.strip() in extended_stop_words:
            continue
        # Skip whitespace-only tokens.
        if not lowered.strip():
            continue
        # Skip URLs (and any other token containing "http").
        if "http" in lowered:
            continue
        clean_tokens.append(lowered)

    return ' '.join(clean_tokens)

In [7]:
# Replace the raw tweet text with its cleaned version (overwrites the column).
df_with_emoji['text'] = df_with_emoji['text'].apply(preprocess_text_paragraph)

In [8]:
# Preview the first three rows after cleaning.
df_with_emoji.head(3)

Unnamed: 0,id,abusetag2,tigger.predict,tigger.predprob,tiggerpred_0,tiggerpred_1,text
0,1630631004384681984,0,0,1,1,0,journaliste bfmtv benjamin duhamel a tutoyé garde sceaux eric dupond - moretti lors d' interview télévisée matin . simple erreur lapsus révélateur d' certain copinage ? 🤔
1,1626703517661507584,0,0,1,1,0,star pénible débat restera voici nouvel extrait lequel ratatine encore agités bocal c’ délectable .
2,1622700351328514048,0,0,1,1,0,🔴 🔥 dis manière très sincère . n' droit mettre genoux gens tiennent france debout ! incroyable discours députée contre macronie honteuse réformedesretraites ! ✊ directan


In [10]:
# Baseline: fine-tune CamemBERT on the cleaned tweets with AugmentedSocialScientist.
from sklearn.model_selection import train_test_split

bert = Camembert()
train_df, test_df = train_test_split(df_with_emoji, test_size=0.2, random_state=42)

# Rebind instead of using inplace=True: the split frames are derived from
# df_with_emoji, and mutating them in place may trigger pandas'
# SettingWithCopyWarning. Rebinding also keeps the cell safe to re-run.
train_df = train_df.dropna(subset=['text'])
test_df = test_df.dropna(subset=['text'])

# Encode texts and labels into the loaders expected by run_training.
train_loader = bert.encode(train_df.text.values, train_df.abusetag2.values)
test_loader = bert.encode(test_df.text.values, test_df.abusetag2.values)

score = bert.run_training(
    train_loader,      # encoded training set
    test_loader,       # encoded test set
    n_epochs=3,        # number of epochs
    lr=5e-5,           # learning rate
    random_state=42,   # random state (for replicability)
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


There are 1 GPU(s) available.
We will use GPU 0: NVIDIA GeForce GTX 970


  0%|          | 0/662 [00:00<?, ?it/s]

  0%|          | 0/662 [00:00<?, ?it/s]

label ids: {0: 0, 1: 1}


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/166 [00:00<?, ?it/s]

label ids: {0: 0, 1: 1}


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Average training loss: 0.66
  Training took: 0:00:54

Running Validation...

  Average test loss: 0.62
  Validation took: 0:00:04
              precision    recall  f1-score   support

           0       0.74      0.64      0.69        94
           1       0.60      0.71      0.65        72

    accuracy                           0.67       166
   macro avg       0.67      0.67      0.67       166
weighted avg       0.68      0.67      0.67       166


Training...

  Average training loss: 0.59
  Training took: 0:00:55

Running Validation...

  Average test loss: 0.59
  Validation took: 0:00:05
              precision    recall  f1-score   support

           0       0.80      0.65      0.72        94
           1       0.63      0.79      0.70        72

    accuracy                           0.71       166
   macro avg       0.72      0.72      0.71       166
weighted avg       0.73      0.71      0.71       166


Training...

  Average training loss: 0.52
  Training

# Method 1: Change emoji into text

In [10]:
# ! pip install emoji

In [11]:
# import emoji

In [12]:
# df = pd.DataFrame(emoji.EMOJI_DATA).T
# emoji_df = df['fr']
# emoji_df = emoji_df.apply(lambda x: ' '.join(k for k in x[1:-1].split('_')))
# dic_emoji = emoji_df.to_dict()

In [13]:
# def preprocessing_emoji(text):
#     doc = nlp(text)
#     tokens = [token.text.strip() for token in doc]
#     list = []
#     for k in tokens:
#         if k not in dic_emoji.keys():
#             list.append(k)
#         else:
#             list.append(dic_emoji[k])
#     return ' '.join(list)

In [14]:
# df_with_emoji_1 = df_with_emoji.copy()

In [15]:
# df_with_emoji_1.text = df_with_emoji_1.text.apply(lambda x: preprocessing_emoji(x))

In [16]:
# df_with_emoji_1.head(3)

In [17]:
# from sklearn.model_selection import train_test_split

In [18]:
# train_df, test_df = train_test_split(df_with_emoji_1, test_size=0.2, random_state=42)

In [19]:
# train_loader_emoji = bert.encode(train_df.text.values, train_df.abusetag2.values)
# test_loader_emoji = bert.encode(test_df.text.values, test_df.abusetag2.values)

## Directly use dataset with emojis to fine-tune Camembert

In [20]:
# core_2 = bert.run_training(train_loader_emoji,          #encoded training set
#                           test_loader_emoji,           #encoded test set
#                           n_epochs=3,            #number of epochs
#                           lr=5e-5,               #learning rate
#                           random_state=42,       #random state (for replicability)
#                           save_model_as='1_new_model')   #name of the saved model

Comparison with the original model 'abuseapr':

In [21]:
# pred = bert.predict_with_model(test_loader_emoji, model_path='models/abuseapr')


In [22]:
# from sklearn.metrics import classification_report

# true_labels = test_df['abusetag2'].values
# pred_labels = np.argmax(pred, axis=1)

# report = classification_report(true_labels, pred_labels, target_names=['0', '1'])

# print(report)

=> The result is not better than the original model.

## Progressive fine-tuning (fine-tune 'abuseapr')

In [23]:
# bert.load_model('models/abuseapr')

In [24]:
# score = bert.run_training(train_loader_emoji,
#                           test_loader_emoji,
#                           n_epochs=3,
#                           lr=2e-5,
#                           random_state=42,
#                           save_model_as='models/abuseapr_with_emoji')

# Method 2: Get Emoji token-vectors

## Self-encoded

In [None]:
# Work on a copy so the Method 2 experiments do not modify df_with_emoji.
df_with_emoji_2 = df_with_emoji.copy()

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AdamW, get_linear_schedule_with_warmup, CamembertTokenizer, CamembertModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Pre-trained CamemBERT tokenizer and encoder, shared as module-level globals
# by the functions and classes defined below.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
camembert = CamembertModel.from_pretrained('camembert-base')

In [None]:
# Placeholder tokens used by preprocess() for characters that are not in the
# tokenizer vocabulary: [TEXT] goes into input_ids, [EMOJI] into emoji_ids.
text_placeholder = '[TEXT]'
emoji_placeholder = '[EMOJI]'
# Register both placeholders first, then grow the embedding matrix to the new
# vocabulary size so their ids have embedding rows.
tokenizer.add_tokens([text_placeholder, emoji_placeholder], special_tokens=True)
camembert.resize_token_embeddings(len(tokenizer))

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Default sequence length that preprocess() pads/truncates to.
MAX_LEN = 128

def preprocess(text, max_len=MAX_LEN):
    """Encode ``text`` character by character into id lists for the model.

    Each character that is itself an entry of the tokenizer vocabulary is
    mapped to its own id. Every other character is replaced by the ``[TEXT]``
    placeholder in ``input_ids`` and contributes an ``[EMOJI]`` placeholder
    id to ``emoji_ids``.

    Args:
        text: The string to encode.
        max_len: Length that ``input_ids`` / ``attention_mask`` are padded
            or truncated to.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``emoji_ids`` as
        plain Python lists, each truncated to at most ``max_len`` items.
    """
    # Loop-invariant lookups: get_vocab() rebuilds a full dict on every call,
    # so fetch it (and the placeholder ids) once instead of once per character.
    vocab = tokenizer.get_vocab()
    text_placeholder_id = tokenizer.convert_tokens_to_ids(text_placeholder)
    emoji_placeholder_id = tokenizer.convert_tokens_to_ids(emoji_placeholder)

    input_ids = []
    emoji_ids = []
    for char in text:
        if char in vocab:
            token_id = tokenizer.convert_tokens_to_ids(char)
            input_ids.append(token_id)
            # A character drawn from a str always has length 1, so the former
            # startswith('▁') / len == 1 test reduces to this equality.
            if char == '▁':
                emoji_ids.append(token_id)
        else:
            input_ids.append(text_placeholder_id)
            emoji_ids.append(emoji_placeholder_id)

    # Every real position is attended to; padding positions get 0 below.
    attention_mask = [1] * len(input_ids)

    # A negative padding_length (text longer than max_len) yields empty
    # padding lists; the slices in the return value then truncate.
    padding_length = max_len - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * padding_length
    attention_mask += [0] * padding_length
    # NOTE(review): emoji_ids is padded by the amount computed from input_ids,
    # so it can end up shorter than max_len; collate_fn pads it per batch.
    # NOTE(review): the text is encoded per character, without the tokenizer's
    # subword segmentation or <s>/</s> tokens - confirm this is intended.
    emoji_ids += [tokenizer.pad_token_id] * padding_length

    return {
        'input_ids': input_ids[:max_len],
        'attention_mask': attention_mask[:max_len],
        'emoji_ids': emoji_ids[:max_len]
    }

class TextEmojiDataset(Dataset):
    """Dataset that encodes each text lazily with ``preprocess`` on access."""

    def __init__(self, texts, labels):
        # Parallel, index-aligned sequences of raw texts and labels.
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the long tensors for sample ``idx`` together with its label."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = preprocess(text)
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask', 'emoji_ids')
        }
        sample['label'] = torch.tensor(label, dtype=torch.long)
        return sample

def collate_fn(batch):
    """Merge a list of per-sample dicts into one dict of batched tensors.

    Sequences are right-padded to the longest one in the batch:
    ``input_ids`` and ``emoji_ids`` with the tokenizer's pad id,
    ``attention_mask`` with 0.
    """
    pad_id = tokenizer.pad_token_id

    def _padded(key, fill):
        # Stack the `key` field of every sample, padding with `fill`.
        return pad_sequence([item[key] for item in batch], batch_first=True, padding_value=fill)

    return {
        'input_ids': _padded('input_ids', pad_id),
        'attention_mask': _padded('attention_mask', 0),
        'emoji_ids': _padded('emoji_ids', pad_id),
        'label': torch.tensor([item['label'] for item in batch], dtype=torch.long)
    }

def create_data_loader(texts, labels, batch_size, shuffle=True):
    """Build a DataLoader over a TextEmojiDataset of ``texts`` and ``labels``.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of labels aligned with ``texts``.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            True (the previous hard-coded behaviour); pass False for
            validation/test loaders to keep the original sample order.

    Returns:
        A DataLoader that batches samples with ``collate_fn``.
    """
    ds = TextEmojiDataset(texts, labels)
    return DataLoader(ds, batch_size=batch_size, num_workers=0, shuffle=shuffle, collate_fn=collate_fn)

class SentimentClassifier(nn.Module):
    """CamemBERT pooled output concatenated with a mean emoji-id embedding.

    The encoder is the module-level ``camembert`` object (shared, not copied).
    """

    def __init__(self, n_classes, num_emojis, hidden_size):
        super().__init__()
        self.camembert = camembert
        # NOTE(review): `num_emojis` is accepted but never used; the embedding
        # table is sized from the tokenizer vocabulary instead.
        self.emoji_emb = nn.Embedding(len(tokenizer), hidden_size)
        self.dropout = nn.Dropout(0.3)
        # Text vector and emoji vector are concatenated, hence 2 * hidden_size.
        self.out = nn.Linear(hidden_size * 2, n_classes)

    def forward(self, input_ids, attention_mask, emoji_ids):
        """Return the class logits for a batch."""
        text_vec = self.camembert(input_ids, attention_mask=attention_mask).pooler_output
        # Mean over the sequence axis; padding ids are included in the mean.
        emoji_vec = self.emoji_emb(emoji_ids).mean(dim=1)
        features = torch.cat([text_vec, emoji_vec], dim=1)
        return self.out(self.dropout(features))

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``emoji_ids`` keyword arguments; returns logits.
        data_loader: Iterable of dict batches with those keys plus ``label``.
        loss_fn: Callable ``(logits, labels) -> loss``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch.
        n_examples: Denominator for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct / n_examples); mean_loss is the mean batch loss, or NaN
        when the loader yields no batch.
    """
    model.train()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        emoji_ids = d['emoji_ids'].to(device)
        labels = d['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, emoji_ids=emoji_ids)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``emoji_ids`` keyword arguments; returns logits.
        data_loader: Iterable of dict batches with those keys plus ``label``.
        loss_fn: Callable ``(logits, labels) -> loss``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy.

    Returns:
        ``(accuracy, mean_loss, all_preds, all_labels)``: accuracy is a 0-dim
        double tensor, mean_loss is NaN when the loader yields no batch, and
        the two lists hold predictions and labels in iteration order.
    """
    model.eval()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            emoji_ids = d['emoji_ids'].to(device)
            labels = d['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, emoji_ids=emoji_ids)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            # Collected in loader order so preds and labels stay aligned.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss, all_preds, all_labels

EPOCHS = 16
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
SEED = 42

# Seed torch as well as the split: the classifier head initialisation, the
# DataLoader shuffling and dropout all draw from torch's RNG. (Some GPU
# kernels may still be non-deterministic.)
torch.manual_seed(SEED)

train_df, test_df = train_test_split(df_with_emoji_2, test_size=0.2, random_state=SEED)
train_texts = train_df['text'].values
train_labels = train_df['abusetag2'].values
val_texts = test_df['text'].values
val_labels = test_df['abusetag2'].values

train_data_loader = create_data_loader(train_texts, train_labels, BATCH_SIZE)
val_data_loader = create_data_loader(val_texts, val_labels, BATCH_SIZE)

model = SentimentClassifier(n_classes=2, num_emojis=50000, hidden_size=camembert.config.hidden_size)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# Linear decay to zero over all optimisation steps, without warm-up.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_texts))
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # The labels come back in loader order (the loader shuffles); bind them to
    # a separate name so the original val_labels array is not overwritten.
    val_acc, val_loss, val_preds, val_true = eval_model(model, val_data_loader, loss_fn, device, len(val_texts))
    print(f'Val loss {val_loss} accuracy {val_acc}')


# Report on the predictions of the last epoch.
print(classification_report(val_true, val_preds))

## Pre-trained Embedding

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AdamW, get_linear_schedule_with_warmup, CamembertTokenizer, CamembertModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load a fresh tokenizer and encoder. This rebinds the names used by the
# Self-encoded section, so the [TEXT]/[EMOJI] tokens added there are not
# part of this tokenizer.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
camembert = CamembertModel.from_pretrained('camembert-base')

# Emoji vectors stored in binary word2vec format; lookups below use single
# characters / token strings as keys.
import gensim
e2v = gensim.models.KeyedVectors.load_word2vec_format("emoji2vec.bin", binary=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Default sequence length used by preprocess().
MAX_LEN = 128

### Average

In [None]:
def preprocess(text, max_len=MAX_LEN):
    """Tokenise ``text`` with CamemBERT and attach its averaged emoji vector.

    Args:
        text: The string to encode.
        max_len: Length the token sequence is padded/truncated to.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` as 1-D tensors of
        length ``max_len`` and ``emoji_embedding`` as the numpy vector
        returned by ``get_emoji_embedding``.
    """
    emoji_embedding = get_emoji_embedding(text)

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    # return_tensors='pt' adds a leading batch axis of size 1; drop exactly
    # that axis (a bare squeeze() would drop any size-1 axis).
    return {
        'input_ids': encoded['input_ids'].squeeze(0),
        'attention_mask': encoded['attention_mask'].squeeze(0),
        'emoji_embedding': emoji_embedding
    }

def get_emoji_embedding(text):
    """Return the mean per-character vector of ``text`` under ``e2v``.

    Characters with an entry in ``e2v`` contribute their vector; every other
    character contributes a zero vector. Empty text yields a zero vector.
    """
    dim = e2v.vector_size
    # NOTE(review): the zero vectors of non-emoji characters are part of the
    # mean, so the emoji signal shrinks as the text gets longer - confirm
    # that this is intended rather than averaging over emoji only.
    per_char = [e2v[ch] if ch in e2v else np.zeros(dim) for ch in text]
    if not per_char:
        return np.zeros(dim)
    return np.mean(per_char, axis=0)

class TextEmojiDataset(Dataset):
    """Dataset that encodes each text lazily with ``preprocess`` on access."""

    def __init__(self, texts, labels):
        # Parallel, index-aligned sequences of raw texts and labels.
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return token tensors, the emoji vector and the label of sample ``idx``."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = preprocess(text)
        # The token tensors are passed through; only the numpy emoji vector
        # and the label still need converting.
        emoji_vec = torch.tensor(encoded['emoji_embedding'], dtype=torch.float)
        target = torch.tensor(label, dtype=torch.long)
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'emoji_embedding': emoji_vec,
            'label': target
        }

def collate_fn(batch):
    """Merge a list of per-sample dicts into one dict of batched tensors.

    Token sequences are right-padded to the longest one in the batch; the
    per-sample emoji vectors are stacked into one 2-D tensor.
    """
    def _field(key):
        # Collect one field across all samples of the batch.
        return [item[key] for item in batch]

    return {
        'input_ids': pad_sequence(_field('input_ids'), batch_first=True, padding_value=tokenizer.pad_token_id),
        'attention_mask': pad_sequence(_field('attention_mask'), batch_first=True, padding_value=0),
        'emoji_embeddings': torch.stack(_field('emoji_embedding')),
        'label': torch.tensor(_field('label'), dtype=torch.long)
    }

def create_data_loader(texts, labels, batch_size, shuffle=True):
    """Build a DataLoader over a TextEmojiDataset of ``texts`` and ``labels``.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of labels aligned with ``texts``.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            True (the previous hard-coded behaviour); pass False for
            validation/test loaders to keep the original sample order.

    Returns:
        A DataLoader that batches samples with ``collate_fn``.
    """
    ds = TextEmojiDataset(texts, labels)
    return DataLoader(ds, batch_size=batch_size, num_workers=0, shuffle=shuffle, collate_fn=collate_fn)

class SentimentClassifier(nn.Module):
    """CamemBERT pooled output projected to emoji size and fused with an emoji vector.

    The encoder is the module-level ``camembert`` object (shared, not copied),
    so training this model also updates that global encoder.
    """

    def __init__(self, n_classes, hidden_size, emoji_dim):
        super().__init__()
        self.camembert = camembert
        # Project the text vector down to the emoji-vector dimensionality.
        self.fc1 = nn.Linear(hidden_size, emoji_dim)
        # Text projection and emoji vector are concatenated, hence 2 * emoji_dim.
        self.fc2 = nn.Linear(emoji_dim * 2, n_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, emoji_embeddings):
        """Return the class logits for a batch."""
        encoded = self.camembert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.fc1(encoded.pooler_output)
        fused = torch.cat((text_features, emoji_embeddings), dim=1)
        return self.fc2(self.dropout(fused))

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``emoji_embeddings`` keyword arguments; returns logits.
        data_loader: Iterable of dict batches with those keys plus ``label``.
        loss_fn: Callable ``(logits, labels) -> loss``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch.
        n_examples: Denominator for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct / n_examples); mean_loss is the mean batch loss, or NaN
        when the loader yields no batch.
    """
    model.train()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        emoji_embeddings = d['emoji_embeddings'].to(device)
        labels = d['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, emoji_embeddings=emoji_embeddings)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``emoji_embeddings`` keyword arguments; returns logits.
        data_loader: Iterable of dict batches with those keys plus ``label``.
        loss_fn: Callable ``(logits, labels) -> loss``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy.

    Returns:
        ``(accuracy, mean_loss, all_preds, all_labels)``: accuracy is a 0-dim
        double tensor, mean_loss is NaN when the loader yields no batch, and
        the two lists hold predictions and labels in iteration order.
    """
    model.eval()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            emoji_embeddings = d['emoji_embeddings'].to(device)
            labels = d['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, emoji_embeddings=emoji_embeddings)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            # Collected in loader order so preds and labels stay aligned.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss, all_preds, all_labels

EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
SEED = 42

# Seed torch as well as the split: the classifier head initialisation, the
# DataLoader shuffling and dropout all draw from torch's RNG. (Some GPU
# kernels may still be non-deterministic.)
torch.manual_seed(SEED)

train_df, test_df = train_test_split(df_with_emoji_2, test_size=0.2, random_state=SEED)
train_texts = train_df['text'].values
# The label column of this frame is `abusetag2`; it has no `label` column,
# so indexing with 'label' raised a KeyError.
train_labels = train_df['abusetag2'].values
val_texts = test_df['text'].values
val_labels = test_df['abusetag2'].values

train_data_loader = create_data_loader(train_texts, train_labels, BATCH_SIZE)
val_data_loader = create_data_loader(val_texts, val_labels, BATCH_SIZE)

emoji_dim = e2v.vector_size
model = SentimentClassifier(n_classes=2, hidden_size=camembert.config.hidden_size, emoji_dim=emoji_dim)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# Linear decay to zero over all optimisation steps, without warm-up.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_texts))
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # The labels come back in loader order (the loader shuffles); bind them to
    # a separate name so the original val_labels array is not overwritten.
    val_acc, val_loss, val_preds, val_true = eval_model(model, val_data_loader, loss_fn, device, len(val_texts))
    print(f'Val loss {val_loss} accuracy {val_acc}')

# Report on the predictions of the last epoch.
print(classification_report(val_true, val_preds))

### Position-Based

In [None]:
def preprocess(text, max_len=128):
    """Encode ``text`` into per-token CamemBERT states and per-token emoji vectors.

    Args:
        text: The string to encode.
        max_len: Length the token sequence is padded/truncated to.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` (1-D tensors of length
        ``max_len``), ``token_embeddings`` (the encoder's last hidden state,
        one row per token, on the CPU) and ``emoji_embeddings`` (float
        tensor, one emoji2vec row per token, zeros where there is no match).
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids = encoded['input_ids'].squeeze(0)
    attention_mask = encoded['attention_mask'].squeeze(0)

    # The global encoder is the same object the Average classifier wraps and
    # moves with model.to(device), so it may live on the GPU by the time this
    # runs. Feed it inputs on its own device instead of assuming the CPU.
    encoder_device = next(camembert.parameters()).device
    # It may also have been left in training mode; disable dropout so the
    # extracted features are deterministic.
    camembert.eval()
    with torch.no_grad():
        outputs = camembert(
            input_ids.unsqueeze(0).to(encoder_device),
            attention_mask=attention_mask.unsqueeze(0).to(encoder_device),
        )
    # Bring the features back to the CPU; the training loop moves whole
    # batches to the target device.
    token_embeddings = outputs.last_hidden_state.squeeze(0).cpu()  # (seq_len, hidden_size)

    zero_vec = np.zeros(e2v.vector_size)
    emoji_rows = []
    for token in tokenizer.convert_ids_to_tokens(input_ids.tolist()):
        # SentencePiece marks word-initial pieces with a leading '▁'; strip it
        # so such a piece can match the bare-emoji keys of emoji2vec.
        # NOTE(review): emoji missing from the CamemBERT vocabulary presumably
        # come back as the unknown token and still get a zero vector - verify.
        piece = token.lstrip('▁')
        emoji_rows.append(e2v[piece] if piece in e2v else zero_vec)

    # With padding='max_length' the list already has max_len rows; the
    # pad/truncate below is a guard only.
    emoji_rows.extend([zero_vec] * (max_len - len(emoji_rows)))
    emoji_embeddings = np.array(emoji_rows[:max_len])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_embeddings': token_embeddings,
        'emoji_embeddings': torch.tensor(emoji_embeddings, dtype=torch.float)
    }

class TextEmojiDataset(Dataset):
    """Dataset that encodes each text lazily with ``preprocess`` on access."""

    def __init__(self, texts, labels):
        # Parallel, index-aligned sequences of raw texts and labels.
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors produced by ``preprocess`` plus the label of sample ``idx``."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = preprocess(text)
        # Every preprocess field is already a tensor; copy them over as-is.
        sample = {
            key: encoded[key]
            for key in ('input_ids', 'attention_mask', 'token_embeddings', 'emoji_embeddings')
        }
        sample['label'] = torch.tensor(label, dtype=torch.long)
        return sample

def collate_fn(batch):
    """Merge a list of per-sample dicts into one dict of batched tensors.

    ``input_ids``, ``attention_mask`` and ``token_embeddings`` are right-padded
    to the longest sequence in the batch; ``emoji_embeddings`` are stacked,
    which requires every sample to have the same shape.
    """
    def _field(key):
        # Collect one field across all samples of the batch.
        return [item[key] for item in batch]

    return {
        'input_ids': pad_sequence(_field('input_ids'), batch_first=True, padding_value=tokenizer.pad_token_id),
        'attention_mask': pad_sequence(_field('attention_mask'), batch_first=True, padding_value=0),
        'token_embeddings': pad_sequence(_field('token_embeddings'), batch_first=True),
        'emoji_embeddings': torch.stack(_field('emoji_embeddings')),
        'label': torch.tensor(_field('label'), dtype=torch.long)
    }

def create_data_loader(texts, labels, batch_size, shuffle=True):
    """Build a DataLoader over a TextEmojiDataset of ``texts`` and ``labels``.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of labels aligned with ``texts``.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            True (the previous hard-coded behaviour); pass False for
            validation/test loaders to keep the original sample order.

    Returns:
        A DataLoader that batches samples with ``collate_fn``.
    """
    ds = TextEmojiDataset(texts, labels)
    return DataLoader(ds, batch_size=batch_size, num_workers=0, shuffle=shuffle, collate_fn=collate_fn)

class SentimentClassifier(nn.Module):
    """Classifier head over precomputed token states and per-token emoji vectors.

    The encoder is not part of this module; it receives the token embeddings
    as an input.
    """

    def __init__(self, n_classes, hidden_size, emoji_dim):
        super().__init__()
        self.fc_text = nn.Linear(hidden_size, hidden_size)
        self.fc_combined = nn.Linear(hidden_size + emoji_dim, n_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, token_embeddings, emoji_embeddings):
        """Return class logits from ``(batch, seq, dim)``-shaped inputs."""
        projected = self.fc_text(token_embeddings)
        # Join text and emoji features per token along the feature axis.
        fused = torch.cat((projected, emoji_embeddings), dim=2)
        # Mean-pool over the sequence axis; every position is included
        # because no attention mask is applied here.
        pooled = self.dropout(fused.mean(dim=1))
        return self.fc_combined(pooled)

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called with ``token_embeddings`` and
            ``emoji_embeddings`` keyword arguments; returns logits.
        data_loader: Iterable of dict batches with those keys plus ``label``.
        loss_fn: Callable ``(logits, labels) -> loss``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch.
        n_examples: Denominator for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct / n_examples); mean_loss is the mean batch loss, or NaN
        when the loader yields no batch.
    """
    model.train()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        token_embeddings = d['token_embeddings'].to(device)
        emoji_embeddings = d['emoji_embeddings'].to(device)
        labels = d['label'].to(device)

        outputs = model(token_embeddings=token_embeddings, emoji_embeddings=emoji_embeddings)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: Module called with ``token_embeddings`` and
            ``emoji_embeddings`` keyword arguments; returns logits.
        data_loader: Iterable of dict batches with those keys plus ``label``.
        loss_fn: Callable ``(logits, labels) -> loss``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy.

    Returns:
        ``(accuracy, mean_loss, all_preds, all_labels)``: accuracy is a 0-dim
        double tensor, mean_loss is NaN when the loader yields no batch, and
        the two lists hold predictions and labels in iteration order.
    """
    model.eval()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            token_embeddings = d['token_embeddings'].to(device)
            emoji_embeddings = d['emoji_embeddings'].to(device)
            labels = d['label'].to(device)

            outputs = model(token_embeddings=token_embeddings, emoji_embeddings=emoji_embeddings)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            # Collected in loader order so preds and labels stay aligned.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss, all_preds, all_labels

EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
SEED = 42

# Seed torch as well as the split: the classifier head initialisation, the
# DataLoader shuffling and dropout all draw from torch's RNG. (Some GPU
# kernels may still be non-deterministic.)
torch.manual_seed(SEED)

train_df, test_df = train_test_split(df_with_emoji_2, test_size=0.2, random_state=SEED)
train_texts = train_df['text'].values
# The label column of this frame is `abusetag2`; it has no `label` column,
# so indexing with 'label' raised a KeyError.
train_labels = train_df['abusetag2'].values
val_texts = test_df['text'].values
val_labels = test_df['abusetag2'].values

train_data_loader = create_data_loader(train_texts, train_labels, BATCH_SIZE)
val_data_loader = create_data_loader(val_texts, val_labels, BATCH_SIZE)

emoji_dim = e2v.vector_size
model = SentimentClassifier(n_classes=2, hidden_size=camembert.config.hidden_size, emoji_dim=emoji_dim)
model = model.to(device)

# NOTE(review): only the classifier head is optimised here (the encoder is
# not part of `model`); 2e-5 may be low for a freshly initialised head -
# confirm the learning rate.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# Linear decay to zero over all optimisation steps, without warm-up.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_texts))
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # The labels come back in loader order (the loader shuffles); bind them to
    # a separate name so the original val_labels array is not overwritten.
    val_acc, val_loss, val_preds, val_true = eval_model(model, val_data_loader, loss_fn, device, len(val_texts))
    print(f'Val loss {val_loss} accuracy {val_acc}')

# Report on the predictions of the last epoch.
print(classification_report(val_true, val_preds))