<a href="https://colab.research.google.com/github/MatteoFasulo/Sexism-detection/blob/main/assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [2]:
import requests
from pathlib import Path
import re
import json
from typing import OrderedDict

import numpy as np
import pandas as pd

import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer

import gensim
import gensim.downloader as gloader

import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
tf

# Class definition

In [3]:
class SexismDetector:
    def __init__(self):
        
        URL_PATTERN_STR = r"""(?i)((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info
                      |int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|
                      bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|
                      cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|
                      gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|
                      la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|
                      nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|
                      sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|
                      uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]
                      *?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)
                      [a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name
                      |post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn
                      |bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg
                      |eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id
                      |ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|
                      md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|
                      ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|
                      sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|
                      za|zm|zw)\b/?(?!@)))"""
        self.URL_PATTERN = re.compile(URL_PATTERN_STR, re.IGNORECASE)
        self.HASHTAG_PATTERN = re.compile(r'#\w*')
        self.MENTION_PATTERN = re.compile(r'@\w*')
        self.EMOJIS_PATTERN = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
        self.SPECIAL_CHARACTERS_PATTERN = re.compile(r'&lt;/?[a-z]+&gt;')
        self.AND_PATTERN = re.compile(r'&amp;')
        self.WORD_PATTERN = re.compile(r'[^a-zA-Z\s]')
        self.SEED = 42
        self.DATA_FOLDER = Path('data')
        self.columns_to_maintain = ['id_EXIST', 'lang', 'tweet', 'hard_label_task1']
        self.UNK_TOKEN = '[UNK]'
        self.PAD_TOKEN = '[PAD]'

    def download_corpus(self, url: str, filename: str):
        if not self.DATA_FOLDER.exists():
            self.DATA_FOLDER.mkdir(parents=True)
            print(f"Created folder {self.DATA_FOLDER}.")
            
        response = requests.get(url)
        response.raise_for_status()
        with open(self.DATA_FOLDER / filename, 'w', encoding='utf-8') as f:
            f.write(response.text)

    def load_corpus(self, filename: str, *args, **kwargs):
        return pd.read_json(self.DATA_FOLDER / filename, *args, **kwargs)

    @staticmethod
    def majority_voting(votes: list[str]) -> str:
        total_num_votes = len(votes)
        yes_votes = votes.count("YES")
        no_votes = total_num_votes - yes_votes

        if yes_votes > no_votes:
            return "YES"
        elif no_votes > yes_votes:
            return "NO"
        else:
            return "NEUTRAL" # This will be the case when there is a tie (removed later)

    def preprocess_text(self, text: str) -> str:
        text = self.URL_PATTERN.sub('', text)
        text = self.MENTION_PATTERN.sub('', text)
        text = self.HASHTAG_PATTERN.sub('', text)
        text = self.EMOJIS_PATTERN.sub('', text)
        text = self.SPECIAL_CHARACTERS_PATTERN.sub('', text)
        text = self.AND_PATTERN.sub('and', text)
        text = text.strip()
        text = self.WORD_PATTERN.sub(' ', text)
        text = text.strip()
        return text

    def lemmatize_text(self, text: str) -> str:
        lemmatizer = WordNetLemmatizer()
        downloaded = False
        while not downloaded:
            try:
                lemmatizer.lemmatize(text)
                downloaded = True
            except LookupError:
                print("Downloading WordNet...")
                nltk.download('wordnet')
        return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    @staticmethod
    def text_diff(original_text: str, preprocessed_text: str, random: bool = True):
        if random:
            idx = np.random.randint(0, preprocessed_text.shape[0])
        else:
            idx = 0

        print(f"Original tweet:\n{original_text['tweet'].iloc[idx]}")
        print(f"Processed tweet:\n{preprocessed_text['tweet'].iloc[idx]}")

    def load_glove(self, model_name: str = 'glove-wiki-gigaword', embedding_dim: int = 50):
        self.EMBEDDING_DIM = embedding_dim
        return gloader.load(f"{model_name}-{embedding_dim}")

    def get_vocab(self, data: pd.DataFrame, word_listing: list = None) -> OrderedDict:
        idx_to_word = OrderedDict()
        word_to_idx = OrderedDict()

        tokenizer = nltk.tokenize.NLTKWordTokenizer()

        if word_listing is None:
            curr_idx = 0
            for sentence in data['tweet'].values:
                tokens = tokenizer.tokenize(sentence)
                for token in tokens:
                    if token not in word_to_idx:
                        word_to_idx[token] = curr_idx
                        idx_to_word[curr_idx] = token
                        curr_idx += 1

        else:
            word_to_idx[self.UNK_TOKEN] = 0
            idx_to_word[0] = self.UNK_TOKEN

            curr_idx = 1
            for sentence in data['tweet'].values:
                tokens = sentence.split()
                for token in word_listing:
                    if token not in word_to_idx:
                        word_to_idx[token] = curr_idx
                        idx_to_word[curr_idx] = token
                        curr_idx += 1
        
        return idx_to_word, word_to_idx

    def get_augmented_vocab(self, emb_model: gensim.models.keyedvectors.KeyedVectors, train_words: list, save: bool = False) -> gensim.models.keyedvectors.KeyedVectors:
        embedding_vocab = set(emb_model.key_to_index.keys())

        new_tokens = []
        new_vectors = []
        for token in train_words:
            if token not in embedding_vocab:
                embedding_vocab.add(token)
            try:
                embedding_vec = emb_model.get_vector(token)
            except (KeyError, ValueError):
                embedding_vec = np.random.uniform(low=-0.05, high=0.05, size=self.EMBEDDING_DIM)

            new_tokens.append(token)
            new_vectors.append(embedding_vec)

        emb_model.add_vectors(new_tokens, new_vectors)

        if save:
            vocab_path = self.DATA_FOLDER / 'vocab2.json'
            print(f"Saving vocab to {vocab_path}")
            with vocab_path.open('w', encoding='utf-8') as f:
                json.dump(emb_model.key_to_index, f, indent=4)
            print("Vocab saved!")

        return emb_model

    def get_oov_stats(self, embedding_model: gensim.models.keyedvectors.KeyedVectors, word_listing: list) -> None:
        OOV_count = set(word_listing).difference(set(embedding_model.key_to_index.keys()))
        OOV_percentage = float(len(OOV_count)) * 100 / len(word_listing)

        print(f"Total OOV terms: {len(OOV_count)} ({OOV_percentage:.2f}%)")

    @staticmethod
    def visualize_embeddings(embeddings: np.ndarray,
                         word_annotations,
                         word_to_idx):
        """
        Plots given reduce word embeddings (2D). Users can highlight specific words (word_annotations list).

        :param embeddings: word embedding matrix of shape (words, 2) retrieved via a dimensionality reduction technique.
        :param word_annotations: list of words to be annotated.
        :param word_to_idx: vocabulary map (word -> index) (dict)
        """
        fig, ax = plt.subplots(1, 1, figsize=(15, 12))

        if word_annotations:
            print(f"Annotating words: {word_annotations}")

            word_indexes = []
            for word in word_annotations:
                word_index = word_to_idx[word]
                word_indexes.append(word_index)

            word_indexes = np.array(word_indexes)

            other_embeddings = embeddings[np.setdiff1d(np.arange(embeddings.shape[0]), word_indexes)]
            target_embeddings = embeddings[word_indexes]

            ax.scatter(other_embeddings[:, 0], other_embeddings[:, 1], alpha=0.1, c='blue')
            ax.scatter(target_embeddings[:, 0], target_embeddings[:, 1], alpha=1.0, c='red')
            ax.scatter(target_embeddings[:, 0], target_embeddings[:, 1], alpha=1, facecolors='none', edgecolors='r', s=1000)

            for word, word_index in zip(word_annotations, word_indexes):
                word_x, word_y = embeddings[word_index, 0], embeddings[word_index, 1]
                ax.annotate(word, xy=(word_x, word_y))
        else:
            ax.scatter(embeddings[:, 0], embeddings[:, 1], alpha=0.1, c='blue')

        # We avoid outliers ruining the visualization if they are quite far away
        axis_x_limit = (np.min(embeddings[:, 0]), np.max(embeddings[:, 0]))
        axis_y_limit = (np.min(embeddings[:, 1]), np.max(embeddings[:, 1]))
        plt.xlim(left=axis_x_limit[0] - 0.5, right=axis_x_limit[1] + 0.5)
        plt.ylim(bottom=axis_y_limit[0] - 0.5, top=axis_y_limit[1] + 0.5)
        ax.set_xlim(axis_x_limit[0], axis_x_limit[1])
        ax.set_ylim(axis_y_limit[0], axis_y_limit[1])

    @staticmethod
    def reduce_SVD(embeddings: np.ndarray) -> np.ndarray:
        """
        Applies SVD dimensionality reduction.

        :param embeddings: word embedding matrix of shape (words, dim). In the case
                        of a word-word co-occurrence matrix the matrix shape would
                        be (words, words).

        :return
            - 2-dimensional word embedding matrix of shape (words, 2)
        """
        print("Running SVD reduction method...")
        svd = TruncatedSVD(n_components=2, n_iter=10)
        reduced = svd.fit_transform(embeddings)
        print("SVD reduction completed!")

        return reduced

    @staticmethod
    def reduce_tSNE(embeddings: np.ndarray) -> np.ndarray:
        """
        Applies t-SNE dimensionality reduction.
        """
        print("Running t-SNE reduction method... (it may take a while...)")
        tsne = TSNE(n_components=2, n_iter=1000, metric='cosine')
        reduced = tsne.fit_transform(embeddings)
        print("t-SNE reduction completed!")

        return reduced

    @staticmethod
    def reduce_umap(embeddings: np.ndarray) -> np.ndarray:
        """
        Applies UMAP dimensionality reduction.
        """
        print("Running UMAP reduction method... (it may take a while...)")
        umap_emb = umap.UMAP(n_components=2, metric='cosine')
        reduced = umap_emb.fit_transform(embeddings)
        print("UMAP reduction completed!")
        
        return reduced
    

# Task 1: Corpus

1. Download the data
2. Load the JSON files and encode them as a DataFrame
3. Generate hard labels for Task 1 with majority voting
4. Filter the DataFrame for only english tweets
5. Remove unwanted columns
6. Encode the hard labels column as integers

>**Bonus**: explore also Spanish tweets leveraging multi-language models and assessing the performance of the model on the two languages in comparison to the English-only model.

In [4]:
detector = SexismDetector()

### Download the data

In [5]:
detector.download_corpus(url='https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/training.json', filename='training.json')
detector.download_corpus(url='https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/test.json', filename='test.json')
detector.download_corpus(url='https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/validation.json', filename='validation.json')

### Load the JSON files and encode them as a DataFrame

In [6]:
train = detector.load_corpus('training.json', orient='index', encoding='utf-8')
test = detector.load_corpus('test.json', orient='index', encoding='utf-8')
val = detector.load_corpus('validation.json', orient='index', encoding='utf-8')

In [7]:
train.head()

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,labels_task1,labels_task2,labels_task3,split
100001,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES
100002,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES
100003,100003,es,"@Steven2897 Lee sobre Gamergate, y como eso ha...",6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES
100004,100004,es,@Lunariita7 Un retraso social bastante lamenta...,6,"[Annotator_13, Annotator_14, Annotator_15, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, YES, NO, YES, YES]","[-, -, DIRECT, -, REPORTED, REPORTED]","[[-], [-], [IDEOLOGICAL-INEQUALITY], [-], [IDE...",TRAIN_ES
100005,100005,es,@novadragon21 @icep4ck @TvDannyZ Entonces como...,6,"[Annotator_19, Annotator_20, Annotator_21, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",TRAIN_ES


In [8]:
train.shape, val.shape, test.shape

((6920, 11), (726, 11), (312, 11))

### Generate hard labels for Task 1 with majority voting

In [9]:
train['hard_label_task1'] = train['labels_task1'].apply(detector.majority_voting)
val['hard_label_task1'] = val['labels_task1'].apply(detector.majority_voting)
test['hard_label_task1'] = test['labels_task1'].apply(detector.majority_voting)

### Filter the DataFrame for only english tweets and remove unclear tweets

In [10]:
train = train[(train['hard_label_task1'] != "NEUTRAL") & (train['lang'] == "en")]
val = val[(val['hard_label_task1'] != "NEUTRAL") & (val['lang'] == "en")]
test = test[(test['hard_label_task1'] != "NEUTRAL") & (test['lang'] == "en")]

In [11]:
train.shape, val.shape, test.shape

((2870, 12), (158, 12), (286, 12))

### Remove unwanted columns

In [12]:
train = train[detector.columns_to_maintain]
val = val[detector.columns_to_maintain]
test = test[detector.columns_to_maintain]

In [13]:
train.head()

Unnamed: 0,id_EXIST,lang,tweet,hard_label_task1
200002,200002,en,Writing a uni essay in my local pub with a cof...,YES
200003,200003,en,@UniversalORL it is 2021 not 1921. I dont appr...,YES
200006,200006,en,According to a customer I have plenty of time ...,YES
200007,200007,en,"So only 'blokes' drink beer? Sorry, but if you...",YES
200008,200008,en,New to the shelves this week - looking forward...,NO


### Encode the hard labels column as integers

In [14]:
train['hard_label_task1'] = train['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)
val['hard_label_task1'] = val['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)
test['hard_label_task1'] = test['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)

In [15]:
train.hard_label_task1.value_counts()

hard_label_task1
0    1733
1    1137
Name: count, dtype: int64

# Task 2: Data Cleaning

1. Remove emojis
2. Remove hashtags (e.g. #metoo)
3. Remove mentions (e.g. @user)
4. Remove URLs
5. Remove special characters and symbols
6. Remove specific quote characters (e.g. curly quotes)
7. Perform lemmatization

>**Bonus**: use other preprocessing strategies exploring techniques tailored specifically for tweets or methods that are common in social media text processing.

The way to go (priority order) is the following:
1. Remove URLs
2. Remove mentions
3. Remove hashtags
4. Remove emojis
5. Remove special characters
6. Remove specific quote characters
7. Perform lemmatization

In [16]:
original_train = train.copy()

In [17]:
train['tweet'] = train['tweet'].apply(detector.preprocess_text)
val['tweet'] = val['tweet'].apply(detector.preprocess_text)
test['tweet'] = test['tweet'].apply(detector.preprocess_text)

### Perform lemmatization

In [18]:
train['tweet'] = train['tweet'].apply(detector.lemmatize_text)
val['tweet'] = val['tweet'].apply(detector.lemmatize_text)
test['tweet'] = test['tweet'].apply(detector.lemmatize_text)

### Cast text to lowercase

In [19]:
train['tweet'] = train['tweet'].str.lower()
val['tweet'] = val['tweet'].str.lower()
test['tweet'] = test['tweet'].str.lower()

### Show the difference between the original and cleaned text

In [20]:
detector.text_diff(preprocessed_text=train, original_text=original_train, random=True)

Original tweet:
@bobbybenitez81 In what sense? Elders wives are mostly busybodies who're constantly meddling in peoples business but women have no clout to infiltrate the boys club. A woman's silence is having her rights removed as it relates to taking the lead and speaking in a teaching capacity in the KHs
Processed tweet:
in what sense elders wife are mostly busybody who re constantly meddling in people business but woman have no clout to infiltrate the boy club a woman s silence is having her right removed a it relates to taking the lead and speaking in a teaching capacity in the khs


# Task 3: Text Encoding

* Embed words using GloVe embeddings.

### Embed words using GloVe embeddings

In [21]:
emb_model = detector.load_glove(model_name='glove-wiki-gigaword', embedding_dim=50)
len(emb_model.key_to_index.keys())

400000

In [22]:
train_idx_to_word, train_word_to_idx = detector.get_vocab(train)
train_word_listing = list(train_idx_to_word.values())

In [23]:
detector.get_oov_stats(emb_model, train_word_listing)

Total OOV terms: 868 (8.81%)


In [24]:
emb_model_augmented = detector.get_augmented_vocab(emb_model, train_words=train_word_listing, save=True)

Saving vocab to data\vocab2.json
Vocab saved!


In [25]:
val_idx_to_word, val_word_to_idx = detector.get_vocab(val, word_listing=train_word_listing)
test_idx_to_word, test_word_to_idx = detector.get_vocab(test, word_listing=train_word_listing)

val_word_listing = list(val_idx_to_word.values())
test_word_listing = list(test_idx_to_word.values())

In [26]:
# add the UNK token to the embedding model with the vector which is the average of all the vectors
emb_model_augmented.add_vectors(["[UNK]", "[PAD]"], [np.mean(emb_model.vectors, axis=0), np.zeros(50)])

In [27]:
detector.get_oov_stats(emb_model_augmented, val_word_listing)
detector.get_oov_stats(emb_model_augmented, test_word_listing)

Total OOV terms: 0 (0.00%)
Total OOV terms: 0 (0.00%)


In [28]:
### Create the embeddings matrix
embedding_matrix = emb_model.vectors
embedding_matrix.shape

(400870, 50)

In [29]:
# UMAP
#reduced_embedding_umap = detector.reduce_umap(embedding_matrix)
#detector.visualize_embeddings(reduced_embedding_umap, ['whore', 'woman', 'slut', 'girl', 'man', 'boy'], emb_model.key_to_index)

#plt.show()

# Task 4: Model definition

* Baseline: Implement a Bidirectional LSTM with a Dense layer on top.
* Model 1: add an additional LSTM layer to the baseline model.

### Baseline: Implement a Bidirectional LSTM with a Dense layer on top

In [70]:
def prepare_sequence(sequences: list, embedding_model):
    # instantiate the tokenizer
    tokenizer = nltk.NLTKWordTokenizer()
    # tokenize each sequence
    tokenized_seqs = [tokenizer.tokenize(seq) for seq in sequences]
    # convert them to numerical form
    numerical_seqs = ([torch.tensor([embedding_model.get_index(token) for token in seq]) for seq in tokenized_seqs])
    # pad the sequences and cast them to tensor
    return torch.nn.utils.rnn.pad_sequence(sequences=numerical_seqs, batch_first=True, padding_value=embedding_model.get_index(detector.PAD_TOKEN))

In [71]:
prepare_sequence(sequences=train['tweet'], embedding_model=emb_model_augmented)

tensor([[  1649,      7,  36352,  ..., 400869, 400869, 400869],
        [    20,     14,     36,  ..., 400869, 400869, 400869],
        [   200,      4,      7,  ..., 400869, 400869, 400869],
        ...,
        [   190,    285,    411,  ..., 400869, 400869, 400869],
        [   738,     88,     81,  ..., 400869, 400869, 400869],
        [ 23088,     61,  27487,  ..., 400869, 400869, 400869]])

In [101]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data: pd.DataFrame, embedding_model):
        self.data = data
        self.embedding_model = embedding_model
    
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        tweet_text = self.data['tweet'].iloc[index]
        encoded_tweet = prepare_sequence(sequences=[tweet_text], embedding_model=self.embedding_model)
        return encoded_tweet, self.data['hard_label_task1'].iloc[index]

In [102]:
data = CustomDataset(data=train, embedding_model=emb_model_augmented)
train_dataloader = torch.utils.data.DataLoader(data, shuffle=True)

In [105]:
next(iter(train_dataloader))[0].shape, next(iter(train_dataloader))[1].shape

(torch.Size([1, 1, 50]), torch.Size([1]))

In [106]:
#train_dataloader = torch.utils.data.DataLoader(prepare_sequence(sequences=train['tweet'], embedding_model=emb_model_augmented), batch_size=32, shuffle=True)

In [120]:
class BaselineLSTM(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(BaselineLSTM, self).__init__()

        self.word_embeddings = torch.nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix), freeze=False)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.hidden2tag = torch.nn.Linear(hidden_dim *2, output_dim)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = torch.sigmoid(tag_space)
        return tag_scores

model = BaselineLSTM(embedding_dim=50, hidden_dim=256, output_dim=1)
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)

for epoch in range(20):
    for batch in train_dataloader:
        sentences, labels = batch

        model.zero_grad()
        tag_scores = model(sentences.squeeze(1))
        tag_scores = tag_scores.mean(dim=1)  # Take the mean along the sequence dimension
        loss = loss_function(tag_scores.squeeze(), labels.float().squeeze())
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

KeyboardInterrupt: 

# Task 5: Training and Evaluation

# Task 6: Transformers

# Task 7: Error Analysis

# Task 8: Report