<a href="https://colab.research.google.com/github/MatteoFasulo/Sexism-detection/blob/main/assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import os
import requests
from pathlib import Path
import re
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from emoji import unicode_codes

from nltk.stem import WordNetLemmatizer

import gensim
import gensim.downloader as gloader

### Constants and Setup

In [2]:
URL_PATTERN_STR = r"""(?i)((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info
                      |int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|
                      bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|
                      cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|
                      gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|
                      la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|
                      nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|
                      sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|
                      uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]
                      *?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)
                      [a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name
                      |post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn
                      |bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg
                      |eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id
                      |ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|
                      md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|
                      ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|
                      sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|
                      za|zm|zw)\b/?(?!@)))"""
URL_PATTERN = re.compile(URL_PATTERN_STR, re.IGNORECASE)
HASHTAG_PATTERN = re.compile(r'#\w*')
MENTION_PATTERN = re.compile(r'@\w*')
RESERVED_WORDS_PATTERN = re.compile(r'\b(?<![@#])(RT|FAV)\b')
EMOJIS_PATTERN = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
SMILEYS_PATTERN = re.compile(r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE)
SPECIAL_CHARACTERS_PATTERN = re.compile(r'&lt;/?[a-z]+&gt;')
DIGIT_PATTERN = re.compile(r'\d+')
AND_PATTERN = re.compile(r'&amp;')

# Task 1: Corpus

1. Download the data
2. Load the JSON files and encode them as a DataFrame
3. Generate hard labels for Task 1 with majority voting
4. Filter the DataFrame for only english tweets
5. Remove unwanted columns
6. Encode the hard labels column as integers

>**Bonus**: explore also Spanish tweets leveraging multi-language models and assessing the performance of the model on the two languages in comparison to the English-only model.

### Download the data

In [3]:
if not Path("data").exists():
    os.mkdir("data")
    print("data directory created")

train_url = 'https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/training.json'
test_url = 'https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/test.json'
val_url = 'https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/validation.json'

In [4]:
train = requests.get(train_url)
test = requests.get(test_url)
val = requests.get(val_url)

with open('data/training.json', 'w') as f:
    f.write(train.text)

with open('data/test.json', 'w') as f:
    f.write(test.text)

with open('data/validation.json', 'w') as f:
    f.write(val.text)

### Load the JSON files and encode them as a DataFrame

In [5]:
train = pd.read_json("data/training.json", orient='index', encoding='utf-8')
val = pd.read_json("data/validation.json", orient='index', encoding='utf-8')
test = pd.read_json("data/test.json", orient='index', encoding='utf-8')

In [6]:
train.head()

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,labels_task1,labels_task2,labels_task3,split
100001,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES
100002,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES
100003,100003,es,"@Steven2897 Lee sobre Gamergate, y como eso ha...",6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES
100004,100004,es,@Lunariita7 Un retraso social bastante lamenta...,6,"[Annotator_13, Annotator_14, Annotator_15, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, YES, NO, YES, YES]","[-, -, DIRECT, -, REPORTED, REPORTED]","[[-], [-], [IDEOLOGICAL-INEQUALITY], [-], [IDE...",TRAIN_ES
100005,100005,es,@novadragon21 @icep4ck @TvDannyZ Entonces como...,6,"[Annotator_19, Annotator_20, Annotator_21, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",TRAIN_ES


In [7]:
train.shape, val.shape, test.shape

((6920, 11), (726, 11), (312, 11))

### Generate hard labels for Task 1 with majority voting

In [8]:
def majority_voting(votes: list[str]) -> str:
    total_num_votes = len(votes)
    yes_votes = votes.count("YES")
    no_votes = total_num_votes - yes_votes

    if yes_votes > no_votes:
        return "YES"
    elif no_votes > yes_votes:
        return "NO"
    else:
        return "NEUTRAL" # This will be the case when there is a tie (removed later)

In [9]:
train['hard_label_task1'] = train['labels_task1'].apply(majority_voting)
val['hard_label_task1'] = val['labels_task1'].apply(majority_voting)
test['hard_label_task1'] = test['labels_task1'].apply(majority_voting)

### Filter the DataFrame for only english tweets and remove unclear tweets

In [10]:
train = train[(train['hard_label_task1'] != "NEUTRAL") & (train['lang'] == "en")]
val = val[(val['hard_label_task1'] != "NEUTRAL") & (val['lang'] == "en")]
test = test[(test['hard_label_task1'] != "NEUTRAL") & (test['lang'] == "en")]

In [11]:
train.shape, val.shape, test.shape

((2870, 12), (158, 12), (286, 12))

### Remove unwanted columns

In [12]:
columns_to_maintain = ['id_EXIST', 'lang', 'tweet', 'hard_label_task1']

train = train[columns_to_maintain]
val = val[columns_to_maintain]
test = test[columns_to_maintain]

In [13]:
train.head()

Unnamed: 0,id_EXIST,lang,tweet,hard_label_task1
200002,200002,en,Writing a uni essay in my local pub with a cof...,YES
200003,200003,en,@UniversalORL it is 2021 not 1921. I dont appr...,YES
200006,200006,en,According to a customer I have plenty of time ...,YES
200007,200007,en,"So only 'blokes' drink beer? Sorry, but if you...",YES
200008,200008,en,New to the shelves this week - looking forward...,NO


### Encode the hard labels column as integers

In [14]:
train['hard_label_task1'] = train['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)
val['hard_label_task1'] = val['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)
test['hard_label_task1'] = test['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)

In [15]:
train.hard_label_task1.value_counts()

hard_label_task1
0    1733
1    1137
Name: count, dtype: int64

# Task 2: Data Cleaning

1. Remove emojis
2. Remove hashtags (e.g. #metoo)
3. Remove mentions (e.g. @user)
4. Remove URLs
5. Remove special characters and symbols
6. Remove specific quote characters (e.g. curly quotes)
7. Perform lemmatization

>**Bonus**: use other preprocessing strategies exploring techniques tailored specifically for tweets or methods that are common in social media text processing.

The way to go (priority order) is the following:
1. Remove URLs
2. Remove mentions
3. Remove hashtags
4. Remove emojis
5. Remove special characters
6. Remove specific quote characters
7. Perform lemmatization

In [16]:
original_train = train.copy()

### Remove URLs

In [17]:
train['tweet'] = train['tweet'].apply(lambda x: URL_PATTERN.sub('', x))
val['tweet'] = val['tweet'].apply(lambda x: URL_PATTERN.sub('', x))
test['tweet'] = test['tweet'].apply(lambda x: URL_PATTERN.sub('', x))

### Remove mentions

In [18]:
train['tweet'] = train['tweet'].apply(lambda x: MENTION_PATTERN.sub('', x))
val['tweet'] = val['tweet'].apply(lambda x: MENTION_PATTERN.sub('', x))
test['tweet'] = test['tweet'].apply(lambda x: MENTION_PATTERN.sub('', x))

### Remove hashtags

In [19]:
train['tweet'] = train['tweet'].apply(lambda x: HASHTAG_PATTERN.sub('', x))
val['tweet'] = val['tweet'].apply(lambda x: HASHTAG_PATTERN.sub('', x))
test['tweet'] = test['tweet'].apply(lambda x: HASHTAG_PATTERN.sub('', x))

### Remove emojis

Leveraging the `emoji` library, it is possible to retrieve all the emojis codes. Then, we can remove them from the text very easily.
>**Note**: the `emoji` library has also a `replace_emoji()` function that can be used to replace the emojis with a specific string. However, for the purpose of this task, we will remove them iterating over the text instead of using the `replace_emoji()` function.

In [20]:
def remove_emoji(text: str):
    emojis = unicode_codes.EMOJI_DATA
    result = text
    for emoji in emojis:
        result = result.replace(emoji, "")
    return result

In [21]:
train['tweet'] = train['tweet'].apply(remove_emoji)
val['tweet'] = val['tweet'].apply(remove_emoji)
test['tweet'] = test['tweet'].apply(remove_emoji)

### Remove special characters

In [22]:
train['tweet'] = train['tweet'].apply(lambda x: SPECIAL_CHARACTERS_PATTERN.sub(' ', x))
val['tweet'] = val['tweet'].apply(lambda x: SPECIAL_CHARACTERS_PATTERN.sub(' ', x))
test['tweet'] = test['tweet'].apply(lambda x: SPECIAL_CHARACTERS_PATTERN.sub(' ', x))

train['tweet'] = train['tweet'].apply(lambda x: AND_PATTERN.sub('and', x))
val['tweet'] = val['tweet'].apply(lambda x: AND_PATTERN.sub('and', x))
test['tweet'] = test['tweet'].apply(lambda x: AND_PATTERN.sub('and', x))

train['tweet'] = train['tweet'].str.strip()
val['tweet'] = val['tweet'].str.strip()
test['tweet'] = test['tweet'].str.strip()

# Leave only words, remove any other special character, symbol or specific quote character
train['tweet'] = train['tweet'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
val['tweet'] = val['tweet'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
test['tweet'] = test['tweet'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))

train['tweet'] = train['tweet'].str.strip()
val['tweet'] = val['tweet'].str.strip()
test['tweet'] = test['tweet'].str.strip()

### Remove specific quote characters

### Perform lemmatization

In [23]:
lemmatizer = WordNetLemmatizer()

In [24]:
train['tweet'] = train['tweet'].apply(lambda x: lemmatizer.lemmatize(x))
val['tweet'] = val['tweet'].apply(lambda x: lemmatizer.lemmatize(x))
test['tweet'] = test['tweet'].apply(lambda x: lemmatizer.lemmatize(x))

### Put everything lowercase

In [25]:
train['tweet'] = train['tweet'].apply(lambda x: x.lower())
val['tweet'] = val['tweet'].apply(lambda x: x.lower())
test['tweet'] = test['tweet'].apply(lambda x: x.lower())

In [47]:
def show_diff(random: bool = True):
    if random:
        idx = np.random.randint(0, train.shape[0])
    else:
        idx = 0

    print(f"Original tweet:\n{original_train['tweet'].iloc[idx]}")
    print(f"Processed tweet:\n{train['tweet'].iloc[idx]}")
    
show_diff()

Original tweet:
@BernieSanders Its also quite common in agri pastoral societies for mothers to be back working on the fields 72 hrs - 1 week after giving birth.What America has is too many pressure groups talking down to the most vulnerable (like, mothers, children, teens) in a bid to tell them what to do.
Processed tweet:
its also quite common in agri pastoral societies for mothers to be back working on the fields    hrs     week after giving birth what america has is too many pressure groups talking down to the most vulnerable  like  mothers  children  teens  in a bid to tell them what to do


# Task 3: Text Encoding

* Embed words using GloVe embeddings.

### Embed words using GloVe embeddings

In [48]:
dim = 50
model = 'glove-twitter'
emb_model = gloader.load(f"{model}-{dim}")
unk_embedding = np.zeros(dim)

In [49]:
from typing import OrderedDict


def get_vocab(data: pd.DataFrame) -> OrderedDict:
    idx_to_word = OrderedDict()
    word_to_idx = OrderedDict()

    curr_idx = 0
    for sentence in data['tweet'].values:
        tokens = sentence.split()
        for token in tokens:
            if token not in word_to_idx:
                word_to_idx[token] = curr_idx
                idx_to_word[curr_idx] = token
                curr_idx += 1

    return idx_to_word, word_to_idx

train_idx_to_word, train_word_to_idx = get_vocab(train)
train_word_listing = list(train_idx_to_word.values())

In [50]:
print("Number of OOV words in the training set:", len(set(train_word_listing).difference(set(emb_model.key_to_index.keys()))))

Number of OOV words in the training set: 815


In [51]:
set(train_word_listing).difference(set(emb_model.key_to_index.keys()))

{'acephalous',
 'actionaid',
 'actium',
 'activismby',
 'admonitions',
 'adorime',
 'aeneid',
 'afterpay',
 'againmaybe',
 'agreementsoctober',
 'ahhhhhh',
 'ainihin',
 'aislated',
 'akaabir',
 'akaabireen',
 'akdhajhdajjahshh',
 'allegationsjudge',
 'allprobably',
 'allvotebluein',
 'allwhere',
 'allyship',
 'alreadyeither',
 'alresdy',
 'alterlife',
 'altseason',
 'amadour',
 'amadudu',
 'ambanii',
 'ammm',
 'amogus',
 'amrylin',
 'andbred',
 'anddodge',
 'andhkaar',
 'andlight',
 'andmayb',
 'androcentrism',
 'andsome',
 'anglicised',
 'angries',
 'angrywhen',
 'angryyou',
 'annulments',
 'antivax',
 'applicationbut',
 'appropriative',
 'arbella',
 'archipi',
 'arkady',
 'arklight',
 'asapits',
 'assignations',
 'assignmentsonline',
 'assuages',
 'astrodomina',
 'asymmetries',
 'aucklanders',
 'authoritarians',
 'autogynephiliait',
 'availiaple',
 'avunlocal',
 'awashilo',
 'axiomatizes',
 'aytl',
 'bacigalupi',
 'baddiel',
 'bakubussy',
 'bandied',
 'bannisters',
 'barelkowski',
 '

In [None]:
# For a given token if it is in the training set we add that to the vocabulary and assign the corresponding embedding using GloVe or Custom embeddings. If, instead, it is in the validation or test set we assign the special token [UNK] (if not already present in the vocabulary) and assign a custom embedding

#all_tokens = set(train_word_listing + val_word_listing + test_word_listing)
embedding_vocabulary = set(emb_model.key_to_index.keys()) # GloVe Vocabulary

new_tokens = []
new_vectors = []
for token in train_word_listing:
    # Add to the vocabulary if not already present
    if token not in embedding_vocabulary:
        #print(f"Token {token} not in GloVe vocabulary") # TODO: check data processing
        embedding_vocabulary.add(token)
    try:
        embedding_vector = emb_model.get_vector(token)
    except (KeyError, ValueError):
        embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=dim)

    new_tokens.append(token)
    new_vectors.append(embedding_vector)

emb_model.add_vectors(new_tokens, new_vectors)

In [53]:
def get_vocab_val_test(data, train_word_listing):
    idx_to_word = OrderedDict()
    word_to_idx = OrderedDict()

    unk_token = "[UNK]"
    curr_idx = 1

    word_to_idx[unk_token] = 0
    idx_to_word[0] = unk_token

    for sentence in data['tweet'].values:
        tokens = sentence.split()
        for token in tokens:
            if token in train_word_listing:
                word_to_idx[token] = curr_idx
                idx_to_word[curr_idx] = token
                curr_idx += 1

    return idx_to_word, word_to_idx

val_idx_to_word, val_word_to_idx = get_vocab_val_test(val, train_word_listing)
test_idx_to_word, test_word_to_idx = get_vocab_val_test(test, train_word_listing)

In [54]:
emb_model.add_vector("[UNK]", unk_embedding)



1194329

In [55]:
### Create the embeddings matrix
embedding_matrix = emb_model.vectors

In [None]:
import umap

def visualize_embeddings(embeddings: np.ndarray,
                         word_annotations,
                         word_to_idx):
    """
    Plots given reduce word embeddings (2D). Users can highlight specific words (word_annotations list).

    :param embeddings: word embedding matrix of shape (words, 2) retrieved via a dimensionality reduction technique.
    :param word_annotations: list of words to be annotated.
    :param word_to_idx: vocabulary map (word -> index) (dict)
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 12))

    if word_annotations:
        print(f"Annotating words: {word_annotations}")

        word_indexes = []
        for word in word_annotations:
            word_index = word_to_idx[word]
            word_indexes.append(word_index)

        word_indexes = np.array(word_indexes)

        other_embeddings = embeddings[np.setdiff1d(np.arange(embeddings.shape[0]), word_indexes)]
        target_embeddings = embeddings[word_indexes]

        ax.scatter(other_embeddings[:, 0], other_embeddings[:, 1], alpha=0.1, c='blue')
        ax.scatter(target_embeddings[:, 0], target_embeddings[:, 1], alpha=1.0, c='red')
        ax.scatter(target_embeddings[:, 0], target_embeddings[:, 1], alpha=1, facecolors='none', edgecolors='r', s=1000)

        for word, word_index in zip(word_annotations, word_indexes):
            word_x, word_y = embeddings[word_index, 0], embeddings[word_index, 1]
            ax.annotate(word, xy=(word_x, word_y))
    else:
        ax.scatter(embeddings[:, 0], embeddings[:, 1], alpha=0.1, c='blue')

    # We avoid outliers ruining the visualization if they are quite far away
    axis_x_limit = (np.min(embeddings[:, 0]), np.max(embeddings[:, 0]))
    axis_y_limit = (np.min(embeddings[:, 1]), np.max(embeddings[:, 1]))
    plt.xlim(left=axis_x_limit[0] - 0.5, right=axis_x_limit[1] + 0.5)
    plt.ylim(bottom=axis_y_limit[0] - 0.5, top=axis_y_limit[1] + 0.5)
    ax.set_xlim(axis_x_limit[0], axis_x_limit[1])
    ax.set_ylim(axis_y_limit[0], axis_y_limit[1])

In [None]:
def reduce_SVD(embeddings: np.ndarray) -> np.ndarray:
    """
    Applies SVD dimensionality reduction.

    :param embeddings: word embedding matrix of shape (words, dim). In the case
                       of a word-word co-occurrence matrix the matrix shape would
                       be (words, words).

    :return
        - 2-dimensional word embedding matrix of shape (words, 2)
    """
    print("Running SVD reduction method...")
    svd = TruncatedSVD(n_components=2, n_iter=10)
    reduced = svd.fit_transform(embeddings)
    print("SVD reduction completed!")

    return reduced

def reduce_tSNE(embeddings: np.ndarray) -> np.ndarray:
    """
    Applies t-SNE dimensionality reduction.
    """
    print("Running t-SNE reduction method... (it may take a while...)")
    tsne = TSNE(n_components=2, n_iter=1000, metric='cosine')
    reduced = tsne.fit_transform(embeddings)
    print("t-SNE reduction completed!")

    return reduced

def reduce_umap(embeddings: np.ndarray) -> np.ndarray:
    """
    Applies UMAP dimensionality reduction.
    """
    print("Running UMAP reduction method... (it may take a while...)")
    umap_emb = umap.UMAP(n_components=2, metric='cosine')
    reduced = umap_emb.fit_transform(embeddings)
    print("UMAP reduction completed!")
    
    return reduced

In [None]:
# UMAP
reduced_embedding_umap = reduce_umap(embedding_matrix)
visualize_embeddings(reduced_embedding_umap, ['whore', 'woman', 'slut', 'girl', 'man', 'boy'], emb_model.key_to_index)

plt.show()

# Task 4: Model definition

# Task 5: Training and Evaluation

# Task 6: Transformers

# Task 7: Error Analysis

# Task 8: Report