In [None]:
# !pip install gensim -q

In [None]:
import string

import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary

import gensim
from IPython.display import clear_output

import torchvision
from torchvision import transforms

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import gc
import os

import ssl
# NOTE(review): this swaps the default HTTPS context factory for the unverified
# one, so certificate checks are skipped for every caller that relies on that
# default for the rest of the kernel session. Presumably added so the model /
# corpus downloads work on machines with a broken certificate store - confirm,
# and prefer fixing the certificates over keeping this override.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# Execution environment selector, consumed by the path-setup cell right below.
MODE = "local" # branches exist for 'local' and 'colab'; 'kaggle' has no branch in the path-setup cell
# Default number of rows sampled by draw_sample_processing.
SAMPLE_NUMBERS = 4

In [None]:
# Resolve BASE_PATH (directory holding the dataset and the cached vectors) from MODE.
if MODE == "local":
    BASE_PATH = "."
elif MODE == "colab":
    # Imported here because google.colab is only importable inside a Colab runtime.
    from google.colab import drive

    BASE_PATH = "/content/drive/MyDrive/Deep Neural Network - UT/CA 04"
    drive.mount('/content/drive')
else:
    # Raise instead of print + exit(1): an exception shows up in the cell output
    # and stops "Run All" at this cell, whereas exit() asks the interpreter /
    # kernel to shut down.
    raise ValueError(f"Unknown mode {MODE}")

In [None]:
# Input CSV and on-disk cache of the Word2Vec vectors, both located under BASE_PATH.
DATA_PATH, W2V_PATH = (
    os.path.join(BASE_PATH, file_name)
    for file_name in ("twitter-suicidal_data.csv", "w2v_vectors.kv")
)


# Data Exploration

## Load Data

In [None]:
# Read the raw dataset; later cells use its "tweet" and "intention" columns.
df = pd.read_csv(DATA_PATH)

## Data statistics

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Total number of rows in the raw dataset.
print(f"Number of samples: {len(df)}")

In [None]:
# Count the rows per "intention" label and draw one bar per label.
intentions = df["intention"].value_counts()

fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=intentions.index, y=intentions.values, ax=ax)
ax.set_title("Intentions Distribution")
plt.show()

# Data Preprocessing

In [None]:
import nltk
import emoji
import re

# Fetch every NLTK resource the preprocessing below relies on:
#   stopwords -> nltk.corpus.stopwords, punkt -> nltk.word_tokenize,
#   wordnet -> WordNetLemmatizer, averaged_perceptron_tagger -> nltk.pos_tag.
# The tagger was missing from the original list, so nltk.pos_tag raised a
# LookupError on a fresh environment.
# NOTE(review): newer NLTK releases look the tokenizer / tagger up under the
# "punkt_tab" / "averaged_perceptron_tagger_eng" names, so both spellings are
# requested - confirm against the installed NLTK version.
nltk.download([
    "stopwords",
    "punkt",
    "punkt_tab",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
])

In [None]:
def convert_emoji_to_text(text):
    """Replace every emoji in ``text`` with its textual name, delimited by spaces."""
    space_delimiters = (" ", " ")
    return emoji.demojize(text, delimiters=space_delimiters)

# English stop-word list from the NLTK corpus; preprocess_data drops these tokens.
stopwords = nltk.corpus.stopwords.words("english")
# Single WordNet lemmatizer instance reused by every preprocess_data call.
lemmatizer = nltk.stem.WordNetLemmatizer()

def nltk_pos_tagger(nltk_tag):
    """
    Map a POS tag to the single-letter code passed to the lemmatizer.

    Only the first character of the tag is inspected: 'J' -> 'a', 'V' -> 'v',
    'N' -> 'n', 'R' -> 'r'. Every other tag (including an empty one) falls
    back to 'n'.
    :param nltk_tag: str
    :return: str, one of 'a', 'v', 'n', 'r'
    """
    first_letter_to_pos = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    return first_letter_to_pos.get(nltk_tag[:1], 'n')

def preprocess_data(text: str):
    """
    Clean a raw tweet and turn it into a list of lemmatized tokens.

    Steps, in order:
    1. lowercase the text
    2. remove urls
    3. remove usernames (``@handle``)
    4. remove numbers
    5. convert emojis to their textual names
    6. remove the remaining non-word characters (punctuation, symbols)
    7. collapse runs of whitespace
    8. tokenize, POS-tag and lemmatize
    9. remove stopwords

    Steps 2, 3 and 5 must run before step 6. Stripping non-word characters
    first deletes the ``@`` that the username pattern needs and deletes the
    emoji characters before they can be converted to text, which silently
    disabled both steps.
    :param text: str
    :return: tokens: list[str]
    """
    # lowercase the text
    text = text.lower()
    # remove urls ("http\S+" also covers https links)
    text = re.sub(r"http\S+|www\S+", "", text)
    # remove usernames while the leading "@" is still present
    text = re.sub(r"@\S+", "", text)
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # convert emojis to text before non-word characters are stripped
    text = convert_emoji_to_text(text)
    # remove non-word characters (underscores inside emoji names are word characters and stay)
    text = re.sub(r"[^\w\s]", "", text)
    # remove extra spaces left behind by the removals above
    text = re.sub(r"\s+", " ", text)
    # tokenize, tag and lemmatize each token with its own part of speech
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    lemmas = [
        lemmatizer.lemmatize(token, nltk_pos_tagger(tag))
        for token, tag in tagged_tokens
    ]
    # remove stopwords
    return [lemma for lemma in lemmas if lemma not in stopwords]
    

In [None]:
def format_length_bound(text: str, length: int):
    """
    Shorten ``text`` to at most ``length`` characters for display.

    Text that already fits is returned unchanged. Longer text keeps its head
    and tail joined by the marker ``|...|``; the result is exactly ``length``
    characters long. When ``length`` leaves no room for the marker plus at
    least one character of context (``length`` <= 5), the text is simply cut
    to ``length`` characters, since the head/tail arithmetic would otherwise
    go negative and return more than ``length`` characters.
    :param text: str
    :param length: int, maximum length of the returned string
    :return: str
    """
    marker = "|...|"
    if len(text) <= length:
        return text
    if length <= len(marker):
        # no room for marker + context: hard truncate (negative bounds give "")
        return text[:max(length, 0)]
    head = length // 2 - 3                # >= 0 because length >= 6 here
    tail = length - head - len(marker)    # >= 1 because length >= 6 here
    return text[:head] + marker + text[-tail:]
    
def draw_sample_processing(dataframe: pd.DataFrame, sample_numbers: int = SAMPLE_NUMBERS, show=True):
    """
    Preprocess a random sample of tweets and present raw vs. processed text.

    :param dataframe: frame with "tweet" and "intention" columns
    :param sample_numbers: number of rows to draw with ``DataFrame.sample``
    :param show: when True, print a short report per sample and return None;
        when False, return the samples as a DataFrame instead
    :return: None, or a DataFrame with the columns "Intention", "Raw Tweet"
        and "Processed Tweet"
    """
    picked = dataframe.sample(sample_numbers)
    records = []
    for raw_tweet, intention in zip(picked["tweet"], picked["intention"]):
        records.append((intention, raw_tweet, preprocess_data(raw_tweet)))

    if not show:
        return pd.DataFrame(records, columns=["Intention", "Raw Tweet", "Processed Tweet"])

    sample_no = 0
    for intention, raw_tweet, tokens in records:
        sample_no += 1
        # the token list is shown through its string form, so the length printed
        # for it is the length of that string
        tokens_repr = str(tokens)
        print(f"Sample {sample_no}:")
        print(f":::::::Label          : {intention}")
        print(f":::::::Raw Tweet      : {format_length_bound(raw_tweet, 50):<50} ({len(raw_tweet)})")
        print(f":::::::Processed Tweet: {format_length_bound(tokens_repr, 50):<50} ({len(tokens_repr)})")
    return None

In [None]:
# Print a few randomly drawn tweets next to their preprocessed tokens.
draw_sample_processing(df, show=True)

# Word2Vec - Word Embedding

In [None]:
# Print the names of the models listed by gensim's downloader.
# `api` is reused by the next cell to download the vectors.
import gensim.downloader as api
print(list(api.info()['models'].keys()))

In [None]:
# Download the vectors only when no local cache exists yet; afterwards the
# cached file at W2V_PATH is loaded instead.
if not os.path.exists(W2V_PATH):
    print("Downloading Word2Vec model...")
    w2v_model = api.load("word2vec-google-news-300")
    print("Saving Word2Vec model...")
    w2v_model.save(W2V_PATH)
else:
    print("Loading Word2Vec model...")
    w2v_model = gensim.models.KeyedVectors.load(W2V_PATH, mmap='r')
    

In [None]:
# Dimensionality of a single word vector, read from the loaded Word2Vec model.
EMBEDDING_VECTOR_DIM = w2v_model.vector_size

# DataSet

In [None]:
class Twitter(Dataset):
    """
    Torch ``Dataset`` that turns tweets into Word2Vec features.

    ``dataframe`` must provide a "tweet" column (raw text) and an "intention"
    column (label). Every tweet is tokenized with ``preprocess_data``; rows
    that end up with no tokens are dropped.

    Two feature layouts are supported:
    * ``aggregate=True``: one vector per tweet, the mean of the vectors of
      its in-vocabulary tokens (all zeros when none is in the vocabulary).
    * ``aggregate=False``: one ``(sequence_len, vector_size)`` matrix per
      tweet; tokens are truncated / padded to ``sequence_len`` and
      out-of-vocabulary or padding tokens become zero rows.

    Keyword options:
    * ``sequence_len`` (int): required when ``aggregate`` is False.
    * ``lazy`` (bool, default True): when True vectors are built on every
      ``__getitem__`` call, otherwise once up front in a "vector" column.

    The constructor works on a copy of ``dataframe``, so the caller's frame
    is left untouched.
    """

    def __init__(self, dataframe: pd.DataFrame, w2v_model: "gensim.models.KeyedVectors", aggregate=True, **kw_args):
        if not aggregate and "sequence_len" not in kw_args:
            # Raise instead of print + exit(1) so the failure is visible to the
            # caller and does not tear down the interpreter / kernel.
            raise ValueError("You Should provide 'sequence_len' to use not aggregate option!")

        # Copy: _proc_dataset adds columns, which must not leak into (or warn
        # about) the frame owned by the caller.
        self.dataframe = dataframe.copy()
        self.w2v_model = w2v_model
        self.aggregate = aggregate
        self.max_sequence_len = kw_args.get("sequence_len", 512)
        self.vector_size = w2v_model.vector_size
        self.lazy_mode = kw_args.get("lazy", True)

        self._proc_dataset()
        # Kept as an attribute for existing readers; set after processing so it
        # reflects the rows that survived the empty-token filter.
        self.len = len(self.dataframe)

    def __len__(self):
        # Derived from the processed frame: a length captured before the
        # empty-token filter overstates the size and makes the last indices
        # raise IndexError.
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``."""
        row = self.dataframe.iloc[idx]
        if self.lazy_mode:
            return self._get_word_vectors(row["tokens"]), row["intention"]
        return row["vector"], row["intention"]

    def get_vector_size(self):
        """Return the dimensionality of a single word vector."""
        return self.vector_size

    def _proc_dataset(self):
        """Tokenize tweets, drop empty rows, report, then pad / vectorize as configured."""
        self.dataframe["tokens"] = self.dataframe["tweet"].map(preprocess_data)

        # delete samples with empty tokens
        rows_before = len(self.dataframe)
        has_tokens = self.dataframe["tokens"].map(len) > 0
        # reset_index returns a fresh frame, so the column assignments below
        # do not write into a filtered slice
        self.dataframe = self.dataframe[has_tokens].reset_index(drop=True)
        self.len = len(self.dataframe)
        print(f"Deleted 0-Len Samples: {rows_before - len(self.dataframe)}")
        self.seq_report()

        if not self.aggregate:
            self.dataframe["tokens"] = self.dataframe["tokens"].map(self._pad)

        if not self.lazy_mode:
            self.dataframe["vector"] = self.dataframe["tokens"].map(self._get_word_vectors)

    def _get_word_vectors(self, tokens: list) -> torch.Tensor:
        """
        Convert a token list into a float32 tensor.

        Returns a ``(len(tokens), vector_size)`` matrix when ``aggregate`` is
        False, otherwise the ``(vector_size,)`` mean of the in-vocabulary
        token vectors.
        """
        if not self.aggregate:
            # A single float32 buffer keeps the dtype identical for every
            # sample, whether or not it contains out-of-vocabulary tokens.
            matrix = np.zeros((len(tokens), self.vector_size), dtype=np.float32)
            for position, token in enumerate(tokens):
                if token in self.w2v_model:
                    matrix[position] = self.w2v_model[token]
            return torch.tensor(matrix, dtype=torch.float32)

        total = np.zeros(self.vector_size)
        known_tokens = 0
        for token in tokens:
            if token in self.w2v_model:
                total += self.w2v_model[token]
                known_tokens += 1
        # max(..., 1) avoids dividing by zero when no token is in the vocabulary
        return torch.tensor(total / max(known_tokens, 1), dtype=torch.float32)

    def _pad(self, tokens: list):
        """Truncate or right-pad ``tokens`` with " " to exactly ``max_sequence_len`` items."""
        if len(tokens) >= self.max_sequence_len:
            return tokens[:self.max_sequence_len]
        return tokens + [" "] * (self.max_sequence_len - len(tokens))

    def seq_report(self):
        """Print token-count statistics and the share of unique tokens known to the model."""
        lengths = self.dataframe["tokens"].map(len).tolist()
        print("Sequence Length Report")
        if not lengths:
            # np.max / np.min raise on an empty sequence
            print(":::::NO SAMPLES:::")
            print()
            return
        print(f":::::MAX  LENGTH:::[{np.max(lengths):^5}]")
        print(f":::::MIN  LENGTH:::[{np.min(lengths):^5}]")
        print(f":::::MEAN LENGTH:::[{np.mean(lengths):^5}]")

        # grow one set in place instead of rebuilding the union per row
        vocabulary = set()
        for tokens in self.dataframe["tokens"]:
            vocabulary.update(tokens)
        unique_tokens_count = len(vocabulary)
        valid_tokens = sum(1 for token in vocabulary if token in self.w2v_model)
        valid_share = round(100 * valid_tokens / max(unique_tokens_count, 1), 2)
        print("Sequence Tokenization Report")
        print(f":::::All Unique Tokens:::[{unique_tokens_count:^6}]")
        print(f":::::All Valid Tokens:::[{valid_tokens:^6}]")
        print(f":::::Valid Tokens:::[{valid_share:^5}%]")

        print()

    @staticmethod
    def _to_tensor(tokens: list):
        """Convert a list of numbers into a float32 tensor."""
        return torch.tensor(tokens, dtype=torch.float32)

# Prepare Data

## Split Data into train-valid

In [None]:
# Stratified split on "intention" with 20% held out for validation; each part
# gets a fresh 0..n-1 index.
train_df, valid_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df["intention"],
    )
)

## Create Datasets

In [None]:
# Build both datasets with the default Twitter options, train split first.
train_dataset, valid_dataset = (
    Twitter(frame, w2v_model) for frame in (train_df, valid_df)
)

print(f"Train dataset length: {len(train_dataset)}")
print(f"Valid dataset length: {len(valid_dataset)}")

# Model

## LSTM

In [None]:
class DetectorLSTM(nn.Module):
    """
    LSTM classifier: stacked LSTM followed by a linear layer on the last time step.

    :param input_size: number of features per time step
    :param sequence_size: sequence length; stored for reference only, the
        forward pass works with whatever length the input has
    :param hidden_size: LSTM hidden state size
    :param num_layers: number of stacked LSTM layers
    :param num_classes: size of the output (one logit per class)
    :param dropout: dropout between LSTM layers; ignored for a single layer
    """

    def __init__(self, input_size, sequence_size, hidden_size, num_layers, num_classes, dropout=0.1):
        super(DetectorLSTM, self).__init__()
        self.sequence_size = sequence_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between layers and warns when it is
            # non-zero for a single layer
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """
        :param x: tensor indexed as (batch, time, feature), as required by batch_first=True
        :return: tensor of shape (batch, num_classes)
        """
        # Take device and dtype from the input: nn.Module has no `device`
        # attribute, so reading self.device raised AttributeError.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
        c0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)

        out, _ = self.lstm(x, (h0, c0))
        # classify from the output of the last time step
        out = self.fc(out[:, -1, :])
        return out

## 2-Layer LSTM

## CNN+2-Layer-LSTM