In [2]:
!pip install gensim -q

In [2]:
import string

import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary

import gensim
from IPython.display import clear_output

import torchvision
from torchvision import transforms

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import gc
import os

import ssl
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. TLS certificate verification is turned off for code
# that relies on that default. It also depends on private `ssl` attributes.
# Presumably added to get downloads working on a machine with a broken CA
# store -- confirm it is still needed before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# Execution environment selector.
# NOTE(review): 'colab' and 'kaggle' are listed as valid, but the path setup
# in the following cell only handles 'local'.
MODE = "local" # valid modes are 'local', 'colab' and 'kaggle'
# Default number of rows drawn by draw_sample_processing.
SAMPLE_NUMBERS = 4

In [None]:
# Resolve the base directory for the dataset and cached artifacts from MODE.
if MODE == "local":
    BASE_PATH = "."
else:
    # Raise instead of calling exit(): an exception stops "Run All" at this
    # cell, so later cells never run with BASE_PATH undefined.
    raise ValueError(
        f"Unknown mode {MODE!r}: only 'local' is configured in this notebook"
    )

In [None]:
# CSV file with the labelled tweets (read with pd.read_csv below).
DATA_PATH = os.path.join(BASE_PATH, "twitter-suicidal_data.csv")

# Local cache file for the word vectors: loaded if it exists, otherwise the
# vectors are downloaded and saved here.
W2V_PATH = os.path.join(BASE_PATH, "w2v_vectors.kv")


# Data Exploration

## Load Data

In [None]:
# Load the raw dataset; later cells read its "tweet" and "intention" columns.
df = pd.read_csv(DATA_PATH)

## Data statistics

In [None]:
# Peek at the first five rows.
df.head(5)

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Total number of rows in the dataset.
print(f"Number of samples: {len(df)}")

In [None]:
# Class balance: number of tweets per "intention" label.
intentions = df["intention"].value_counts()

# Explicit figure/axes interface; the axes are labelled so the chart can be
# read on its own when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=intentions.index, y=intentions.values, ax=ax)
ax.set(
    title="Intentions Distribution",
    xlabel="Intention",
    ylabel="Number of tweets",
)
plt.show()

# Data Preprocessing

In [None]:
import nltk
import emoji
import re

# Resources used by preprocess_data: the stopword list, the punkt tokenizer
# (nltk.word_tokenize), WordNet (lemmatizer) and the perceptron POS tagger
# (nltk.pos_tag). The tagger model was missing from the original list, so
# nltk.pos_tag failed with a LookupError on a fresh environment.
# "punkt_tab" and "averaged_perceptron_tagger_eng" are the resource names that
# recent NLTK releases look up; older releases may not know them, which is why
# every id is requested on its own -- an unknown id then only reports an error
# instead of halting the remaining downloads of a list request.
for _resource in (
    "stopwords",
    "punkt",
    "punkt_tab",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_resource)

In [None]:
def convert_emoji_to_text(text):
    """Run ``emoji.demojize`` on ``text`` using a single space as both delimiters."""
    space_delimiters = (" ", " ")
    demojized = emoji.demojize(text, delimiters=space_delimiters)
    return demojized

# English stopword list from NLTK; preprocess_data drops tokens found in it.
stopwords = nltk.corpus.stopwords.words("english")
# WordNet lemmatizer instance shared by every preprocess_data call.
lemmatizer = nltk.stem.WordNetLemmatizer()

def nltk_pos_tagger(nltk_tag):
    """Map an NLTK POS tag to the one-letter tag handed to the lemmatizer.

    Tags starting with 'J', 'V', 'N' and 'R' map to 'a', 'v', 'n' and 'r'
    respectively; any other tag falls back to 'n'.

    :param nltk_tag: str, POS tag as produced by ``nltk.pos_tag``
    :return: str, one of 'a', 'v', 'n', 'r'
    """
    prefix_to_pos = (("J", "a"), ("V", "v"), ("N", "n"), ("R", "r"))
    for prefix, pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return "n"

def preprocess_data(text: str):
    """
    Turn a raw tweet into a list of lemmatized, stopword-free tokens.

    Preprocessing steps are as follows:
    1. lowercase the text
    2. remove urls
    3. remove usernames
    4. remove numbers
    5. convert emojis to text
    6. remove punctuation and other non-word characters
    7. remove extra spaces
    8. tokenization and lemmatization of the text
    9. remove stopwords

    The order matters: the non-word filter deletes the '@' sign and emoji
    characters, so usernames must be removed and emojis converted *before*
    it runs. Running the filter first (as an earlier version did) left the
    username text in place and dropped every emoji.

    :param text: str
    :return: tokens: list[str]
    """
    # lowercase the text
    text = text.lower()
    # remove urls while their punctuation is still intact
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # remove usernames while the leading '@' is still present
    text = re.sub(r'@\S+', '', text)
    # remove numbers (before emoji conversion so emoji names keep their digits)
    text = re.sub(r"\d+", "", text)
    # convert emojis to text; must happen before the non-word filter
    text = convert_emoji_to_text(text)
    # remove punctuation and any remaining non-word characters
    text = re.sub(r"[^\w\s]", "", text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # tokenization and lemmatization of the text
    tokenized = nltk.word_tokenize(text)
    pos_tag = nltk.pos_tag(tokenized)
    tokens = [
        lemmatizer.lemmatize(token, nltk_pos_tagger(tag))
        for token, tag in pos_tag
    ]
    # remove stopwords
    tokens = [token for token in tokens if token not in stopwords]

    return tokens
    

In [None]:
def format_length_bound(text: str, length: int):
    """Shorten ``text`` to at most ``length`` characters for display.

    Text that already fits is returned unchanged. Longer text keeps its head
    and tail joined by the marker ``"|...|"``, giving exactly ``length``
    characters. When ``length`` is too small to hold the marker plus at least
    one character of text, the text is simply cut to ``length`` characters
    (the head/tail arithmetic would otherwise yield negative or zero slice
    bounds and a result longer than requested).

    :param text: str, text to shorten
    :param length: int, maximum number of characters in the result
    :return: str
    """
    if len(text) <= length:
        return text
    marker = "|...|"
    if length <= len(marker):
        return text[:max(length, 0)]
    mm = length // 2 - 3        # characters kept from the start
    rr = length - mm - 5        # characters kept from the end
    return text[:mm] + marker + text[-rr:]
    
def draw_sample_processing(dataframe: pd.DataFrame, sample_numbers: int = SAMPLE_NUMBERS, show=True, random_state=None):
    """Preprocess a random sample of tweets and show or return the result.

    :param dataframe: frame with "tweet" and "intention" columns
    :param sample_numbers: number of rows to sample; capped at the number of
        rows available, since ``DataFrame.sample`` raises when asked for more
    :param show: when True, print each sample and return None; when False,
        return the samples as a DataFrame instead
    :param random_state: forwarded to ``DataFrame.sample``; pass a fixed value
        to get the same sample on every run
    :return: None if ``show`` is True, otherwise a DataFrame with columns
        "Intention", "Raw Tweet" and "Processed Tweet"
    """
    n_rows = min(sample_numbers, len(dataframe))
    sample_text = dataframe.sample(n_rows, random_state=random_state)
    label_rawT_procT = [
        (label, tweet, preprocess_data(tweet))
        for tweet, label in zip(sample_text["tweet"], sample_text["intention"])
    ]
    if show:
        for idx, (label, rawT, procT) in enumerate(label_rawT_procT, start=1):
            # The token list is printed via its string form, so the length
            # shown in parentheses is that of the string, not the token count.
            procT = str(procT)
            print(f"Sample {idx}:")
            print(f":::::::Label          : {label}")
            print(f":::::::Raw Tweet      : {format_length_bound(rawT, 50):<50} ({len(rawT)})")
            print(f":::::::Processed Tweet: {format_length_bound(procT, 50):<50} ({len(procT)})")
        return
    return pd.DataFrame(label_rawT_procT, columns=["Intention", "Raw Tweet", "Processed Tweet"])

In [None]:
# Print a few randomly sampled tweets next to their preprocessed token lists.
draw_sample_processing(df, show=True)

# Word2Vec - Word Embedding

In [3]:
# print available word2vec models
# (the "models" section of gensim.downloader's info() is keyed by model name)
import gensim.downloader as api
print(list(api.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# Use the locally cached vectors when the cache file exists; otherwise download
# the pretrained model once and write it to the cache path for later runs.
if not os.path.exists(W2V_PATH):
    print("Downloading Word2Vec model...")
    w2v_model = api.load("word2vec-google-news-300")
    print("Saving Word2Vec model...")
    w2v_model.save(W2V_PATH)
else:
    print("Loading Word2Vec model...")
    w2v_model = gensim.models.KeyedVectors.load(W2V_PATH, mmap='r')
    

In [None]:
# Dimensionality of each word vector, taken from the loaded model.
EMBEDDING_VECTOR_DIM = w2v_model.vector_size

# Data Set

In [None]:
class Twitter(Dataset):
    """Dataset of tweets encoded as fixed-length sequences of word vectors.

    Every tweet is preprocessed into tokens, the tokens known to the embedding
    model are replaced by their vectors, and the sequence is truncated or
    zero-padded to ``max_length`` entries. For ``max_length > 0`` each item is
    a ``(float32 tensor of shape (max_length, vector_size), intention)`` pair.
    """

    def __init__(self, dataframe: pd.DataFrame, w2v_model: "gensim.models.KeyedVectors", max_length: int = 50):
        """
        :param dataframe: frame with a "tweet" text column and an "intention"
            label column
        :param w2v_model: word-vector model supporting ``token in model``,
            ``model[token]`` and ``model.vector_size``
        :param max_length: number of word vectors kept per tweet
        """
        # Work on a copy: adding the "tokens" column must not modify the
        # caller's frame (which may itself be a slice of a larger frame).
        self.dataframe = dataframe.copy()
        self.w2v_model = w2v_model
        self.max_length = max_length
        self.dataframe["tokens"] = self.dataframe["tweet"].apply(self._encode)

        self.len = len(self.dataframe)

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(tokens tensor, intention)`` pair at position ``idx``."""
        # One positional lookup instead of two.
        row = self.dataframe.iloc[idx]
        return row["tokens"], row["intention"]

    def _encode(self, tweet: str):
        """Run the full tweet -> tokens -> vectors -> padded tensor pipeline."""
        tokens = preprocess_data(tweet)
        return self._to_tensor(self._pad(self._get_word_vectors(tokens)))

    def _get_word_vectors(self, tokens: list):
        """Look up the vector of every token the model knows; skip the rest."""
        return [self.w2v_model[token] for token in tokens if token in self.w2v_model]

    def _pad(self, tokens: list):
        """Truncate or zero-pad the vector list to exactly ``max_length`` entries."""
        if len(tokens) >= self.max_length:
            return tokens[:self.max_length]
        # Size the padding from the model held by this instance (not a module
        # global) so it always matches the real vectors; float32 matches the
        # dtype of the final tensor.
        padding = np.zeros(self.w2v_model.vector_size, dtype=np.float32)
        return tokens + [padding] * (self.max_length - len(tokens))

    def _to_tensor(self, tokens: list):
        """Stack the vectors into a single float32 tensor."""
        # Stack into one fresh ndarray first: building a tensor straight from
        # a list of ndarrays is very slow, and np.array always copies, so the
        # tensor never aliases (possibly read-only, memory-mapped) model data.
        return torch.as_tensor(np.array(tokens, dtype=np.float32))