In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/clean_text.csv')
df

Unnamed: 0,text,class,clean_text
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,ex wife threaten suicid recent left wife good ...
1,Am I weird I don't get affected by compliments...,non-suicide,weird get affect compliment come someon know i...
2,Finally 2020 is almost over... So I can never ...,non-suicide,final almost never hear bad year ever swear fu...
3,i need helpjust help me im crying so hard,suicide,need helpjust help im cri hard
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,lost hello name adam struggl year afraid past ...
...,...,...,...
232069,If you don't like rock then your not going to ...,non-suicide,like rock go get anyth go
232070,You how you can tell i have so many friends an...,non-suicide,tell mani friend lone everyth depriv pre bough...
232071,pee probably tastes like salty tea😏💦‼️ can som...,non-suicide,pee probabl tast like salti tea someon drank p...
232072,The usual stuff you find hereI'm not posting t...,suicide,usual stuff find post sympathi piti know far w...


In [3]:
data = df['clean_text'].astype(str).to_numpy()
label = df['class']

In [4]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\84359\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\84359\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
PADDING = '<PAD>'
UNKNOWN = '<BRUH>'
def tokenize(text):
    return word_tokenize(text)

data = [tokenize(text) for text in data]
data
modified_data = data.copy()
modified_data.append([UNKNOWN, UNKNOWN, UNKNOWN, UNKNOWN, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING])

In [6]:
def build_vocabulary(data):
    vocabulary = {PADDING: 0, UNKNOWN: 1}
    for text in data:
        for word in text:
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)
    return vocabulary

vocab = build_vocabulary(data)
vocab

{'<PAD>': 0,
 '<BRUH>': 1,
 'ex': 2,
 'wife': 3,
 'threaten': 4,
 'suicid': 5,
 'recent': 6,
 'left': 7,
 'good': 8,
 'cheat': 9,
 'twice': 10,
 'lie': 11,
 'much': 12,
 'decid': 13,
 'refus': 14,
 'go': 15,
 'back': 16,
 'day': 17,
 'ago': 18,
 'began': 19,
 'tirelessli': 20,
 'spent': 21,
 'paat': 22,
 'talk': 23,
 'keep': 24,
 'hesit': 25,
 'want': 26,
 'believ': 27,
 'come': 28,
 'know': 29,
 'lot': 30,
 'peopl': 31,
 'order': 32,
 'get': 33,
 'way': 34,
 'happen': 35,
 'realli': 36,
 'suppos': 37,
 'handl': 38,
 'death': 39,
 'hand': 40,
 'still': 41,
 'love': 42,
 'can': 43,
 'not': 44,
 'deal': 45,
 'constantli': 46,
 'feel': 47,
 'insecur': 48,
 'worri': 49,
 'today': 50,
 'may': 51,
 'hope': 52,
 'weird': 53,
 'affect': 54,
 'compliment': 55,
 'someon': 56,
 'irl': 57,
 'internet': 58,
 'stranger': 59,
 'final': 60,
 'almost': 61,
 'never': 62,
 'hear': 63,
 'bad': 64,
 'year': 65,
 'ever': 66,
 'swear': 67,
 'fuck': 68,
 'god': 69,
 'annoy': 70,
 'need': 71,
 'helpjust': 72,


In [7]:
MAX_LENGTH = 200
def numericalize(data, vocabulary):
    return [[vocabulary.get(word, vocabulary[UNKNOWN]) for word in text] for text in data]
def truncate(data, length):
    return [text[:length] if len(text) > length else text for text in data]
def pad(data, length):
    return [text[:length] + [0] * (length - len(text)) for text in data]

numericalized = numericalize(data, vocab)
truncated = truncate(numericalized, MAX_LENGTH)
padded = pad(truncated, MAX_LENGTH)
padded = np.array(padded)
padded

array([[   2,    3,    4, ...,    0,    0,    0],
       [  53,   33,   54, ...,    0,    0,    0],
       [  60,   61,   62, ...,    0,    0,    0],
       ...,
       [8043,  250,  328, ...,    0,    0,    0],
       [ 296, 1296,  153, ...,    0,    0,    0],
       [  41, 3646,  129, ...,    0,    0,    0]])

In [8]:
from gensim.models import Word2Vec
import gensim

In [9]:
model = Word2Vec(modified_data, min_count=1, vector_size=150, window=5, workers=4)
model.save("word2vec.model")

In [10]:
from gensim.models import KeyedVectors
w2v = Word2Vec.load("word2vec.model")
word_vectors = w2v.wv
del model
del w2v

In [11]:
# test wv
word_vectors


<gensim.models.keyedvectors.KeyedVectors at 0x2c3e2cad190>

In [12]:
import torch
from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

In [13]:
class KimCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, num_classes, dropout=0.5):
        super(KimCNN, self).__init__()
        
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(word_vectors.vectors), freeze=True)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
        
    def forward(self, x):
        # for input of size (batch_size, sentence_length, embedding_dim)
        x = self.embedding(x)
        x = x.unsqueeze(1)
        # -> (batch_size, 1, sentence_length, embedding_dim)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # -> [(batch_size, num_filters, sentence_length - fs + 1) for fs in filter_sizes]
        x = [F.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in x]
        # -> [(batch_size, num_filters) for fs in filter_sizes]
        x = torch.cat(x, 1)
        # -> (batch_size, num_filters * len(filter_sizes))
        x = self.dropout(x)
        x = self.fc(x)
        
        return x

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded, label, test_size=0.2, random_state=42)
y_train = [0 if x == 'non-suicide' else 1 for x in y_train]
y_test = [0 if x == 'non-suicide' else 1 for x in y_test]

In [15]:
class MyDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]

train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)
train_dataloader = DataLoader(train_dataset, 64, shuffle=True)
test_dataloader = DataLoader(test_dataset, 64, shuffle=False)

In [16]:
print(len(word_vectors.vectors))
print(len(vocab))

94869
94869


In [17]:
from torchinfo import summary
model = KimCNN(len(vocab), 150, 100, [1, 2, 3, 4, 5, 6], 2)
summary(model)

Layer (type:depth-idx)                   Param #
KimCNN                                   --
├─Embedding: 1-1                         (14,230,350)
├─ModuleList: 1-2                        --
│    └─Conv2d: 2-1                       15,100
│    └─Conv2d: 2-2                       30,100
│    └─Conv2d: 2-3                       45,100
│    └─Conv2d: 2-4                       60,100
│    └─Conv2d: 2-5                       75,100
│    └─Conv2d: 2-6                       90,100
├─Dropout: 1-3                           --
├─Linear: 1-4                            1,202
Total params: 14,547,152
Trainable params: 316,802
Non-trainable params: 14,230,350

In [18]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

In [19]:
import time
NUM_EPOCHS = 15
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
best_val_loss = 999

model.to(DEVICE)

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()
    model.train() # Turn on train mode for gradient updates
    train_loss = 0
    train_accuracy = 0
    for data, target in train_dataloader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        # Update metrics
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        train_accuracy += pred.eq(target.view_as(pred)).sum().item()
    train_loss /= len(train_dataloader.dataset)
    train_accuracy /= len(train_dataloader.dataset)
    time_taken = time.time() - start_time

    model.eval() # Turn on train mode to disable gradient updates
    test_loss = 0
    test_accuracy = 0
    with torch.no_grad():
        for data, target in test_dataloader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            output = model(data)
            loss = loss_fn(output, target)
            # Update metrics
            test_loss += loss
            pred = output.argmax(dim = 1, keepdim = True)
            test_accuracy += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_dataloader.dataset)
    test_accuracy /= len(test_dataloader.dataset)

    # Log metrics
    print(
        f"Epoch {epoch}: Train Loss: {train_loss:.4f} | \
        Train Accuracy: {train_accuracy:.4f} | \
        Test Loss: {test_loss:.4f} | \
        Test Accuracy: {test_accuracy:.4f} | \
        Time taken: {time_taken:.4f}"
    )

Epoch 1: Train Loss: 0.0052 |         Train Accuracy: 0.8607 |         Test Loss: 0.0038 |         Test Accuracy: 0.9055 |         Time taken: 34.4080
Epoch 2: Train Loss: 0.0040 |         Train Accuracy: 0.8993 |         Test Loss: 0.0035 |         Test Accuracy: 0.9131 |         Time taken: 33.3091
Epoch 3: Train Loss: 0.0036 |         Train Accuracy: 0.9093 |         Test Loss: 0.0033 |         Test Accuracy: 0.9178 |         Time taken: 33.4249
Epoch 4: Train Loss: 0.0034 |         Train Accuracy: 0.9150 |         Test Loss: 0.0033 |         Test Accuracy: 0.9207 |         Time taken: 33.3016
Epoch 5: Train Loss: 0.0032 |         Train Accuracy: 0.9201 |         Test Loss: 0.0033 |         Test Accuracy: 0.9215 |         Time taken: 33.3890
Epoch 6: Train Loss: 0.0031 |         Train Accuracy: 0.9243 |         Test Loss: 0.0032 |         Test Accuracy: 0.9226 |         Time taken: 33.5029
Epoch 7: Train Loss: 0.0030 |         Train Accuracy: 0.9270 |         Test Loss: 0.0031 |    

In [21]:
torch.save(model.state_dict(), 'kim_cnn.pth')
torch.save(vocab, 'vocab.pth')
torch.save(word_vectors, 'word_vectors.pth')