In [1]:
!pip install clearml > /dev/null 2>&1

In [2]:
import os
import string
import re

import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [15]:
df = pd.read_csv('/content/df_title.csv', names=['id', 'title', 'class'], header=None, skiprows=1)

In [16]:
df.drop(columns='id', inplace=True)

In [18]:
X_clean_title = df

In [19]:
X_clean_title.head()

Unnamed: 0,title,class
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1.0
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1.0
2,"Bobby Jindal, raised Hindu, uses story of Chri...",0.0
3,SATAN 2: Russia unvelis an image of its terrif...,1.0
4,About Time! Christian Group Sues Amazon and SP...,1.0


In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
stopwords = set(nltk.corpus.stopwords.words('english'))
porter_steemer = nltk.stem.PorterStemmer()

In [23]:
def remove_urls(text):
    URL_REGEX = r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    return re.sub(URL_REGEX, '', text)

def remove_non_alphabetical_characters(text):
    return re.sub('[^a-zA-Z]', ' ', text)

def remove_stopwords(text):
    words = text.split()
    return ' '.join([word for word in words if word not in stopwords])
    
def stem_words(text):
    words = text.split()
    return ' '.join([porter_steemer.stem(word) for word in words])
  
def transform_text(text):
    text = remove_urls(text)
    text = remove_non_alphabetical_characters(text)
    text = text.lower()
    text = remove_stopwords(text)
    text = stem_words(text)
    
    return text

In [24]:
X_clean_title['title'] = X_clean_title['title'].apply(transform_text)

In [26]:
X_clean_title = X_clean_title[X_clean_title['title'].str.len() > 0]

In [28]:
X_clean_title.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [29]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X_clean_title['title'], X_clean_title['class'], test_size=0.33, random_state=256, stratify=X_clean_title['class'])

In [30]:
split = []
for index, row in X_clean_title.iterrows():
    if index in X_train.index:
        split.append('train')
    else:
        split.append('test')
X_clean_title['split'] = split

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [31]:
PADDING_VALUE = 0

class NaiveVectorizer:
    def __init__(self, tokenized_data, **kwargs):
        tokenized_data = [seq.split() for seq in tokenized_data]
        self.wv = dict()
        iter = PADDING_VALUE + 1
        for sequence in tokenized_data:
          for word in sequence:
            if word not in self.wv:
              self.wv[word] = iter
              iter += 1

    def vectorize(self, tokenized_seq):
        tokens = []
        for word in tokenized_seq:
          if word in self.wv:
            tokens.append(self.wv[word])
            
        return torch.LongTensor(tokens)

In [32]:
class DatasetNews(Dataset):
    SPLIT_TYPES = ["train", "test"]

    def __init__(self, data, preprocess_fn, split="train"):
        super(DatasetNews, self).__init__()
        if split not in self.SPLIT_TYPES:
            raise AttributeError(f"No such split type: {split}")

        self.split = split
        self.label = [i for i, c in enumerate(data.columns) if c == "class"][0]
        self.data_col = [i for i, c in enumerate(data.columns) if c == "title"][0]
        self.data = data[data["split"] == self.split]
        self.preprocess_fn = preprocess_fn

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        seq = self.preprocess_fn(self.data.iloc[idx, self.data_col].split())
        label = self.data.iloc[idx, self.label]
        return (seq, label)

In [33]:
from torch.nn.utils.rnn import pad_sequence
naive_vectorizer = NaiveVectorizer(X_clean_title.loc[X_clean_title["split"] == "train", "title"])

def get_datasets():
    train_dataset = DatasetNews(data=X_clean_title, preprocess_fn=naive_vectorizer.vectorize)
    test_dataset = DatasetNews(data=X_clean_title, preprocess_fn=naive_vectorizer.vectorize, split="test")
        
    return train_dataset, test_dataset

def custom_collate_fn(pairs):
    seqcs, lengths, labels = [], [], []
    for pair in pairs:
        if len(pair[0]) > 0:
          labels.append(pair[1])
          lengths.append(len(pair[0]))
          seqcs.append(pair[0])

    seqcs = pad_sequence(seqcs, padding_value=PADDING_VALUE)
    lengths = torch.LongTensor(lengths)
    labels = torch.Tensor(labels)
    return seqcs, lengths, labels

In [34]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, classes, batch_size, dropout_prob, num_layers):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers =num_layers, dropout=dropout_prob)
        self.linear = nn.Linear(hidden_dim, classes)
        self.drop = nn.Dropout(dropout_prob)

    def forward(self, sentence, lengths):
        batch_size = sentence.shape[1]
        embeddings = self.embedding(sentence)
        embeddings = self.drop(embeddings)
        packed_embeddings = pack_padded_sequence(embeddings, lengths.cpu(), enforce_sorted=False)
        lstm_out, _ = self.lstm(packed_embeddings)
        output, hidden = pad_packed_sequence(lstm_out)
        out = output[lengths - 1, range(batch_size) , :]
        x = self.linear(out)
        x = x.squeeze()
        scores = torch.sigmoid(x)
        return scores

In [37]:
from clearml import Task

In [39]:
web_server = 'https://app.community.clear.ml'
api_server = 'https://api.community.clear.ml'
files_server = 'https://files.community.clear.ml'
access_key = ""
secret_key = ""

Task.set_credentials(web_host=web_server,
                     api_host=api_server,
                     files_host=files_server,
                     key=access_key,
                     secret=secret_key)

In [40]:
task = Task.create(project_name='cloud', task_name='log1')
task.mark_started()
logger = task.get_logger()

In [41]:
config = {
    'n_epochs': 20,
    'lr': 1e-3,
    'batch_size': 128, 
    'optimizer': 'Adam',
    'hidden_dim': 128, 
    'embedding_dim': 50,
    'dropout': 0.5, 
    'num_layers': 2
}
task.connect(config)

{'batch_size': 128,
 'dropout': 0.5,
 'embedding_dim': 50,
 'hidden_dim': 128,
 'lr': 0.001,
 'n_epochs': 20,
 'num_layers': 2,
 'optimizer': 'Adam'}

In [42]:
def train(model, training_data, test_data, optimizer, device, loss_fn):

    for epoch in range(config['n_epochs']):

        print('[ Epoch', epoch, ']')
        
        start = time.time()
        train_loss, train_acc, train_f1, train_fpr = train_epoch(model, training_data, optimizer, device, loss_fn)
        print('  - (Training)   loss: {loss: 8.5f}, accuracy: {acc:3.3f} %, f1: {f1:3.3f}%, fpr: {fpr:3.3f}%, time: {time:3.3f} min'.format(
            loss = train_loss, acc=100*train_acc, f1=100*train_f1, fpr=100*train_fpr,
            time=(time.time()-start)/60))
        
        start = time.time()
        test_loss, test_acc, test_f1, test_fpr = eval_epoch(model, test_data, device, loss_fn)
        print('  - (Test)       loss: {loss: 8.5f}, accuracy: {acc:3.3f} %, f1: {f1:3.3f}%, fpr: {fpr:3.3f}%, time: {time:3.3f} min'.format(
            loss = test_loss, acc=100*test_acc, f1=100*test_f1, fpr=100*test_fpr,
            time=(time.time()-start)/60))

        logger.report_scalar(title='Loss', series='Train', iteration=epoch, value=train_loss)
        logger.report_scalar(title='Accuracy', series='Train', iteration=epoch, value=train_acc)
        logger.report_scalar(title='Loss', series='Test', iteration=epoch, value=test_loss)
        logger.report_scalar(title='Accuracy', series='Test', iteration=epoch, value=test_acc)
        logger.report_scalar(title='F1', series='Test', iteration=epoch, value=test_f1)
        logger.report_scalar(title='FPR', series='Test', iteration=epoch, value=test_fpr)
        logger.report_scalar(title='F1', series='Train', iteration=epoch, value=train_f1)
        logger.report_scalar(title='FPR', series='Train', iteration=epoch, value=train_fpr)

In [50]:
def train_epoch(model, training_data, optimizer, device, loss_fn):
    
        model.train()

        total_loss = 0
        n_examples_total, n_examples_correct = 0, 0

        for batch in tqdm(training_data, mininterval=2,desc='  - (Training)   ', leave=False):

            seqcs, lengths, labels = batch[0].to(device), batch[1], batch[2].to(device)
            # forward
            optimizer.zero_grad()
            pred = model(seqcs, lengths)

            # backward
            loss, n_correct = eval_performance(pred, labels, loss_fn)
            loss.backward()
            # Calculating False Positive Rate
            cf_matrix = confusion_matrix(labels.detach().cpu().numpy(), torch.round(pred).detach().cpu().numpy())
            tn, fp, fn, tp = cf_matrix.ravel()
            fpr = fp / (fp + tn)

            # Calculating F1-score
            f1 = f1_score(labels.detach().cpu().numpy(), torch.round(pred).detach().cpu().numpy())

            # update parameters
            optimizer.step()

            total_loss += loss.item()

            n_examples_total += batch[2].size()[0]
            n_examples_correct += n_correct
            # print(f'Loss: {loss.item()}, acc: {n_correct/batch[2].size()[0]}')

        accuracy = n_examples_correct/n_examples_total

        return total_loss, accuracy, f1, fpr

In [51]:
def eval_epoch(model, test_data, device, loss_fn):

    model.eval()

    total_loss = 0
    n_examples_total, n_examples_correct = 0, 0

    with torch.no_grad():
        for batch in tqdm(test_data, mininterval=2, desc='  - (Test) ', leave=False):

            seqcs, lengths, labels = batch[0].to(device), batch[1], batch[2].to(device)
            
            # forward
            pred = model(seqcs, lengths)
            loss, n_correct = eval_performance(pred, labels, loss_fn)
            # Calculating False Positive Rate
            cf_matrix = confusion_matrix(labels.detach().cpu().numpy(), torch.round(pred).detach().cpu().numpy())
            tn, fp, fn, tp = cf_matrix.ravel()
            fpr = fp / (fp + tn)

            # Calculating F1-score
            f1 = f1_score(labels.detach().cpu().numpy(), torch.round(pred).detach().cpu().numpy())

            total_loss += loss.item()
            
            n_examples_total += batch[2].size()[0]
            n_examples_correct += n_correct

    accuracy = n_examples_correct/n_examples_total

    return total_loss, accuracy, f1, fpr

In [45]:
def eval_performance(prediction, ground_truth, loss_fn):
    
    loss = loss_fn(prediction, ground_truth)

    n_correct = (torch.round(prediction) == ground_truth.to(torch.int32)).sum().item()

    return loss, n_correct

In [46]:
training_data, test_data = get_datasets()
train_dataloader = DataLoader(training_data, batch_size=config['batch_size'], shuffle=True, collate_fn=custom_collate_fn)
test_dataloader = DataLoader(test_data, batch_size=config['batch_size'], shuffle=True, collate_fn=custom_collate_fn)

In [52]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(embedding_dim=config['embedding_dim'], hidden_dim=config['hidden_dim'], vocab_size=len(naive_vectorizer.wv)+1, classes=1, batch_size=config['batch_size'], dropout_prob=config['dropout'], num_layers=config['num_layers']).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=config['lr'])
loss = nn.BCELoss()
train(model=model, training_data=train_dataloader, test_data=test_dataloader, optimizer=optimizer, device=DEVICE, loss_fn=loss)


[ Epoch 0 ]




  - (Training)   loss:  149.65548, accuracy: 73.737 %, f1: 76.190%, fpr: 15.789%, time: 0.092 min




  - (Test)       loss:  55.13934, accuracy: 83.166 %, f1: 85.714%, fpr: 12.500%, time: 0.032 min
[ Epoch 1 ]




  - (Training)   loss:  116.02960, accuracy: 81.917 %, f1: 87.805%, fpr: 13.636%, time: 0.092 min




  - (Test)       loss:  48.76224, accuracy: 85.326 %, f1: 88.462%, fpr: 23.810%, time: 0.032 min
[ Epoch 2 ]




  - (Training)   loss:  104.95052, accuracy: 84.597 %, f1: 70.270%, fpr: 21.739%, time: 0.092 min




  - (Test)       loss:  44.66654, accuracy: 87.385 %, f1: 90.000%, fpr: 8.000%, time: 0.032 min
[ Epoch 3 ]




  - (Training)   loss:  98.09499, accuracy: 85.839 %, f1: 87.179%, fpr: 9.091%, time: 0.120 min




  - (Test)       loss:  42.99650, accuracy: 87.966 %, f1: 93.548%, fpr: 20.000%, time: 0.032 min
[ Epoch 4 ]




  - (Training)   loss:  92.00742, accuracy: 86.908 %, f1: 91.667%, fpr: 11.111%, time: 0.092 min




  - (Test)       loss:  41.60111, accuracy: 88.302 %, f1: 100.000%, fpr: 0.000%, time: 0.032 min
[ Epoch 5 ]




  - (Training)   loss:  88.31726, accuracy: 87.506 %, f1: 88.372%, fpr: 18.182%, time: 0.139 min




  - (Test)       loss:  39.61680, accuracy: 89.025 %, f1: 86.667%, fpr: 9.677%, time: 0.035 min
[ Epoch 6 ]




  - (Training)   loss:  84.41592, accuracy: 88.211 %, f1: 96.296%, fpr: 0.000%, time: 0.093 min




  - (Test)       loss:  39.89922, accuracy: 88.908 %, f1: 94.545%, fpr: 11.111%, time: 0.050 min
[ Epoch 7 ]




  - (Training)   loss:  81.15137, accuracy: 88.821 %, f1: 91.304%, fpr: 15.000%, time: 0.122 min




  - (Test)       loss:  38.07328, accuracy: 89.586 %, f1: 91.667%, fpr: 13.636%, time: 0.032 min
[ Epoch 8 ]




  - (Training)   loss:  77.96278, accuracy: 89.350 %, f1: 90.909%, fpr: 10.000%, time: 0.092 min




  - (Test)       loss:  39.14331, accuracy: 89.356 %, f1: 93.878%, fpr: 0.000%, time: 0.032 min
[ Epoch 9 ]




  - (Training)   loss:  75.71261, accuracy: 89.696 %, f1: 82.353%, fpr: 25.000%, time: 0.093 min




  - (Test)       loss:  37.74104, accuracy: 89.728 %, f1: 92.683%, fpr: 8.000%, time: 0.032 min
[ Epoch 10 ]




  - (Training)   loss:  73.10457, accuracy: 90.241 %, f1: 93.333%, fpr: 10.000%, time: 0.118 min




  - (Test)       loss:  37.96866, accuracy: 89.988 %, f1: 89.796%, fpr: 10.000%, time: 0.033 min
[ Epoch 11 ]




  - (Training)   loss:  70.63579, accuracy: 90.600 %, f1: 93.333%, fpr: 5.263%, time: 0.098 min




  - (Test)       loss:  37.45251, accuracy: 90.218 %, f1: 97.297%, fpr: 3.704%, time: 0.033 min
[ Epoch 12 ]




  - (Training)   loss:  68.72892, accuracy: 90.748 %, f1: 91.304%, fpr: 19.048%, time: 0.095 min




  - (Test)       loss:  36.98653, accuracy: 90.401 %, f1: 93.617%, fpr: 13.043%, time: 0.032 min
[ Epoch 13 ]




  - (Training)   loss:  67.32765, accuracy: 90.934 %, f1: 75.000%, fpr: 15.000%, time: 0.095 min




  - (Test)       loss:  36.29257, accuracy: 90.386 %, f1: 83.333%, fpr: 14.286%, time: 0.032 min
[ Epoch 14 ]




  - (Training)   loss:  65.95870, accuracy: 91.315 %, f1: 85.714%, fpr: 12.000%, time: 0.094 min




  - (Test)       loss:  37.60416, accuracy: 90.105 %, f1: 97.959%, fpr: 4.762%, time: 0.032 min
[ Epoch 15 ]




  - (Training)   loss:  62.98937, accuracy: 91.614 %, f1: 92.308%, fpr: 3.448%, time: 0.095 min




  - (Test)       loss:  38.54480, accuracy: 90.416 %, f1: 86.486%, fpr: 4.000%, time: 0.032 min
[ Epoch 16 ]




  - (Training)   loss:  61.65721, accuracy: 91.890 %, f1: 86.667%, fpr: 7.407%, time: 0.095 min




  - (Test)       loss:  34.89068, accuracy: 90.819 %, f1: 88.372%, fpr: 12.500%, time: 0.032 min
[ Epoch 17 ]




  - (Training)   loss:  61.55452, accuracy: 91.867 %, f1: 90.909%, fpr: 18.182%, time: 0.093 min




  - (Test)       loss:  36.18091, accuracy: 90.763 %, f1: 90.196%, fpr: 10.526%, time: 0.033 min
[ Epoch 18 ]




  - (Training)   loss:  59.23915, accuracy: 92.100 %, f1: 85.000%, fpr: 17.391%, time: 0.103 min




  - (Test)       loss:  37.46159, accuracy: 90.747 %, f1: 86.275%, fpr: 23.810%, time: 0.032 min
[ Epoch 19 ]




  - (Training)   loss:  58.03652, accuracy: 92.429 %, f1: 93.333%, fpr: 14.286%, time: 0.093 min


                                                    

  - (Test)       loss:  38.17297, accuracy: 90.442 %, f1: 95.833%, fpr: 0.000%, time: 0.033 min




In [53]:
task.mark_completed()
task.close()

2022-06-21 21:35:38,994 - clearml.Task - INFO - Waiting to finish uploads
2022-06-21 21:35:39,100 - clearml.Task - INFO - Finished uploading
