## DataLoading and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import json
import nltk
from nltk.corpus import stopwords
stpwrds = set(stopwords.words('english'))

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import re

def preprocess_text(text):
    text = re.sub("\n", "", text)
    text = re.sub("https://(www)?.\w+.\w+\S*", "", text)
    text = re.sub("\d+", "", text)
    return text

In [4]:
def preprocess_df(path):
    
    data = pd.read_csv("train.tsv", delimiter="\t")

    data["alchemy_category_score"] = data["alchemy_category_score"].replace("?", 0)
    data["alchemy_category_score"] = data["alchemy_category_score"].astype("float")
    zero_vals = data["alchemy_category_score"].value_counts()[0]
    mean_val = sum(np.array(data["alchemy_category_score"]))/(len(data)-zero_vals)
    data["alchemy_category_score"] = data["alchemy_category_score"].replace(0.000000, mean_val)
    
    data = pd.get_dummies(data, columns=["alchemy_category"], prefix=["alchemy"])
    data = data.drop("alchemy_?", axis=1)
    data = pd.get_dummies(data, columns=["news_front_page"])
    data = data.drop("news_front_page_?", axis=1)
    data = pd.get_dummies(data, columns=["is_news"])
    data = data.drop("is_news_?", axis=1)

    data["temp"] = [json.loads(boiler) for boiler in data["boilerplate"]]
    data["title"] = [boiler.get("title", "") for boiler in data["temp"]]
    data["body"] = [boiler.get("body", "") for boiler in data["temp"]]
    
    df = data
    for i in range(len(df)):
        if(df["title"][i]==None):
            df["title"][i]=""
        if(df["body"][i]==None):
            df["body"][i]=""
        else:
            df["title"][i] = preprocess_text(df["title"][i])
            df["body"][i] = preprocess_text(df["body"][i])

    df["text"] = [title + body for title, body in zip(df["title"], df["body"])]
    df["text"] = [text.split(" ") for text in df["text"]]
    df["text"] = [[word for word in text if word not in stpwrds] for text in df["text"]]
    df["text"] = [text[:300] for text in df["text"]]
    for i in range(len(df)):
        if(len(df["text"][i])<300):
            num = 300-len(df["text"][i])
            df["text"][i] += "<pad>"*num
    df["text"] = [" ".join(text) for text in df["text"]]
    df = df.drop(["temp", "boilerplate", "url", "title", "body"], axis=1)
    
    return df

In [5]:
training_data = preprocess_df("train.tsv")
test_data = preprocess_df("test.tsv")

In [6]:
training_data.head(5)

Unnamed: 0,urlid,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,...,alchemy_recreation,alchemy_religion,alchemy_science_technology,alchemy_sports,alchemy_unknown,alchemy_weather,news_front_page_0,news_front_page_1,is_news_1,text
0,4042,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,...,0,0,0,0,0,0,1,0,1,IBM Sees Holographic Calls Air Breathing Batte...
1,8471,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,0.468649,0.0,0,...,1,0,0,0,0,0,1,0,1,The Fully Electronic Futuristic Starting Gun T...
2,1164,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,...,0,0,0,0,0,0,1,0,1,Fruits Fight Flu fruits fight flu | cold & flu...
3,6684,0.801248,1.543103,0.4,0.1,0.016667,0.0,0.480725,0.0,0,...,0,0,0,0,0,0,1,0,1,Foolproof Tips Better Sleep There period life...
4,9006,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,0.446143,0.0,0,...,0,0,0,1,0,0,1,0,1,The Coolest Jerseys You Didn Know Existed coo...


## Creating datasets and dataloaders

In [7]:
import torch
from torch.utils.data import DataLoader, Dataset
import torchtext
from torchtext.data import Field
# from torchtext.data.utils import get_tokenizer

In [8]:
train_df = training_data.sample(frac=0.8)
val_df = training_data.drop(train_df.index)

In [9]:
# tokenizer = get_tokenizer('spacy', language='en')
text_field = Field(
    tokenize="basic_english", 
    lower=True
)

preprocessed_text = train_df['text'].apply(lambda x: text_field.preprocess(x))

text_field.build_vocab(
    preprocessed_text, 
    vectors = "glove.6B.100d",
    unk_init = torch.Tensor.normal_,
    min_freq=4
)


# get the vocab instance
vocab = text_field.vocab
print(len(vocab))

18802


In [10]:
class CustomDataset(Dataset):
    def __init__(self, df):
        
        df_num = df.drop(["text"], axis=1)
        df_num = df_num.drop(["label"], axis=1)
        
        num_data = [c.values for n,c in df_num.items()]
        
        self.texts = df.text.values
        self.num_data = np.stack(num_data, 1).astype(np.float32) 
        self.target = df.label.values

    def __len__(self): 
        return len(self.target)
    
    def __getitem__(self, idx):

        return [self.texts[idx], self.num_data[idx], self.target[idx]]

In [11]:
trn_ds = CustomDataset(train_df)
val_ds = CustomDataset(val_df)
test_ds = CustomDataset(test_data)

bs = 64
train_dl = DataLoader(trn_ds, bs, shuffle=True)
val_dl = DataLoader(val_ds, bs, shuffle=False)
test_dl = DataLoader(val_ds, bs, shuffle=True)

## Model

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x228861a7888>

In [13]:
class classifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, vocab_vectors, hidden, numerical_features):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.emb.weight = nn.Parameter(vocab_vectors)
        
        self.lstm = nn.LSTM(emb_dim, hidden)
        
        self.fc1 = nn.Linear(hidden+numerical_features+1, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32,1)
        self.sig = nn.Sigmoid()
        
    def forward(self, text, num_ip):
        text = torch.tensor([vocab[word] for word in text[0].split(" ")], dtype=torch.long)
        text = self.emb(text)
        sh1, sh2 = text.shape
        text = text.view(1, sh1, sh2)
        text = self.lstm(text)
        _, sh3, sh4 = text[0].shape
        text_ip = text[0].view(sh3, sh4)
        
        if(text_ip.shape[0]!=num_ip.shape[0]):
            diff = text_ip.shape[0] - num_ip.shape[0]
            num_ip  = F.pad(num_ip, (0,0,0,diff), "constant", 0)
        
        ip = torch.cat((text_ip, num_ip), dim=1)
        
        ip = self.relu(self.fc1(ip))
        
        op = self.sig(self.fc2(ip))
        
        return op

In [14]:
vocab_s = len(vocab)
embedding_dim = 100
hidden_dim = 64
numerical_features = 36
vector_for_vocab = vocab.vectors

model = classifier(vocab_s, embedding_dim, vector_for_vocab, hidden_dim, numerical_features)

In [15]:
print(model)

classifier(
  (emb): Embedding(18802, 100)
  (lstm): LSTM(100, 64)
  (fc1): Linear(in_features=101, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,925,993 trainable parameters


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

import torch.optim as optim


#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
#push to cuda if available
model = model.to(device)
# criterion = criterion.to(device)

## Training and Validation

In [18]:
def train_model(model, train_dl, val_dl, n_epochs=100, lr=1e-3):
        "Run training loops."
        epochs = n_epochs
        opt = optim.Adam(model.parameters(), lr=lr)
        loss_func = nn.MSELoss()

        try:
            for epoch in range(epochs):
                model.train()
                for text, num, target in train_dl:
                    preds = model(text, num)
                    loss = loss_func(preds, target.float())
                    
                    loss.backward()
                    opt.step()
                    opt.zero_grad()
                    
                model.eval()
                with torch.no_grad():
                    loss_val = sum(loss_func(model(text_v, num_v), 
                                             target_v.float()) 
                                   for text_v, num_v, target_v in val_dl)
                   
                if(epoch%10==0):
                    print(epoch, float(loss_val / len(val_dl)))

        except Exception as e:
            exception = e
            

In [19]:
train_model(model, train_dl, val_dl)

0 0.27345046401023865
10 0.27333393692970276
20 0.27324193716049194
30 0.27322033047676086
40 0.278088241815567
50 0.2780504524707794
60 0.278089314699173
70 0.2781567871570587
80 0.27807751297950745
90 0.2780914008617401


In [20]:
def predict_test(model, test_dl):
    "Returns predictions over test_df."
    model.eval()
    preds = [model(xv1, xv2) for xv1, xv2, _ in test_dl][0]
    return preds
#     targs = [yv for _, _, yv in test_dl][0]
#     test_human_readable['targ_pred'] = preds.data.detach().numpy()
#     return torch.argmax(preds, dim=1).data.detach().numpy(), 
#            test_human_readable
preds = predict_test(model, test_dl)
print(preds)

tensor([[0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [1.7046e-34],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0