In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset CSV
# stored under MyDrive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip

In [3]:
# !unzip /content/crawl-300d-2M.vec.zip

In [4]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

import io

import tensorflow as tf

from sklearn import metrics
from sklearn.model_selection import KFold

In [5]:
# Load the cleaned dataset from the mounted Drive folder.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like
# a saved index - confirm whether read_csv should use index_col=0.
df = pd.read_csv('/content/drive/MyDrive/Clean_Dataset.csv')

In [6]:
# Show the loaded frame (the notebook renders a truncated first/last-rows preview).
df

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,score kill pakistan clash,False,polar,worldnews
1,1201232075,2008-01-25,2,0,japan resume refuel mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,us press egypt gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,jumpstart economy give health care,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,council europe bash euun terror blacklist,False,mhermans,worldnews
...,...,...,...,...,...,...,...,...
509231,1479816764,2016-11-22,5,0,heil trump donald trump altright white nationa...,False,nonamenoglory,worldnews
509232,1479816772,2016-11-22,1,0,people speculate could madeleine mccann,False,SummerRay,worldnews
509233,1479817056,2016-11-22,1,0,professor receive arab researchers award,False,AUSharjah,worldnews
509234,1479817157,2016-11-22,1,0,nigel farage attack response trump ambassador ...,False,smilyflower,worldnews


# Supervised - Predicting number of up votes a post might get

In [7]:
# Every tokenized title is padded/truncated to this many token ids (see run()).
MAX_LEN = 12
# Batch sizes passed to the training / validation DataLoaders in run().
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
# Upper bound on training epochs per fold; run() may stop earlier.
EPOCHS = 10

In [8]:
class Dataset:
  """Map-style dataset pairing padded title token ids with up-vote counts."""

  def __init__(self, title, up_votes):
    """
      :param title: 2-D numpy array, one row of token ids per sample
      :param up_votes: 1-D numpy array with one target value per sample
    """
    self.title, self.up_votes = title, up_votes

  def __len__(self):
    # one sample per row of the title matrix
    return len(self.title)

  def __getitem__(self, item):
    # `item` is the integer position of the requested sample;
    # both fields are converted to torch tensors on access
    token_row = self.title[item, :]
    target = self.up_votes[item]
    sample = {}
    sample["title"] = torch.tensor(token_row, dtype=torch.int64)
    sample["up_votes"] = torch.tensor(target, dtype=torch.float32)
    return sample

In [9]:
import torch
import torch.nn.functional as F

class Emb_CNN_LSTM_Model(torch.nn.Module):
    """Embedding -> two Conv1d/pool stages -> LSTM -> linear regression head.

    Input is a ``(batch, seq_len)`` tensor of integer token ids; output is a
    ``(batch, output_dim)`` tensor.

    Fixes relative to the first version of this class:

    * the embedding output is moved to channels-first before ``Conv1d``
      (``Conv1d`` convolves over the last axis and reads channels from axis 1);
    * convolutions are zero-padded and the second pooling stage is adaptive,
      so short sequences (e.g. 12 tokens) no longer fail with "kernel size
      can't be greater than actual input size" and the LSTM always receives
      ``LSTM_STEPS`` steps, which is what the 128-wide linear head requires;
    * the input dropout is a registered sub-module, so ``model.eval()``
      disables it (a ``nn.Dropout`` created inside ``forward`` is always in
      training mode);
    * the LSTM is ``batch_first`` so the batch axis is not treated as time;
    * the LSTM output is flattened with ``flatten(1)`` instead of
      ``squeeze()``, which dropped the batch axis for single-row batches.

    :param input_dim: number of rows in the embedding table; must be larger
        than the largest token id passed to ``forward`` (the default of 0
        creates an empty table and is only a placeholder)
    :param embedding_dim: size of each token embedding
    :param output_dim: number of regression outputs
    :param hidden_dim, batch_size, num_layers, bidirectional, dropout:
        stored on the instance but not used by any layer in this class
    """

    # The CNN output is pooled to this many steps before the LSTM, so that
    # LSTM_STEPS * LSTM_HIDDEN == 128, the width of the linear head.
    LSTM_STEPS = 4
    LSTM_HIDDEN = 32

    def __init__(self, input_dim=0, embedding_dim=12, hidden_dim=20, output_dim=1,
                 batch_size=8, num_layers=2, bidirectional=False, dropout=0):
        super().__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.fc_hidden_dim = self.hidden_dim

        if self.bidirectional:
            self.fc_hidden_dim = self.hidden_dim * 2

        self.embedding = nn.Embedding(self.input_dim, self.embedding_dim)
        # dropout on the embedded tokens; registered so train()/eval() control it
        self.dropout0 = nn.Dropout(0.3)

        # padding=2 with kernel 5 keeps the sequence length unchanged
        self.conv1D1 = nn.Conv1d(embedding_dim, 32, 5, padding=2)
        self.dropout1 = nn.Dropout(0.3)
        # ceil_mode keeps the trailing partial window instead of dropping it
        self.maxp1 = nn.MaxPool1d(5, ceil_mode=True)

        self.dropout2 = nn.Dropout(0.4)
        self.conv1D2 = nn.Conv1d(32, 64, 5, padding=2)
        # adaptive pooling yields exactly LSTM_STEPS steps for any input length
        self.maxp2 = nn.AdaptiveMaxPool1d(self.LSTM_STEPS)
        # defined for attribute compatibility; forward() does not apply it
        self.dropout3 = nn.Dropout(0.4)

        # LSTM
        self.lstm = nn.LSTM(64, self.LSTM_HIDDEN, batch_first=True)
        self.dropout4 = nn.Dropout(0.3)
        # linear
        self.dense = nn.Linear(self.LSTM_STEPS * self.LSTM_HIDDEN, self.output_dim)

    def forward(self, opcode):
        """Return predictions of shape ``(batch, output_dim)``.

        :param opcode: ``(batch, seq_len)`` long tensor of token ids
        """
        embedded = self.embedding(opcode)              # (B, L, E)
        x = self.dropout0(embedded)
        # CNN expects channels-first input
        cnn_x = x.transpose(1, 2)                      # (B, E, L)
        cnn_x = self.conv1D1(cnn_x)                    # (B, 32, L)
        cnn_x = torch.tanh(cnn_x)
        cnn_x = self.dropout1(cnn_x)
        cnn_x = self.maxp1(cnn_x)                      # (B, 32, ceil(L / 5))
        cnn_x = self.dropout2(cnn_x)
        cnn_x = self.conv1D2(cnn_x)                    # (B, 64, ceil(L / 5))
        cnn_x = torch.tanh(cnn_x)
        cnn_x = self.maxp2(cnn_x)                      # (B, 64, LSTM_STEPS)

        cnn_x = torch.transpose(cnn_x, 1, 2)           # (B, LSTM_STEPS, 64)
        # LSTM
        lstm_out, _ = self.lstm(cnn_x)                 # (B, LSTM_STEPS, 32)
        lstm_out = self.dropout4(lstm_out)
        # linear: flatten everything except the batch axis
        lstm_out = lstm_out.flatten(start_dim=1)       # (B, 128)
        cnn_lstm_out = self.dense(lstm_out)

        return cnn_lstm_out

In [10]:
def get_scheduler(optimizer, scheduler):
  """
    Build a learning-rate scheduler from its name.
    :param optimizer: torch optimizer the scheduler will drive
    :param scheduler: one of 'ReduceLROnPlateau', 'CosineAnnealingLR',
      'CosineAnnealingWarmRestarts'; any non-string value is returned
      unchanged so an already-built scheduler can be passed through
    :return: the scheduler instance
    :raises ValueError: if `scheduler` is a string that is not a known name
  """
  if not isinstance(scheduler, str):
    return scheduler
  if scheduler == 'ReduceLROnPlateau':
    # `verbose` is no longer passed: recent PyTorch releases deprecate it on
    # schedulers; read the current rate from optimizer.param_groups instead
    return ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=4, eps=1e-6)
  if scheduler == 'CosineAnnealingLR':
    return CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6, last_epoch=-1)
  if scheduler == 'CosineAnnealingWarmRestarts':
    return CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1)
  # previously an unknown name fell through and the raw string was returned,
  # which only failed later at scheduler.step()
  raise ValueError(f"Unknown scheduler name: {scheduler!r}")

def train(data_loader, model, optimizer, device, scheduler):
  """
    Train the model for one epoch with L1 (mean absolute error) loss,
    then advance the learning-rate scheduler once.
    :param data_loader: iterable of batches; each batch is a dict with a
      "title" tensor of token ids and an "up_votes" tensor of targets
    :param model: torch model mapping token ids to predictions
    :param optimizer: torch optimizer, e.g. adam, sgd, etc.
    :param device: this can be "cuda" or "cpu"
    :param scheduler: learning-rate scheduler stepped once per epoch, or
      None to skip scheduling
  """
  # set model to training mode
  model.train()
  # build the loss module once instead of once per batch
  loss_fn = nn.L1Loss()
  running_loss = None
  n_batches = 0
  # go through batches of data in data loader
  for data in data_loader:
    # fetch titles and targets from the dict and move them to the device
    titles = data["title"].to(device, dtype=torch.long)
    up_votes = data["up_votes"].to(device, dtype=torch.float)
    # clear the gradients
    optimizer.zero_grad()
    # make predictions from the model
    predictions = model(titles)
    # both sides are shaped (batch, 1) so the loss never broadcasts
    loss = loss_fn(predictions.view(-1, 1), up_votes.view(-1, 1))
    # compute gradients for all trainable parameters
    loss.backward()
    # single optimization step
    optimizer.step()
    # accumulate on-device; converted to a Python float once, after the loop
    batch_loss = loss.detach()
    running_loss = batch_loss if running_loss is None else running_loss + batch_loss
    n_batches += 1

  if scheduler is None:
    return
  if isinstance(scheduler, ReduceLROnPlateau):
    # ReduceLROnPlateau.step() needs the monitored metric; use the mean
    # training loss of this epoch (nothing to report if there were no batches)
    if n_batches:
      scheduler.step(running_loss.item() / n_batches)
  else:
    scheduler.step()
  
def evaluate(data_loader, model, device):
  """
    Run the model over a data loader without gradient tracking.
    :param data_loader: iterable of batches; each batch is a dict with a
      "title" tensor of token ids and an "up_votes" tensor of targets
    :param model: torch model mapping token ids to predictions
    :param device: this can be "cuda" or "cpu"
    :return: (predictions, targets) as two flat Python lists of floats,
      aligned element by element
  """
  # initialize empty lists to store predictions and targets
  final_predictions = []
  final_targets = []
  # put the model in eval mode
  model.eval()
  # disable gradient calculation
  with torch.no_grad():
    for data in data_loader:
      title = data["title"].to(device, dtype=torch.long)
      # make predictions
      predictions = model(title)
      # flatten to one value per sample and move to cpu before converting;
      # the earlier version referenced an undefined `targets` variable here
      final_predictions.extend(predictions.reshape(-1).cpu().tolist())
      final_targets.extend(
        data["up_votes"].to(dtype=torch.float).reshape(-1).cpu().tolist()
      )

  # return final predictions and targets
  return final_predictions, final_targets

In [11]:
def load_vectors(fname):
  """
    Read a fastText-style .vec text file into a dictionary.
    Adapted from: https://fasttext.cc/docs/en/english-vectors.html
    The first line must hold two integers (word count and dimension);
    every following non-blank line is "<word> <float> <float> ...".
    :param fname: path to the .vec file
    :return: dict mapping each word to its vector as a list of floats
    :raises ValueError: if the header line is not two integers or a
      vector component is not a number
  """
  # the context manager closes the file even if parsing fails part-way
  with io.open(
    fname,
    'r',
    encoding='utf-8',
    newline='\n',
    errors='ignore'
  ) as fin:
    # header is parsed only to validate and skip it; the values are unused
    _n_words, _dim = map(int, fin.readline().split())
    data = {}
    for line in fin:
      stripped = line.rstrip()
      if not stripped:
        # blank lines would otherwise create a bogus '' -> [] entry
        continue
      tokens = stripped.split(' ')
      data[tokens[0]] = list(map(float, tokens[1:]))
  return data

def create_embedding_matrix(word_index, embedding_dict, vector_length=300):
  """
    This function creates the embedding matrix.
    :param word_index: a dictionary with word:index_value
    :param embedding_dict: a dictionary with word:embedding_vector
    :param vector_length: width of every embedding vector (default 300,
      the value that used to be hard-coded)
    :return: a numpy array of shape (len(word_index) + 1, vector_length);
      row i holds the vector of the word whose index is i, and rows for
      words missing from embedding_dict (plus row 0) stay all zeros
  """
  # initialize matrix with zeros; the extra row leaves index 0 unused
  embedding_matrix = np.zeros((len(word_index) + 1, vector_length))
  # copy over the vector of every word that has a pre-trained embedding
  for word, i in word_index.items():
    vector = embedding_dict.get(word)
    if vector is not None:
      embedding_matrix[i] = vector

  # return embedding matrix
  return embedding_matrix

def load_embeddings(word_index, embedding_file, vector_length=300):
  """
    Build an embedding matrix while keeping only the vectors that are
    actually needed, so the full embedding file never sits in memory.
    For each vocabulary word the lookup tries the word itself, then its
    capitalized form, then its upper-case form.
    :param word_index: word:index dictionary
    :param embedding_file: path to embeddings file
    :param vector_length: length of vector
    :return: numpy array of shape (len(word_index) + 1, vector_length);
      rows of words without a usable vector stay all zeros
  """
  max_features = len(word_index) + 1

  # every spelling the lookup below may ask for; the upper-case variant was
  # previously missing here, so the upper-case fallback could never match
  words_to_find = set()
  for word in word_index.keys():
    words_to_find.add(word)
    words_to_find.add(str(word).capitalize())
    words_to_find.add(str(word).upper())

  embeddings_index = {}
  # the context manager closes the file; the handle used to be leaked
  with open(embedding_file, encoding='utf-8', errors='ignore') as fin:
    for line in fin:
      # short lines (such as the "<count> <dim>" header) are skipped
      if len(line) <= 100:
        continue
      if line.split(" ")[0] not in words_to_find:
        continue
      word, *values = line.strip().split(" ")
      embeddings_index[word] = np.asarray(values, dtype='float32')

  embedding_matrix = np.zeros((max_features, vector_length))
  for word, i in word_index.items():
    if i >= max_features:
      continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
      embedding_vector = embeddings_index.get(str(word).capitalize())
    if embedding_vector is None:
      embedding_vector = embeddings_index.get(str(word).upper())
    # vectors of unexpected length are ignored rather than raising
    if (embedding_vector is not None
      and len(embedding_vector) == vector_length):
        embedding_matrix[i] = embedding_vector

  return embedding_matrix

In [12]:
def run(train_df, valid_df, fold):
  """
    Run training and validation for a given fold
    :param train_df: pandas dataframe with `title` and `up_votes` columns
      used for fitting
    :param valid_df: pandas dataframe with the same columns, used for
      validation after every epoch
    :param fold: current fold, int (only used in the progress output)
  """

  print("Fitting tokenizer")
  # we use tf.keras for tokenization
  # you can use your own tokenizer and then you can
  # get rid of tensorflow
  tokenizer = tf.keras.preprocessing.text.Tokenizer()
  # fit on the training fold only: this previously read the notebook-level
  # `df`, a hidden global that also exposed validation text to the vocabulary
  tokenizer.fit_on_texts(train_df.title.values.tolist())

  # convert each title to a sequence of word indices,
  # for example "bad movie" -> [24, 27]
  xtrain = tokenizer.texts_to_sequences(train_df.title.values)
  xtest = tokenizer.texts_to_sequences(valid_df.title.values)

  # zero pad the sequences to MAX_LEN
  # padding is done on the left hand side
  # if a sequence is > MAX_LEN, it is truncated on the left hand side too
  xtrain = tf.keras.preprocessing.sequence.pad_sequences(
    xtrain, maxlen=MAX_LEN
  )
  xtest = tf.keras.preprocessing.sequence.pad_sequences(
    xtest, maxlen=MAX_LEN
  )

  # initialize dataset class for training
  train_dataset = Dataset(
    title=xtrain,
    up_votes=train_df.up_votes.values
  )
  # torch dataloader loads the data using the dataset
  # class in batches specified by batch size
  train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE
  )
  # initialize dataset class for validation
  valid_dataset = Dataset(
    title=xtest,
    up_votes=valid_df.up_votes.values
  )
  # create torch dataloader for validation
  valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
  )

  # Pre-trained fastText vectors are intentionally not loaded: reading
  # /content/crawl-300d-2M.vec with load_vectors() exhausted the RAM.
  # The embedding layer is therefore trained from scratch.

  # use the GPU when one is available, otherwise fall back to the CPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  # the embedding table needs one row per tokenizer index plus row 0 for
  # padding; the constructor default (input_dim=0) would create an empty table
  model = Emb_CNN_LSTM_Model(input_dim=len(tokenizer.word_index) + 1)
  # send model to device
  model.to(device)

  # initialize Adam optimizer
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
  scheduler = get_scheduler(optimizer, 'CosineAnnealingWarmRestarts')
  print("Training Model")
  # lower is better, so start from +inf; starting from 0 meant that no
  # epoch could ever count as an improvement
  best_mae = float("inf")
  # number of epochs that did not improve on the best score
  early_stopping_counter = 0
  # train and validate for all epochs
  for epoch in range(EPOCHS):
    # train one epoch
    train(train_data_loader, model, optimizer, device, scheduler)
    # validate
    outputs, targets = evaluate(
      valid_data_loader, model, device
    )
    # mean absolute error, matching the L1 loss used for training
    # (this used to call mean_squared_error while reporting it as mae)
    mae = metrics.mean_absolute_error(targets, outputs)
    print(
      f"FOLD:{fold}, Epoch: {epoch}, mae = {mae}"
    )
    # simple early stopping: give up after three non-improving epochs
    if mae < best_mae:
      best_mae = mae
    else:
      early_stopping_counter += 1

    if early_stopping_counter > 2:
      break

In [13]:
# Cast every title to str in place (pandas renders missing values as the text
# 'nan'); presumably so the tokenizer only ever receives strings - TODO confirm.
df['title'] = df['title'].astype(str)

In [None]:
# 5-fold cross-validation (KFold default) over the two columns the model needs.
cv_frame = df[['title', 'up_votes']]
for fold, (tr_in, val_in) in enumerate(KFold().split(cv_frame)):
  # KFold yields positional indices, so select rows with .iloc; .loc is
  # label-based and only matched while df kept its default RangeIndex
  tr = cv_frame.iloc[tr_in].reset_index(drop=True)
  val = cv_frame.iloc[val_in].reset_index(drop=True)
  run(tr, val, fold)

# Note

I have not trained this model yet: I do not have a GPU, the dataset is huge, and the model I have created is large too. Once I have access to a sufficiently powerful machine, I can run it.

I tried Google Colab, but because of its resource limits I get an out-of-memory error even with batch size = 1.
Other than that, everything is fine.