In [None]:
from google.colab import drive
import os
import json
import torch
from torch import nn
import numpy as np
import random
random.seed(12382592)

if not os.path.isfile('train.jl.parsed.txt'):
  !wget https://www.dropbox.com/s/z0lx7zbsz1lct76/train.jl.parsed.txt
if not os.path.isfile('dev.jl.parsed.txt'):
  !wget https://www.dropbox.com/s/r29u2lfvagaq48t/dev.jl.parsed.txt
if not os.path.isfile('test.jl.parsed.txt'):
  !wget https://www.dropbox.com/s/3eqx58oe49bwe69/test.jl.parsed.txt

BIN_MODE = [1,0,0]
CNN_MODE = [0,1,0]
LSTM_MODE = [0,0,1]

In [None]:
# Pick the encoder to train by leaving exactly one of these assignments
# uncommented.
#OUR_MODE = BIN_MODE
OUR_MODE = CNN_MODE
#OUR_MODE = LSTM_MODE


# Unpack the selected one-hot list into one flag per encoder type.
DOING_BIN, DOING_CNN, DOING_LSTM = OUR_MODE

In [None]:
def loadDataSet(path):
  """Lazily read a JSON-lines file and yield one parsed object per line.

  Every yielded object has its 'posts' list sorted in ascending order of the
  integer value of each post's 'date' field.

  Args:
    path: path of the JSON-lines file to read.

  Yields:
    The decoded object of each non-blank line, in file order.

  Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  # JSON text is UTF-8; do not depend on the platform's default encoding.
  with open(path, 'r', encoding='utf-8') as f:
    for l in f:
      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
      if not l.strip():
        continue
      obj = json.loads(l)
      obj['posts'] = sorted(obj['posts'], key=lambda o: int(o['date']))
      yield obj

In [None]:
# Emotion category name -> integer id.
categories = {
    'surprise': 0,
    'sadness': 1,
    'joy': 2,
    'love': 3,
    'anger': 4,
    'fear': 5,
}
# Class name -> integer id.
labels = {
    'control': 0,
    'depression': 1,
}
# Reverse lookups: integer id -> name.
categories_rev = dict(zip(categories.values(), categories.keys()))
labels_rev = dict(zip(labels.values(), labels.keys()))

In [None]:
def _load_long_histories(path):
  """Load the split at `path`, keeping only objects with more than 25 posts."""
  return [user for user in loadDataSet(path) if len(user['posts']) > 25]

train = _load_long_histories('train.jl.parsed.txt')
dev = _load_long_histories('dev.jl.parsed.txt')
test = _load_long_histories('test.jl.parsed.txt')

In [None]:
# Balance the training set by oversampling: duplicate objects with a truthy
# 'label' until there are as many of them as objects with a falsy 'label'.
all_1_in_train = [obj for obj in train if obj['label']]
# How many extra positive copies are needed: (#negatives - #positives).
total_to_add = len(train) - 2 * len(all_1_in_train)
# Nothing to do when there are no positives to copy (the original loop would
# never terminate) or when positives already match/outnumber the negatives
# (random.sample rejects a negative sample size).
if all_1_in_train and total_to_add > 0:
    # Add as many whole copies of the positive set as fit...
    full_copies, remainder = divmod(total_to_add, len(all_1_in_train))
    train += all_1_in_train * full_copies
    # ...then draw the rest without replacement.
    train += random.sample(all_1_in_train, remainder)

In [None]:
def bin_list(l):
    """Split a list into TOTAL_BINS equally sized, FILLER-padded bins.

    When TOTAL_BINS is falsy the items of `l` are yielded one by one,
    unchanged. Otherwise exactly TOTAL_BINS lists are yielded, in order; every
    item of `l` lands in exactly one bin, and each bin is right-padded with
    FILLER so that all bins share the same length, ceil(len(l) / TOTAL_BINS),
    with a minimum length of 1.

    Args:
      l: list of items to split.

    Yields:
      The items of `l` (no binning) or one list per bin.
    """
    if not TOTAL_BINS:
      for x in l: yield x
    else:
      n = len(l)
      # Ceiling division, so trailing items are kept instead of being dropped;
      # at least 1 so that a bin is never empty.
      size = max(1, -(-n // TOTAL_BINS))
      for i in range(TOTAL_BINS):
        # Proportional boundaries spread the items as evenly as possible.
        r = list(l[n * i // TOTAL_BINS:n * (i + 1) // TOTAL_BINS])
        # Pad short bins so every bin has the same length.
        r += [FILLER] * (size - len(r))
        yield r

def run_batch(batch):
    """Encode a batch of objects and run it through the global `model`.

    For every object in `batch`, the 'cat' value of each of its 'posts' is
    collected in the stored post order, passed through `bin_list`, and the
    resulting nested list is handed to `model`.

    Returns:
      Whatever `model` returns for the encoded batch.
    """
    encoded = []
    for user in batch:
        post_cats = [post['cat'] for post in user['posts']]
        encoded.append(list(bin_list(post_cats)))

    return model(encoded)

def batchify(iterable, n=1):
    """Yield consecutive slices of `iterable`, each holding at most `n` items.

    `iterable` must support len() and slicing; the last slice may be shorter.
    """
    total = len(iterable)
    yield from (iterable[start:start + n] for start in range(0, total, n))

def acc(res, batch):
  """Count prediction outcomes for one batch at every value in `thresholds`.

  Column 1 of `res` is used as the per-item score for class 1; an item is
  predicted as class 1 when that score is strictly above `thresh / 100`.

  Args:
    res: model output for the batch (a tensor indexed as [item, class]).
    batch: the objects the output was computed for; their 'label' is the gold.

  Returns:
    Four lists, each with one entry per threshold:
      - number of items whose prediction equals the gold label,
      - number of gold-positive items predicted correctly,
      - number of items predicted positive,
      - number of gold-positive items (identical for every threshold).
  """
  gold = np.array([obj['label'] for obj in batch])
  dep_scores = res.detach().numpy()[:, 1]

  corr, corr_dep, pred_dep = [], [], []
  for thresh in thresholds:
    # Boolean mask -> 0/1 integers so it can be compared with the gold labels.
    cur_preds = (dep_scores > (thresh / 100)).astype(int)
    hits = cur_preds == gold
    corr.append(hits.sum())
    # gold == 1 and prediction correct -> correctly detected positive.
    corr_dep.append((gold & hits).sum())
    pred_dep.append(cur_preds.sum())

  # The gold count does not depend on the threshold; repeat it for alignment.
  gold_dep = [gold.sum()] * len(thresholds)
  return corr, corr_dep, pred_dep, gold_dep

def print_acc_str(total, all_correct, all_dep_correct, all_dep_pred, all_dep, thresh_values=None):
    """Format accuracy, precision and recall for every threshold.

    Despite its name this function prints nothing; it returns the text.

    Args:
      total: number of items evaluated.
      all_correct: per-threshold count of correct predictions.
      all_dep_correct: per-threshold count of correctly predicted positives.
      all_dep_pred: per-threshold count of items predicted positive.
      all_dep: per-threshold count of gold positives.
      thresh_values: thresholds matching the sequences above; defaults to the
        module-level `thresholds`.

    Returns:
      A string with one tab-indented line per threshold, each preceded by a
      newline. A percentage whose denominator is zero is rendered as `nan`.
    """
    if thresh_values is None:
      thresh_values = thresholds

    def pct(num, den):
      # A zero denominator (e.g. nothing predicted positive) would raise
      # ZeroDivisionError for plain ints and emit a RuntimeWarning for numpy
      # scalars; report nan explicitly instead.
      return round(num / den * 100, 2) if den else float('nan')

    ret_strs = []
    for thresh,total_correct,total_dep_correct,total_dep_pred,total_dep in zip(thresh_values, all_correct, all_dep_correct, all_dep_pred, all_dep):
      ret_strs.append(f'Thresh={thresh}:  Acc: {pct(total_correct, total)}% ({total_correct}/{total}),  Prec: {pct(total_dep_correct, total_dep_pred)}% ({total_dep_correct}/{total_dep_pred}), Recall: {pct(total_dep_correct, total_dep)}% ({total_dep_correct}/{total_dep})')
    return '\n\t' + '\n\t'.join(ret_strs)

def run_val(data, suf='Val'):
    """Evaluate the global `model` on `data` and print a summary line.

    Puts the model in eval mode, runs every batch without gradient tracking,
    accumulates the per-threshold counts returned by `acc` and the batch
    losses, then prints the formatted metrics and the mean loss per batch,
    prefixed with `suf`.

    Args:
      data: list of objects to evaluate.
      suf: label printed in front of the results.
    """
    model.eval()
    with torch.no_grad():
        # Running sums, in the order acc() returns them:
        # correct, correct positives, predicted positives, gold positives.
        totals = [np.zeros(len(thresholds)) for _ in range(4)]
        overall_loss = 0
        seen = 0
        batches_seen = 0

        for batch in batchify(data, n=BATCH_SIZE):
            seen += len(batch)
            batches_seen += 1
            # Progress counter, rewritten in place via the carriage return.
            print(f'  {seen}/{len(data)}   \r', end='')

            res = run_batch(batch)
            for running, part in zip(totals, acc(res, batch)):
                running += part

            gold = torch.tensor([obj['label'] for obj in batch])
            overall_loss += loss(res, gold).detach().numpy()

        summary = print_acc_str(seen, *totals)
        print(f' {suf}: {summary},   Loss: {overall_loss / batches_seen}')

In [None]:
class Embeddings(nn.Module):
    """Embedding lookup that accepts plain (nested) lists of integer ids."""

    def __init__(self, dim, vocab):
        """Create an embedding table with `vocab` rows of size `dim`."""
        super().__init__()
        self.lp = nn.Embedding(num_embeddings=vocab, embedding_dim=dim)

    def forward(self, x):
        """Convert `x` to a tensor of ids and return their embeddings."""
        indices = torch.tensor(x)
        return self.lp(indices)

class BinEncoder(nn.Module):
  """Average the embeddings inside each bin and concatenate the bin vectors."""

  def __init__(self):
    super().__init__()

  def forward(self, x):
      """Collapse (N, B, X, E) into (N, B * E).

      N is the batch size (currently has to be 1), B the number of bins,
      X the bin size and E the embedding size. The X axis is averaged and the
      B per-bin vectors are then concatenated.
      """
      per_bin_mean = x.mean(dim=2)
      return per_bin_mean.flatten(start_dim=1, end_dim=2)

class CNNEncoder(nn.Module):
    """Parallel 1-D convolutions over time, each followed by ReLU and a max-pool."""

    def __init__(self, embed, kernels=(5, 3)):
        """Build one Conv1d per kernel size.

        Args:
          embed: embedding size; used as both input and output channel count.
          kernels: iterable of kernel sizes, one convolution per entry.
        """
        super().__init__()
        # nn.ModuleList registers the convolutions as sub-modules. With a plain
        # Python list their weights are missing from model.parameters(), so the
        # optimizer never updates them.
        self.conv = nn.ModuleList([nn.Conv1d(embed, embed, k) for k in kernels])

    def forward(self, x):
        """Encode a (N, X, E) tensor into (N, E' * K').

        N is the batch size, X the number of timepoints, E the embedding size,
        E' the number of output channels and K' the number of kernels.
        X must be at least as large as the largest kernel size.
        """
        # Conv1d expects channels before time: (N, E, X).
        x = x.permute(0, 2, 1)
        # One feature map per kernel size.
        new_x = [torch.relu(c(x)) for c in self.conv]
        # Max-pool over the whole time axis, keeping it as a size-1 axis.
        new_x = [torch.max(sub_x, 2, keepdim=True)[0] for sub_x in new_x]
        # Concatenate the pooled maps on the last axis and flatten per item.
        union = torch.cat(new_x, 2)
        union = union.reshape(union.size(0), -1)
        return union

class LSTMEncoder(nn.Module):
    """Two-layer LSTM that summarises a sequence by its final hidden state."""

    def __init__(self, in_embed, out_embed):
        """Create a batch-first LSTM mapping `in_embed` inputs to `out_embed` hidden units."""
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=in_embed,
            hidden_size=out_embed,
            num_layers=2,
            batch_first=True,
        )

    def forward(self, x):
        """Encode a (N, X, E) tensor into (N, E').

        N is the batch size, X the number of timepoints, E the input embedding
        size and E' the output embedding size.
        """
        _outputs, final_state = self.rnn(x)
        hidden = final_state[0]
        # Final hidden state of the last (top) layer.
        return hidden[-1]

class MLP2L(nn.Module):
    """MLP with two tanh hidden layers and a softmax output.

    The output rows are class probabilities, not logits. nn.CrossEntropyLoss
    applies its own log-softmax, so feeding these probabilities to it
    normalises them twice.
    """

    def __init__(self, dim, hid1,hid2, out):
        """Create the layers.

        Args:
          dim: input feature size.
          hid1: size of the first hidden layer.
          hid2: size of the second hidden layer.
          out: number of output classes.
        """
        super(MLP2L, self).__init__()
        self.ll1 = nn.Linear(dim, hid1)
        self.ll2 = nn.Linear(hid1, hid2)
        self.ll3 = nn.Linear(hid2, out)

        # Dropout layers are created but currently not applied in forward().
        self.do0 = nn.Dropout(p=0.2)
        self.do1 = nn.Dropout(p=0.2)
        self.do2 = nn.Dropout(p=0.2)

    def forward(self, x):
        """Map features of shape (..., dim) to class probabilities (..., out)."""
        #x = self.do0(x)
        x = self.ll1(x)
        x = torch.tanh(x)
        #x = self.do1(x)
        x = self.ll2(x)
        x = torch.tanh(x)
        #x = self.do2(x)
        x = self.ll3(x)
        # Normalise over the class axis explicitly; calling softmax without
        # `dim` relies on a deprecated implicit choice and emits a warning.
        return nn.functional.softmax(x, dim=-1)



In [None]:
# Hyper-parameters.
VOCAB_SIZE = len(categories) + 1 # one extra id for the filler
FILLER = len(categories)         # id used to pad short bins
EMBED_DIM = 20
TOTAL_BINS = 10 if DOING_BIN else 0
HIDDEN_DIM1 = 30
HIDDEN_DIM2 = 10
# Kernel sizes of the CNN encoder. The classifier input width is derived from
# this list, so changing it cannot silently break the layer shapes.
CNN_KERNELS = [3, 3, 3]

OUTPUT_DIM = len(labels)

# Exactly one encoder must be selected; otherwise `model` would be undefined
# or an earlier choice silently overwritten by a later one.
if sum(bool(flag) for flag in (DOING_BIN, DOING_CNN, DOING_LSTM)) != 1:
  raise ValueError('exactly one of DOING_BIN, DOING_CNN, DOING_LSTM must be set')

# build our model: embeddings -> encoder -> MLP classifier
if DOING_BIN:
  model = nn.Sequential(Embeddings(EMBED_DIM, VOCAB_SIZE), BinEncoder(), MLP2L(EMBED_DIM * TOTAL_BINS, HIDDEN_DIM1, HIDDEN_DIM2, OUTPUT_DIM))
elif DOING_CNN:
  # todo netanel: play with kernels - number and count
  model = nn.Sequential(Embeddings(EMBED_DIM, VOCAB_SIZE), CNNEncoder(EMBED_DIM, CNN_KERNELS), MLP2L(EMBED_DIM * len(CNN_KERNELS), HIDDEN_DIM1, HIDDEN_DIM2,  OUTPUT_DIM))
else:
  # todo netanel: set output embed better
  model = nn.Sequential(Embeddings(EMBED_DIM, VOCAB_SIZE), LSTMEncoder(EMBED_DIM, EMBED_DIM * 2), MLP2L(EMBED_DIM * 2, HIDDEN_DIM1, HIDDEN_DIM2, OUTPUT_DIM))


optim = torch.optim.Adam(model.parameters(),lr=0.001)

# The model's MLP2L head ends in a softmax, so the model emits probabilities.
# nn.CrossEntropyLoss expects unnormalised logits and applies log-softmax
# itself; using it here would normalise twice, flattening the gradients and
# bounding the loss away from zero. Take the log of the probabilities and use
# the negative log-likelihood instead, which is the cross-entropy.
_nll_loss = nn.NLLLoss()

def loss(probs, target):
    """Cross-entropy between predicted probabilities and gold class indices.

    Args:
      probs: tensor of shape (N, C) whose rows are class probabilities.
      target: tensor of shape (N,) holding the gold class index of each row.

    Returns:
      Scalar tensor with the mean negative log-likelihood over the batch.
    """
    # Clamp before the log so an exact 0 probability cannot produce -inf.
    return _nll_loss(torch.log(probs.clamp_min(1e-12)), target)

# Sanity check of the class balance: total items and number of positives.
print(len(train), sum(obj['label'] for obj in train))

# Decision thresholds in percent: 10, 20, ..., 90.
thresholds = np.arange(1, 10) * 10

BATCH_SIZE = 1 #more than 1 doesn't work at the moment, since we don't pad
# Train for 40 epochs; after each epoch print the training metrics and evaluate
# on the dev and test sets.
for epoch in range(40):
    # shuffle the train so every epoch visits the items in a different order
    random.shuffle(train)

    # run_val() leaves the model in eval mode, so switch back to training mode
    model.train()

    # per-epoch running counts, one slot per threshold
    total_seen = 0
    total_correct = np.zeros(len(thresholds))
    total_dep_correct = np.zeros(len(thresholds))
    total_dep_pred = np.zeros(len(thresholds))
    total_dep = np.zeros(len(thresholds))
    # loss accumulated since the last progress print
    overall_loss = 0
    # go through each batch
    for batch in batchify(train, n=BATCH_SIZE):
        # clear the gradients left over from the previous step
        optim.zero_grad()
        # run through
        res = run_batch(batch)

        # print some info
        total_seen += len(batch)
        #print(f'{total_seen}/{len(train)}, loss: ', cur_loss.detach().numpy())
        # acc() returns (correct, correct positives, predicted positives,
        # gold positives), each as one value per threshold
        cur_acc = acc(res, batch)
        total_correct += cur_acc[0]
        total_dep_correct += cur_acc[1]
        total_dep_pred += cur_acc[2]
        total_dep += cur_acc[3]

        # cross_entropy loss and update!
        cur_loss = loss(res, torch.tensor([obj['label'] for obj in batch]))

        #cur_loss = cur_loss * weights
        cur_loss.backward()
        optim.step()

        overall_loss += cur_loss.detach().numpy()
        # every 1000 items, print the accumulated loss divided by 1000 and reset
        # it; this is a per-item mean only while BATCH_SIZE is 1
        if total_seen % 1000 == 0:
            print('loss: ', overall_loss / 1000)
            overall_loss = 0

    # training metrics for the whole epoch, then evaluation on dev and test
    print(f' Finished epoch, average {print_acc_str(total_seen, total_correct, total_dep_correct, total_dep_pred, total_dep)})')
    run_val(dev)
    run_val(test, 'Test')

8488 4244




loss:  0.6952391835451126
loss:  0.6855512961149216
loss:  0.686567260235548
loss:  0.6758496512770653
loss:  0.6859613520205021
loss:  0.6855971951186657
loss:  0.6834656703174115
loss:  0.6894517486393452




 Finished epoch, average 
	Thresh=10:  Acc: 50.0% (4244.0/8488),  Prec: 50.0% (4244.0/8488.0), Recall: 100.0% (4244.0/4244.0)
	Thresh=20:  Acc: 49.95% (4240.0/8488),  Prec: 49.98% (4239.0/8482.0), Recall: 99.88% (4239.0/4244.0)
	Thresh=30:  Acc: 50.45% (4282.0/8488),  Prec: 50.23% (4067.0/8096.0), Recall: 95.83% (4067.0/4244.0)
	Thresh=40:  Acc: 53.78% (4565.0/8488),  Prec: 52.61% (3238.0/6155.0), Recall: 76.3% (3238.0/4244.0)
	Thresh=50:  Acc: 55.75% (4732.0/8488),  Prec: 56.25% (2195.0/3902.0), Recall: 51.72% (2195.0/4244.0)
	Thresh=60:  Acc: 55.25% (4690.0/8488),  Prec: 57.79% (1654.0/2862.0), Recall: 38.97% (1654.0/4244.0)
	Thresh=70:  Acc: 52.29% (4438.0/8488),  Prec: 59.36% (615.0/1036.0), Recall: 14.49% (615.0/4244.0)
	Thresh=80:  Acc: 50.33% (4272.0/8488),  Prec: 58.24% (99.0/170.0), Recall: 2.33% (99.0/4244.0)
	Thresh=90:  Acc: 50.0% (4244.0/8488),  Prec: nan% (0.0/0.0), Recall: 0.0% (0.0/4244.0))
 Val: 
	Thresh=10:  Acc: 22.25% (1223.0/5497),  Prec: 22.25% (1223.0/5497.0), Re