<a href="https://colab.research.google.com/github/Hernanros/SOTA/blob/master/notebooks/nonlinear/%5BSS%5Dbase_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import os, urllib, glob, sys
from getpass import getpass

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format
cmd_string = "! git clone https://{0}:{1}@github.com/Hernanros/SOTA".format(user, password)

os.system(cmd_string)
cmd_string, password = "", "" # removing the password from the variable

%cd SOTA/data

User name: shaulsolomon
Password: ··········
/content/SOTA/data


In [4]:
df = pd.read_csv("labeled_data.csv")

In [5]:
df.drop(columns=["Unnamed: 0","text_1","text_2"],inplace=True)

In [6]:
#Put label at the end of the df
cols = df.columns.to_list()
cols.remove("label")
cols.append("label")
df = df[cols]

In [7]:
class DS(Dataset):
    def __init__(self,df):
        super().__init__()
        self.df = np.array(df.iloc[:,:-1])
        self.labels = np.array(df.iloc[:,-1])

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        feat = self.df[idx,:]
        label = self.labels[idx]        

        return feat,label

In [8]:
class Basemodel(nn.Module):
  
  def __init__(self,n_feature,n_hidden,n_output, keep_probab = 0.1):
    '''
    input : tensor of dimensions (batch_size*n_feature)
    output: tensor of dimension (batchsize*1)
    '''
    super().__init__()
  
    self.input_dim = n_feature    
    self.hidden = nn.Linear(n_feature, n_hidden) 
    self.predict = torch.nn.Linear(n_hidden, n_output)
    self.dropout = nn.Dropout(keep_probab)
    # self.pool = nn.MaxPool2d(2, 2)
    # self.norm = nn.BatchNorm2d(self.num_filters)


  def forward(self, x):
    x = F.relu(self.dropout(self.hidden(x)))
    x = self.predict(x)
    return x

In [85]:
num_features = len(cols)-1
num_hl = 128
num_output = 1

DATA_SIZE = df.shape[0]
PERC_TRAIN = 0.8
PERC_TEST = 1 - PERC_TRAIN
TRAIN_SIZE = int(DATA_SIZE*PERC_TRAIN)
TEST_SIZE = DATA_SIZE - TRAIN_SIZE

model = Basemodel(num_features,num_hl,num_output)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [86]:
def train_epoch(tr_loader,model,criterion,optimizer, num_epochs):
    if torch.has_cuda:
      device = torch.device('cuda:0')
      model.to(device)
    else:
      device = torch.device('cpu:0')
    
    
    training_log =[]

    for epoch in range(num_epochs):
      print("started training epoch no. {}".format(epoch+1))
      tr_loss = 0
      for step,batch in enumerate(tr_loader):
            feats,labels = batch
            feats = feats.to(device,dtype=torch.float32)
            labels = labels.to(device,dtype=torch.float32)
            outputs = model(feats)
            loss = criterion(outputs, labels)
            loss.backward()
            tr_loss+=loss.item()
            optimizer.step()
            optimizer.zero_grad()

      training_log.append({
                'epoch':epoch,
                'train_loss':tr_loss / len(tr_loader),
                })
      
    return training_log
  
def test_evaluation(tst_loader,model,criterion): 
    if torch.has_cuda:
      device = torch.device('cuda:0')
      model.to(device)
    else:
      device = torch.device('cpu:0')
     
    model.eval()

    test_loss = 0

    for step,batch in enumerate(tst_loader):
        feats, labels = batch
      
        feats = feats.to(device,dtype=torch.float32)
        labels = labels.to(device,dtype=torch.float32)
        outputs = model(feats)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

    return test_loss / TEST_SIZE

In [87]:
rand_list = list(range(df.shape[0]))
np.random.seed(43)
np.random.shuffle(rand_list)
train_idx = rand_list[:int(len(rand_list)*PERC_TRAIN)]
test_idx = rand_list[int(len(rand_list)*PERC_TEST):]

In [88]:
train_set = DS(df.iloc[train_idx,:])
test_set = DS(df.iloc[test_idx,:])
train_loader=DataLoader(dataset= train_set, batch_size = 4, shuffle = True, num_workers = 2)
test_loader=DataLoader(dataset= test_set, batch_size = 4, shuffle = True, num_workers = 2)

In [89]:
train_epoch(train_loader,model,criterion,optimizer,num_epochs= 10)

started training epoch no. 1


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10


[{'epoch': 0, 'train_loss': 0.8271152579784393},
 {'epoch': 1, 'train_loss': 0.703391557186842},
 {'epoch': 2, 'train_loss': 0.701278246641159},
 {'epoch': 3, 'train_loss': 0.6899133661389351},
 {'epoch': 4, 'train_loss': 0.683928554803133},
 {'epoch': 5, 'train_loss': 0.7032977233827115},
 {'epoch': 6, 'train_loss': 0.6848453582823276},
 {'epoch': 7, 'train_loss': 0.6789194586873054},
 {'epoch': 8, 'train_loss': 0.6923054607212543},
 {'epoch': 9, 'train_loss': 0.6752397535741329}]

In [90]:
test_evaluation(test_loader,model,criterion)

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


0.6642314529418946

In [73]:
df.columns[:-1]

Index(['POS dist score', '1-gram_overlap', 'chrf_score_norm', 'WMD',
       'ROUGE-1 recall', 'ROUGE-1 precision', 'ROUGE-1 F', 'ROUGE-2 recall',
       'ROUGE-2 precision', 'ROUGE-2 F', 'ROUGE-L recall', 'ROUGE-L precision',
       'ROUGE-L F', 'BertScore', 'L2_score'],
      dtype='object')

In [82]:
features = pd.DataFrame(df.columns[:-1], columns=["metric"])
features['weights'] = model.hidden.weight.data.cpu().numpy()[0,:]

In [84]:
features.sort_values(by=['weights'])

Unnamed: 0,metric,weights
5,ROUGE-1 precision,-0.252879
8,ROUGE-2 precision,-0.167672
3,WMD,-0.157324
9,ROUGE-2 F,-0.103899
6,ROUGE-1 F,-0.032104
4,ROUGE-1 recall,0.032366
2,chrf_score_norm,0.036325
14,L2_score,0.079129
13,BertScore,0.095431
0,POS dist score,0.122724
