<a href="https://colab.research.google.com/github/Hernanros/SOTA/blob/master/notebooks/nonlinear/%5BSS%5Dbase_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import os, urllib, glob, sys
from getpass import getpass

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format
cmd_string = "! git clone https://{0}:{1}@github.com/Hernanros/SOTA".format(user, password)

os.system(cmd_string)
cmd_string, password = "", "" # removing the password from the variable

%cd SOTA/data

User name: ShaulSolomon
Password: ··········
/content/SOTA/data


In [3]:
df = pd.read_csv("labeled_data.csv")

In [4]:
df.drop(columns=["Unnamed: 0","text_1","text_2"],inplace=True)

In [5]:
#Put label at the end of the df
cols = df.columns.to_list()
cols.remove("label")
cols.append("label")
df = df[cols]

In [6]:
class DS(Dataset):
    def __init__(self,df):
        super().__init__()
        self.df = np.array(df.iloc[:,:-1])
        self.labels = np.array(df.iloc[:,-1])

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        feat = self.df[idx,:]
        label = self.labels[idx]        

        return feat,label

In [7]:
class Basemodel(nn.Module):
  
  def __init__(self,n_feature,n_hidden,n_output, keep_probab = 0.1):
    '''
    input : tensor of dimensions (batch_size*n_feature)
    output: tensor of dimension (batchsize*1)
    '''
    super().__init__()
  
    self.input_dim = n_feature    
    self.hidden = nn.Linear(n_feature, n_hidden) 
    self.predict = torch.nn.Linear(n_hidden, n_output)
    self.dropout = nn.Dropout(keep_probab)
    # self.pool = nn.MaxPool2d(2, 2)
    # self.norm = nn.BatchNorm2d(self.num_filters)


  def forward(self, x):
    x = F.relu(self.dropout(self.hidden(x)))
    x = self.predict(x)
    return x

In [8]:
num_features = len(cols)-1
num_hl = 128
num_output = 1

DATA_SIZE = df.shape[0]
PERC_TRAIN = 0.8
PERC_TEST = 1 - PERC_TRAIN
TRAIN_SIZE = int(DATA_SIZE*PERC_TRAIN)
TEST_SIZE = DATA_SIZE - TRAIN_SIZE

model = Basemodel(num_features,num_hl,num_output)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [9]:
def train_epoch(tr_loader,model,criterion,optimizer, num_epochs):
    if torch.has_cuda:
      device = torch.device('cuda:0')
      model.to(device)
    else:
      device = torch.device('cpu:0')
    
    
    training_log =[]

    for epoch in range(num_epochs):
      print("started training epoch no. {}".format(epoch+1))
      tr_loss = 0
      for step,batch in enumerate(tr_loader):
            feats,labels = batch
            feats = feats.to(device,dtype=torch.float32)
            labels = labels.to(device,dtype=torch.float32)
            outputs = model(feats)
            loss = criterion(outputs, labels)
            loss.backward()
            tr_loss+=loss.item()
            optimizer.step()
            optimizer.zero_grad()

      training_log.append({
                'epoch':epoch,
                'train_loss':tr_loss / len(tr_loader),
                })
      
    return training_log
  
def test_evaluation(tst_loader,model,criterion): 
    if torch.has_cuda:
      device = torch.device('cuda:0')
      model.to(device)
    else:
      device = torch.device('cpu:0')
     
    model.eval()

    test_loss = 0

    for step,batch in enumerate(tst_loader):
        feats, labels = batch
      
        feats = feats.to(device,dtype=torch.float32)
        labels = labels.to(device,dtype=torch.float32)
        outputs = model(feats)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

    return test_loss / TEST_SIZE

In [10]:
rand_list = list(range(df.shape[0]))
np.random.seed(43)
np.random.shuffle(rand_list)
train_idx = rand_list[:int(len(rand_list)*PERC_TRAIN)]
test_idx = rand_list[int(len(rand_list)*PERC_TEST):]

In [11]:
train_set = DS(df.iloc[train_idx,:])
test_set = DS(df.iloc[test_idx,:])
train_loader=DataLoader(dataset= train_set, batch_size = 4, shuffle = True, num_workers = 2)
test_loader=DataLoader(dataset= test_set, batch_size = 4, shuffle = True, num_workers = 2)

In [12]:
train_epoch(train_loader,model,criterion,optimizer,num_epochs= 200)

started training epoch no. 1


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training epoch no. 35
started training 

[{'epoch': 0, 'train_loss': 0.8859830445051193},
 {'epoch': 1, 'train_loss': 0.6977964454889297},
 {'epoch': 2, 'train_loss': 0.6998442320525646},
 {'epoch': 3, 'train_loss': 0.6949249115586281},
 {'epoch': 4, 'train_loss': 0.6942613966763019},
 {'epoch': 5, 'train_loss': 0.6863221144676208},
 {'epoch': 6, 'train_loss': 0.6780826434493065},
 {'epoch': 7, 'train_loss': 0.6935897462069989},
 {'epoch': 8, 'train_loss': 0.6806354510784149},
 {'epoch': 9, 'train_loss': 0.6847915238142014},
 {'epoch': 10, 'train_loss': 0.6927295958995819},
 {'epoch': 11, 'train_loss': 0.6891531473398209},
 {'epoch': 12, 'train_loss': 0.6757026213407517},
 {'epoch': 13, 'train_loss': 0.6942389181256294},
 {'epoch': 14, 'train_loss': 0.6850704678893089},
 {'epoch': 15, 'train_loss': 0.7017373426258564},
 {'epoch': 16, 'train_loss': 0.6822302600741387},
 {'epoch': 17, 'train_loss': 0.674426079839468},
 {'epoch': 18, 'train_loss': 0.6770938548445702},
 {'epoch': 19, 'train_loss': 0.6790618096292019},
 {'epoch': 

In [None]:
test_evaluation(test_loader,model,criterion)

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


0.6450862105190754

In [None]:
df.columns[:-1]

Index(['POS dist score', '1-gram_overlap', 'chrf_score_norm', 'WMD',
       'ROUGE-1 recall', 'ROUGE-1 precision', 'ROUGE-1 F', 'ROUGE-2 recall',
       'ROUGE-2 precision', 'ROUGE-2 F', 'ROUGE-L recall', 'ROUGE-L precision',
       'ROUGE-L F', 'BertScore', 'L2_score'],
      dtype='object')

In [None]:
features = pd.DataFrame(df.columns[:-1], columns=["metric"])
features['weights'] = model.hidden.weight.data.cpu().numpy()[0,:]

In [None]:
features.sort_values(by=['weights'])

Unnamed: 0,metric,weights
0,POS dist score,-0.505415
3,WMD,-0.405754
14,L2_score,-0.279422
9,ROUGE-2 F,-0.214183
1,1-gram_overlap,-0.142205
4,ROUGE-1 recall,-0.110483
8,ROUGE-2 precision,-0.104712
10,ROUGE-L recall,-0.03612
11,ROUGE-L precision,-0.02366
6,ROUGE-1 F,-0.007301


## Get predictions

In [13]:
features = np.array(df.iloc[:,:-1])
labels = np.array(df.iloc[:,-1])

In [24]:
if torch.has_cuda:
  device = torch.device('cuda:0')
  model.to(device)
else:
  device = torch.device('cpu:0')

predictions = model(torch.tensor(features, dtype=torch.float32).to(device))
predictions  = predictions.cpu().detach().numpy()

In [26]:
df_pred = pd.read_csv("/content/SOTA/data/Paraphrase_labeled_data_with_predictions.csv")

In [31]:
df_pred['MLP predictions'] = pd.Series(predictions.reshape(-1))
df_pred.to_csv("Paraphrase_labeled_data_with_predictions_both.csv")

In [32]:

!git add ./Paraphrase_labeled_data_with_predictions_both.csv
!git config --global user.email "shaulsolomon@gmail.com"
!git config --global user.name "Shaul Solomon"
!git commit -m "Added MLP pred"
!git push

[master 5f54665] Added MLP pred
 1 file changed, 999 insertions(+)
 create mode 100644 data/Paraphrase_labeled_data_with_predictions_both.csv
Counting objects: 4, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 55.88 KiB | 5.08 MiB/s, done.
Total 4 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/Hernanros/SOTA
   6c10cc4..5f54665  master -> master
