<a href="https://colab.research.google.com/github/Hernanros/SOTA/blob/master/notebooks/nonlinear/%5BSS%5Dprediction%20scores%20for%20whole%20dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
import pandas as pd
import numpy as np
import os, urllib, glob, sys
from getpass import getpass
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn import preprocessing

In [2]:
user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format
cmd_string = "! git clone https://{0}:{1}@github.com/Hernanros/SOTA".format(user, password)

os.system(cmd_string)
cmd_string, password = "", "" # removing the password from the variable

%cd SOTA/data

User name: ShaulSolomon
Password: ··········
/content/SOTA/data


In [113]:
df = pd.read_csv("/content/SOTA/data/combined_data_with_predictions_on_separate_datasets.csv")

In [114]:
df.head(1)

Unnamed: 0,dataset,label,text_1,text_2,bleu_allwords,bleu_withoutstop,glove_allwords,glove_withoutstop,ftext_allwords,ftext_withoutstop,WMD,1-gram_overlap,2-gram_overlap,3-gram_overlap,4-gram_overlap,ROUGE-1 recall,ROUGE-1 precision,ROUGE-1 F,ROUGE-2 recall,ROUGE-2 precision,ROUGE-2 F,ROUGE-L recall,ROUGE-L precision,ROUGE-L F,chrf_score,chrf_score_norm,POS dist score,text_1_tokens,text_2_tokens,L2_score,bert,Predictions
0,2012.MSRpar.test.tsv,4.4,The problem likely will mean corrective change...,He said the problem needs to be corrected befo...,0.375739,0.333333,96.2,90.82,77.23,77.39,3,0.4,0.133333,0.0,0.0,0.466667,0.368421,0.411765,0.214286,0.166667,0.1875,0.466667,0.368421,0.411765,0.536815,2.684077,3.055075,"['The', 'problem', 'likely', 'will', 'mean', '...","['He', 'said', 'the', 'problem', 'needs', 'to'...",10.527886,0.926813,2.988756


# Running MLP Model on the entire dataset

In [140]:
df_all = df.drop(columns=["dataset","text_1","text_2","text_1_tokens","text_2_tokens","Predictions"])

In [141]:
#Put label at the end of the df
cols = df_all.columns.to_list()
cols.remove("label")
cols.append("label")
df_all = df_all[cols]

In [142]:
len(df_all.columns)

26

In [143]:
class DS(Dataset):
    def __init__(self,df):
        super().__init__()
        X = df.iloc[:,:-1] 
        column_names = list(X.columns) 
        x = X.values #returns a numpy array
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 5))
        x_scaled = min_max_scaler.fit_transform(x)
        self.df = np.array(pd.DataFrame(x_scaled, columns=column_names))
        self.labels = np.array(df.iloc[:,-1])

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        feat = self.df[idx,:]
        label = self.labels[idx]        

        return feat,label

In [144]:
class Basemodel(nn.Module):
  
  def __init__(self,n_feature,n_hidden,n_output, keep_probab = 0.1):
    '''
    input : tensor of dimensions (batch_size*n_feature)
    output: tensor of dimension (batchsize*1)
    '''
    super().__init__()
  
    self.input_dim = n_feature    
    self.hidden = nn.Linear(n_feature, n_hidden) 
    self.predict = torch.nn.Linear(n_hidden, n_output)
    self.dropout = nn.Dropout(keep_probab)
    # self.pool = nn.MaxPool2d(2, 2)
    # self.norm = nn.BatchNorm2d(self.num_filters)


  def forward(self, x):
    x = F.relu(self.dropout(self.hidden(x)))
    x = self.predict(x)
    return x

In [145]:
num_features = len(cols)-1
num_hl = 128
num_output = 1

DATA_SIZE = df_all.shape[0]
PERC_TRAIN = 0.8
PERC_TEST = 1 - PERC_TRAIN
TRAIN_SIZE = int(DATA_SIZE*PERC_TRAIN)
TEST_SIZE = DATA_SIZE - TRAIN_SIZE

model = Basemodel(num_features,num_hl,num_output)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [146]:
def train_epoch(tr_loader,model,criterion,optimizer, num_epochs):
    if torch.has_cuda:
      device = torch.device('cuda:0')
      model.to(device)
    else:
      device = torch.device('cpu:0')
    
    
    training_log =[]

    for epoch in range(num_epochs):
      print("started training epoch no. {}".format(epoch+1))
      tr_loss = 0
      for step,batch in enumerate(tr_loader):
            feats,labels = batch
            feats = feats.to(device,dtype=torch.float32)
            labels = labels.to(device,dtype=torch.float32)
            outputs = model(feats)
            loss = criterion(outputs, labels)
            loss.backward()
            tr_loss+=loss.item()
            optimizer.step()
            optimizer.zero_grad()

      training_log.append({
                'epoch':epoch,
                'train_loss':tr_loss / len(tr_loader),
                })
      
    return training_log
  
def test_evaluation(tst_loader,model,criterion): 
    if torch.has_cuda:
      device = torch.device('cuda:0')
      model.to(device)
    else:
      device = torch.device('cpu:0')
     
    model.eval()

    test_loss = 0

    for step,batch in enumerate(tst_loader):
        feats, labels = batch
      
        feats = feats.to(device,dtype=torch.float32)
        labels = labels.to(device,dtype=torch.float32)
        outputs = model(feats)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

    return test_loss / TEST_SIZE

In [147]:
rand_list = list(range(df_all.shape[0]))
np.random.seed(42)
np.random.shuffle(rand_list)
train_idx = rand_list[:int(len(rand_list)*PERC_TRAIN)]
test_idx = rand_list[int(len(rand_list)*PERC_TEST):]

In [148]:
train_set = DS(df_all.iloc[train_idx,:])
test_set = DS(df_all.iloc[test_idx,:])
train_loader=DataLoader(dataset= train_set, batch_size = 4, shuffle = True, num_workers = 2)
test_loader=DataLoader(dataset= test_set, batch_size = 4, shuffle = True, num_workers = 2)

In [149]:
train_epoch(train_loader,model,criterion,optimizer,num_epochs= 100)

started training epoch no. 1


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training epoch no. 35
started training 

[{'epoch': 0, 'train_loss': 1.2260447753358859},
 {'epoch': 1, 'train_loss': 1.2201950534463193},
 {'epoch': 2, 'train_loss': 1.2139071656850609},
 {'epoch': 3, 'train_loss': 1.2135979788737994},
 {'epoch': 4, 'train_loss': 1.2092380053581566},
 {'epoch': 5, 'train_loss': 1.2106864707545568},
 {'epoch': 6, 'train_loss': 1.2085914891411371},
 {'epoch': 7, 'train_loss': 1.2062474128365206},
 {'epoch': 8, 'train_loss': 1.2080143700340527},
 {'epoch': 9, 'train_loss': 1.2047713756010825},
 {'epoch': 10, 'train_loss': 1.2057216532066197},
 {'epoch': 11, 'train_loss': 1.2047072243759895},
 {'epoch': 12, 'train_loss': 1.2028801119894752},
 {'epoch': 13, 'train_loss': 1.2022913705630132},
 {'epoch': 14, 'train_loss': 1.2007414274511508},
 {'epoch': 15, 'train_loss': 1.2009135119780028},
 {'epoch': 16, 'train_loss': 1.1987184838965292},
 {'epoch': 17, 'train_loss': 1.1972668147654304},
 {'epoch': 18, 'train_loss': 1.1980588520609503},
 {'epoch': 19, 'train_loss': 1.1965154926716097},
 {'epoch':

In [1]:
all_dataset_loss = test_evaluation(test_loader,model,criterion)

NameError: ignored

In [151]:
all_dataset_loss

1.1900177375185181

In [152]:
features = np.array(df_all.iloc[:,:-1])
labels = np.array(df_all.iloc[:,-1])

In [153]:
if torch.has_cuda:
  device = torch.device('cuda:0')
  model.to(device)
else:
  device = torch.device('cpu:0')

predictions = model(torch.tensor(features, dtype=torch.float32).to(device))
predictions  = predictions.cpu().detach().numpy()

In [154]:
df_pred = df.copy()
df_pred['Predictions_MLP'] = pd.Series(predictions.reshape(-1))
df_pred.to_csv("combined_data_with_predictions_both.csv")

In [155]:
!git add ./combined_data_with_predictions_both.csv
!git config --global user.email "shaulsolomon@gmail.com"
!git config --global user.name "Shaul Solomon"
!git commit -m "Added MLP pred to whole dataset with linear pred"
!git push

[master d6f0e3e] Added MLP pred to whole dataset with linear pred
 1 file changed, 24902 insertions(+), 24902 deletions(-)
To https://github.com/Hernanros/SOTA
 ! [rejected]        master -> master (fetch first)
error: failed to push some refs to 'https://ShaulSolomon:iamaHippo1492@github.com/Hernanros/SOTA'
hint: Updates were rejected because the remote contains work that you do
hint: not have locally. This is usually caused by another repository pushing
hint: to the same ref. You may want to first integrate the remote changes
hint: (e.g., 'git pull ...') before pushing again.
hint: See the 'Note about fast-forwards' in 'git push --help' for details.


# For each dataset seperately

In [156]:
df_each = df.drop(columns=["text_1","text_2","text_1_tokens","text_2_tokens","Predictions"]).groupby("dataset")

In [None]:
log = {}
all_pred = {}

for name, df_group_each in df_each:

  df_group = df_group_each.copy()
  df_group.drop(columns="dataset",inplace=True)
  cols = df_group.columns.to_list()
  cols.remove("label")
  cols.append("label")
  df_group = df_group[cols]

  num_features = df_group.shape[1] - 1
  num_hl = 128
  num_output = 1

  DATA_SIZE = df_group.shape[0]
  PERC_TRAIN = 0.8
  PERC_TEST = 1 - PERC_TRAIN
  TRAIN_SIZE = int(DATA_SIZE*PERC_TRAIN)
  TEST_SIZE = DATA_SIZE - TRAIN_SIZE

  model = Basemodel(num_features,num_hl,num_output)
  criterion = nn.L1Loss()
  optimizer = optim.Adam(model.parameters(), lr=1e-3)

  rand_list = list(range(df_group.shape[0]))
  np.random.seed(42)
  np.random.shuffle(rand_list)
  train_idx = rand_list[:int(len(rand_list)*PERC_TRAIN)]
  test_idx = rand_list[int(len(rand_list)*PERC_TEST):]
  train_set = DS(df_group.iloc[train_idx,:])
  test_set = DS(df_group.iloc[test_idx,:])

  train_loader=DataLoader(dataset= train_set, batch_size = 4, shuffle = True, num_workers = 2)
  test_loader=DataLoader(dataset= test_set, batch_size = 4, shuffle = True, num_workers = 2)

  train_epoch(train_loader,model,criterion,optimizer,num_epochs= 100)
  loss = test_evaluation(test_loader,model,criterion)
  log[name] = loss

  features = np.array(df_all.iloc[:,:-1])
  labels = np.array(df_all.iloc[:,-1])

  if torch.has_cuda:
    device = torch.device('cuda:0')
    model.to(device)
  else:
    device = torch.device('cpu:0')

  predictions = model(torch.tensor(features, dtype=torch.float32).to(device))
  predictions  = predictions.cpu().detach().numpy()


  all_pred[name] = pd.Series(predictions.reshape(-1))

print(log)

started training epoch no. 1


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2
started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training e

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training epoch no. 35
started training 

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2
started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training e

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2


  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training epoch no. 35
started training 

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training epoch no. 35
started training 

  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2
started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28
started training epoch no. 29
started training epoch no. 30
started training epoch no. 31
started training epoch no. 32
started training epoch no. 33
started training epoch no. 34
started training e

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


started training epoch no. 2
started training epoch no. 3
started training epoch no. 4
started training epoch no. 5
started training epoch no. 6
started training epoch no. 7
started training epoch no. 8
started training epoch no. 9
started training epoch no. 10
started training epoch no. 11
started training epoch no. 12
started training epoch no. 13
started training epoch no. 14
started training epoch no. 15
started training epoch no. 16
started training epoch no. 17
started training epoch no. 18
started training epoch no. 19
started training epoch no. 20
started training epoch no. 21
started training epoch no. 22
started training epoch no. 23
started training epoch no. 24
started training epoch no. 25
started training epoch no. 26
started training epoch no. 27
started training epoch no. 28


In [None]:
frames = []
df_pred = df.copy()
for name, group in df_pred.groupby("dataset"):
  group['Predictions_MLP'] = all_pred[name]
  frames.append(group)

In [None]:
full_prediction_result = pd.concat(frames)

In [None]:
full_prediction_result.head(3)

In [None]:
full_prediction_result.to_csv('combined_data_with_predictions_on_separate_datasets_both.csv', index=False)

In [None]:
#Shouldve turned it into a variable before, but saving it now
log['all_dataset_loss'] = all_dataset_loss

In [None]:
pd.DataFrame.from_dict(log,orient="index").to_csv('test_loss_on_MLP.csv', index=True)

In [None]:
!git add ./combined_data_with_predictions_on_separate_datasets_both.csv
!git add ./test_loss_on_MLP.csv
!git config --global user.email "shaulsolomon@gmail.com"
!git config --global user.name "Shaul Solomon"
!git commit -m "Added MLP pred for all datasets"
!git push