In [None]:
# Colab-only: mount Google Drive so that paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the dataset CSV on the mounted Drive.
file_path =  "/content/drive/MyDrive/final_data.csv"
# Local alternative when the CSV sits next to the notebook:
# file_path = "final_data.csv"
data = pd.read_csv(file_path)

# Preview the first rows to check that the columns loaded as expected.
print(data.head())

      dim_1     dim_2     dim_3     dim_4     dim_5     dim_6     dim_7  \
0  0.020622 -0.043543  0.017362  0.024762 -0.020766 -0.017783 -0.023139   
1 -0.014496 -0.008514  0.001838 -0.013815 -0.023862  0.021470 -0.007265   
2  0.021526 -0.021832  0.063153 -0.033886 -0.009450 -0.035176  0.080432   
3 -0.047248  0.018531 -0.021601  0.054370 -0.024639  0.022130  0.068030   
4  0.038407 -0.026316 -0.016674 -0.013441  0.014775  0.005128  0.034026   

      dim_8     dim_9    dim_10  ...   dim_298   dim_299   dim_300  chunk  \
0  0.015411 -0.021622 -0.045250  ... -0.002109 -0.018631 -0.002061     NP   
1  0.014412  0.024713  0.020350  ... -0.026965  0.007601  0.007057    NP2   
2  0.158831 -0.014669 -0.037460  ...  0.101427 -0.090182  0.008483   VGNF   
3 -0.017688  0.005726 -0.042218  ...  0.032187  0.065817  0.067730    NP3   
4  0.008274 -0.023595 -0.039830  ...  0.047305 -0.050628 -0.014780    NP4   

   postposition  head-postag   dependency  is_arg       srl  predicate  
0            

**DATASET**
- The dataset was obtained from a published paper; each row contains a 300-dimensional word embedding (`dim_1` … `dim_300`).
- Along with the embeddings, the dataset also contains the columns `chunk`, `postposition`, `head-postag`, `dependency`, `is_arg`, `srl` and `predicate`.
- Of these, we need only the embeddings as features for the classification, and the label is `srl`.
- So, when preparing the data for this project, we use just these attributes.

In [None]:
# The 'srl' column is the classification target; rows without a label show up as NaN.
labels = data['srl']
print(labels)

0             NaN
1            ARG1
2             NaN
3        ARGM-LOC
4            ARG0
           ...   
14407         NaN
14408    ARGM-PRP
14409    ARGM-LOC
14410        ARG1
14411         NaN
Name: srl, Length: 14412, dtype: object


In [None]:
from sklearn.preprocessing import LabelEncoder
def encode_labels(data):
    """Integer-encode a collection of labels with a fresh ``LabelEncoder``.

    Args:
        data: 1-D array-like of labels to encode.

    Returns:
        A 3-tuple ``(encoded_data, uniq_labels, le)``: the integer codes for
        ``data``, the encoder's classes as a list (list position == code), and
        the fitted encoder itself for later ``inverse_transform`` calls.
    """
    le = LabelEncoder().fit(data)  # fit() hands back the encoder itself
    return le.transform(data), list(le.classes_), le

# Encode the SRL labels; `decoder` is the fitted LabelEncoder, kept so integer
# codes can be mapped back to label names later.
encoded_labels, uniq_labels, decoder = encode_labels(labels)
# Overwrites the raw 'srl' column of `data` in place with its integer codes.
data['srl'] = encoded_labels
print(uniq_labels)
# print(len(encoded_labels))

['ARG-UNDEF', 'ARG0', 'ARG1', 'ARG2', 'ARG2-ATR', 'ARG2-GOL', 'ARG2-LOC', 'ARG2-SOU', 'ARG3', 'ARGM-ADV', 'ARGM-CAU', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-EXT', 'ARGM-LOC', 'ARGM-MNR', 'ARGM-MNS', 'ARGM-MOD', 'ARGM-NEG', 'ARGM-PRP', 'ARGM-PRX', 'ARGM-TMP', nan]


**LABEL ENCODER**
- The labels need to be encoded, so we use `LabelEncoder()` to convert them into numerical values and store the encoded data.
- `fit(data)` learns the set of distinct labels, `transform(data)` maps each label to its integer code, and `classes_` gives the unique labels.
- From here on, instead of storing the labels directly, we store them in their encoded form.

In [None]:
# Earlier variant that also dropped 'chunk' and 'dependency' (embeddings only):
#redundant_cols = [ 'chunk', 'postposition', 'head-postag', 'dependency', 'is_arg', 'srl', 'predicate']
# Drop the columns that are not used as features (this includes the target 'srl').
redundant_cols = [ 'postposition', 'head-postag', 'is_arg', 'srl', 'predicate']
x = data.drop(redundant_cols,axis = 1)
# 'chunk' and 'dependency' are kept and replaced by integer codes so they can sit
# next to the numeric embedding columns.
# NOTE(review): with these two columns kept, x has more columns than the 300
# embedding dimensions; a recorded shape of (14412, 300) would match the
# commented-out variant above — confirm which feature set is intended.
x['chunk'] = LabelEncoder().fit_transform(x['chunk'])

x['dependency'] = LabelEncoder().fit_transform(x['dependency'])



print(x.shape)

(14412, 300)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, encoded_labels, test_size=0.2, random_state=42)

print(X_train.shape,X_test.shape)

(11529, 300) (2883, 300)


In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter=2000 — presumably set high so the solver converges on this data (TODO confirm).
lr = LogisticRegression(max_iter = 2000)

lr.fit(X_train, y_train)

# Predicted integer class codes for the held-out rows.
y_pred = lr.predict(X_test)

print(y_pred)


[22 22 22 ... 22  2 22]


## SRL USING THE LOGISTIC REGRESSION
- Here we use logistic regression for the classification.
- We fit the model on the training data and training labels, then run it on the test data to get the predictions.
- We then compare the predictions with the actual labels to compute the accuracy.

- In this case we get an accuracy of about 63% (0.634).

In [None]:
# Map the integer codes back to the original SRL label names for inspection.
y_pred_labels = decoder.inverse_transform(y_pred)
y_test_labels = decoder.inverse_transform(y_test)

In [None]:
# First ten true labels (first line) next to the corresponding predictions (second line).
print(y_test_labels[:10])
print(y_pred_labels[:10])

['ARGM-PRP' 'ARGM-PRP' nan nan nan nan nan 'ARG1' nan nan]
[nan nan nan nan nan nan nan 'ARG1' nan nan]


In [None]:
from sklearn.metrics import classification_report
# Decode the integer codes back to SRL label names and stringify them, so the
# report rows read 'ARG0', 'ARG1', ... (the missing-label class becomes 'nan')
# instead of lexicographically sorted code strings such as '1', '10', '11'.
# Stringifying is still needed because the decoded array mixes str and NaN.
# The variable names are kept because later cells reuse them.
y_pred_float = decoder.inverse_transform(y_pred).astype(str)
y_test_float = decoder.inverse_transform(y_test).astype(str)

# zero_division=0 reports 0.00 for classes that are never predicted without
# emitting a warning for each of them.
print(classification_report(y_test_float, y_pred_float, zero_division=0))

              precision    recall  f1-score   support

           1       0.45      0.16      0.23       189
          10       0.00      0.00      0.00        20
          11       0.00      0.00      0.00         6
          12       0.00      0.00      0.00        16
          13       0.00      0.00      0.00        18
          14       0.33      0.04      0.07       133
          15       0.40      0.03      0.06        62
          16       0.00      0.00      0.00        10
          19       0.00      0.00      0.00        17
           2       0.61      0.30      0.40       397
          21       0.38      0.16      0.22        69
          22       0.65      0.94      0.77      1770
           3       0.00      0.00      0.00        41
           4       0.17      0.01      0.03        72
           5       0.00      0.00      0.00        14
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         9
           9       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test_float, y_pred_float)

print("Accuracy when used logistic regression model",accuracy)


Accuracy when used logistic regression model 0.6344086021505376


In [None]:
# Same report as a dict -> DataFrame, transposed so each class/average is a row,
# then written to a CSV in the current working directory.
class_report = pd.DataFrame(classification_report(y_test_float, y_pred_float,output_dict=True)).transpose()
class_report.to_csv("classification_report_Logistic_Regression.csv")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SRL using SVM

In [None]:
from sklearn import svm

# Support-vector classifier with a linear kernel, fit on the same train split.
clf = svm.SVC(kernel='linear')

clf.fit(X_train, y_train)

y_pred_svm = clf.predict(X_test)

# Decode integer codes back to label names; y_test_labels is recomputed here
# from the same y_test as in the logistic-regression section.
y_pred_svm_labels = decoder.inverse_transform(y_pred_svm)
y_test_labels = decoder.inverse_transform(y_test)


In [None]:
# Per-class metrics and overall accuracy, computed directly on the integer class codes.
print(classification_report(y_test, y_pred_svm))

accuracy = accuracy_score(y_test, y_pred_svm)

print("Accuracy when used svm model",accuracy)

              precision    recall  f1-score   support

           1       0.40      0.12      0.19       189
           2       0.69      0.22      0.34       397
           3       0.00      0.00      0.00        41
           4       0.33      0.01      0.03        72
           5       0.00      0.00      0.00        14
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         9
           9       0.00      0.00      0.00        33
          10       0.00      0.00      0.00        20
          11       0.00      0.00      0.00         6
          12       0.00      0.00      0.00        16
          13       0.00      0.00      0.00        18
          14       0.00      0.00      0.00       133
          15       0.00      0.00      0.00        62
          16       0.00      0.00      0.00        10
          19       0.00      0.00      0.00        17
          21       0.60      0.04      0.08        69
          22       0.64    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# SVM report as a DataFrame (one row per class/average), written to a CSV in
# the current working directory.
class_report = pd.DataFrame(classification_report(y_test, y_pred_svm,output_dict=True)).transpose()
class_report.to_csv("classification_report_SVM.csv")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- Instead of logistic regression or another classical classifier, we can also use a neural network for this task.

### SRL using Neural Nets

In [None]:
import pandas as pd

def encode_labels(data):
    """Integer-encode a collection of labels with a fresh ``LabelEncoder``.

    Redefines the helper of the same name from the label-encoding cell earlier
    in the notebook, with the same behavior.

    Args:
        data: 1-D array-like of labels to encode.

    Returns:
        ``(encoded_data, uniq_labels, le)`` — the integer codes, the list of
        classes (list position == code) and the fitted encoder.
    """
    le = LabelEncoder().fit(data)  # fit() hands back the encoder itself
    codes = le.transform(data)
    return codes, list(le.classes_), le

# Reload the raw CSV so this section starts from unmodified data.
# NOTE(review): this reads from the working directory, unlike the Drive path
# used at the top of the notebook — confirm which location is intended.
file_path = "final_data.csv"
data = pd.read_csv(file_path)

# Replace the raw 'srl' strings by integer codes; `label_encoder` maps them back.
data['srl'] , classes , label_encoder = encode_labels(data['srl'])
print(data.head())

y_data = data['srl']
# 85% train+val / 15% test.
X_train, X_test, y_train, y_test = train_test_split(data.drop(['srl'], axis=1), y_data , test_size=0.15, random_state=42)

# Carve 10% of the remaining rows out as a validation set. Seeded like the
# split above so that Restart & Run All reproduces the same partitions.
X_train , X_val  , y_train, y_val = train_test_split(X_train , y_train , test_size=0.1, random_state=42)
print("Train shape :", X_train.shape, y_train.shape)
print("Val shape :", X_val.shape, y_val.shape)
print("Test shape :", X_test.shape, y_test.shape)



      dim_1     dim_2     dim_3     dim_4     dim_5     dim_6     dim_7  \
0  0.020622 -0.043543  0.017362  0.024762 -0.020766 -0.017783 -0.023139   
1 -0.014496 -0.008514  0.001838 -0.013815 -0.023862  0.021470 -0.007265   
2  0.021526 -0.021832  0.063153 -0.033886 -0.009450 -0.035176  0.080432   
3 -0.047248  0.018531 -0.021601  0.054370 -0.024639  0.022130  0.068030   
4  0.038407 -0.026316 -0.016674 -0.013441  0.014775  0.005128  0.034026   

      dim_8     dim_9    dim_10  ...   dim_298   dim_299   dim_300  chunk  \
0  0.015411 -0.021622 -0.045250  ... -0.002109 -0.018631 -0.002061     NP   
1  0.014412  0.024713  0.020350  ... -0.026965  0.007601  0.007057    NP2   
2  0.158831 -0.014669 -0.037460  ...  0.101427 -0.090182  0.008483   VGNF   
3 -0.017688  0.005726 -0.042218  ...  0.032187  0.065817  0.067730    NP3   
4  0.008274 -0.023595 -0.039830  ...  0.047305 -0.050628 -0.014780    NP4   

   postposition  head-postag   dependency  is_arg  srl  predicate  
0            का   

In [None]:
def concat(x_data , y_data):
  """Join features and labels side by side into a single DataFrame.

  Rows are aligned on the index; the columns of ``x_data`` come first,
  followed by those of ``y_data``.
  """
  combined = pd.concat((x_data, y_data), axis="columns")
  return combined


# Re-attach the 'srl' labels to each split so that features and labels travel
# together in one DataFrame per split.
data_train = concat(X_train , y_train)
data_val = concat(X_val , y_val)
data_test = concat(X_test , y_test)

In [None]:
import torch
import torch.utils.data as data_utils
from torch.utils.data import Dataset , DataLoader
import numpy as np

class CustomDataset(Dataset):
  """Tensor-backed dataset pairing each feature row with its 'srl' label.

  Every column other than the non-feature ones listed in ``__init__`` is
  used as a feature. Features and labels are both built with
  ``torch.Tensor(...)``, i.e. stored as float tensors.
  """

  def __init__(self , data):
    """Build the feature and label tensors from a DataFrame.

    Args:
      data: DataFrame holding the feature columns plus 'chunk',
        'postposition', 'head-postag', 'dependency', 'is_arg', 'srl' and
        'predicate' (``drop`` raises if any of them is missing).
    """
    non_feature_cols = [ 'chunk', 'postposition', 'head-postag', 'dependency', 'is_arg', 'srl', 'predicate']
    features = data.drop(columns=non_feature_cols)

    self.emb = torch.Tensor(np.array(features))
    self.label = torch.Tensor(data['srl'].values)

  def __len__(self):
    """Number of rows in the dataset."""
    return len(self.emb)

  def __getitem__(self, index):
    """Return the ``(features, label)`` pair stored at ``index``."""
    return self.emb[index] , self.label[index]


In [None]:
EMBEDDING_DIM = 300      # width of each embedding row (dim_1 .. dim_300)
NUM_HIDDEN_NODES =200    # passed to SRL_LSTM as hidden_dim (LSTM hidden size)
NUM_OUTPUT_NODES = 5     # passed as output_dim; SRL_LSTM accepts it but does not use it
NUM_CLASSES = 23         # number of encoded 'srl' classes, including the NaN (no label) class
epochs = 50              # number of passes over the training loader
batchsize = 128          # rows per DataLoader batch
learning_rate =0.00045   # step size handed to the Adam optimizer

In [None]:
# Wrap each split in a CustomDataset (feature tensor + label tensor) ...
train_dataset = CustomDataset(data_train)
val_dataset = CustomDataset(data_val)
test_dataset = CustomDataset(data_test)
# ... and batch it.
# NOTE(review): shuffle=False is also used for the training loader, so every
# epoch sees the same batches in the same order — confirm this is intended.
dataloader=DataLoader(dataset=train_dataset,batch_size=batchsize,shuffle=False)
valloader = DataLoader(dataset=val_dataset , batch_size=batchsize,shuffle=False)
testloader=DataLoader(dataset=test_dataset,batch_size=batchsize,shuffle=False)

In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F
class SRL_LSTM(nn.Module):
    """Three-layer LSTM followed by a linear layer that scores every class."""

    def __init__(self,embeddings_dim,hidden_dim,output_dim,num_class,pretrained_embeddings=None):
        """
        Args:
            embeddings_dim (int): size of each input embedding vector.
            hidden_dim (int): LSTM hidden-state size.
            output_dim (int): accepted for backward compatibility; not used.
            num_class (int): number of target classes (width of the output).
            pretrained_embeddings (numpy.array): previously trained word
                embeddings; only stored on the module, never read in
                ``forward``.
        """
        super().__init__()

        self.embeddings = pretrained_embeddings
        # Remembered so forward() can reshape without a hard-coded class count.
        self.num_class = num_class

        self.lstm = nn.LSTM(embeddings_dim,hidden_dim,num_layers=3,batch_first=True)

        self.fc = nn.Linear(hidden_dim,num_class)

    def forward(self,pretrained_embeddings):
        """Score every row of the input.

        Args:
            pretrained_embeddings (torch.Tensor): tensor of shape
                ``(1, n, embeddings_dim)``. Because the LSTM is built with
                ``batch_first=True``, the ``n`` rows are consumed as one
                sequence of length ``n``.

        Returns:
            torch.Tensor: ``(n, num_class)`` log-probabilities (each row
            log-sums to zero).
        """
        seq_len = pretrained_embeddings.size(1)

        outputs, _ = self.lstm(pretrained_embeddings)
        outputs = self.fc(outputs)

        # Previously hard-coded to 23, which broke any model built with a
        # different num_class.
        outputs = outputs.view(seq_len, self.num_class)

        return F.log_softmax(outputs, dim=1)

In [None]:
def train_part(model , dataloader , optimizer , criterion , epoch):
    """Run one training epoch and print the mean batch loss / accuracy.

    Args:
        model: module called with a ``(1, batch, features)`` tensor and
            returning ``(batch, num_classes)`` scores.
        dataloader: sized iterable of ``(features, labels)`` batches, where
            ``features`` is ``(batch, features)``.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(scores, long_labels)``.
        epoch: epoch number, used only in the printed message.

    Returns:
        tuple: ``(total_loss, total_acc)`` — the *sums* over batches of the
        batch loss and of the batch accuracy. Callers divide by
        ``len(dataloader)`` to obtain means.
    """
    # test() switches the model to eval mode and never switches back, so make
    # sure every training epoch really runs in training mode.
    model.train()

    total_loss = 0.0
    total_acc = 0.0

    for input_word_dim, y in dataloader:
        batch_size = input_word_dim.shape[0]

        # -1 infers the feature width instead of assuming 300.
        preds = model(input_word_dim.view(1, batch_size, -1))
        y = y.type(torch.long)

        loss = criterion(preds, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Tensor-level reduction instead of Python's sum() over elements.
        predicted = torch.argmax(preds, dim=1)
        total_acc += (predicted == y).sum().item() / float(batch_size)
        total_loss += loss.item()

    # Guard the report against an empty loader (would divide by zero).
    num_batches = max(len(dataloader), 1)
    print("train loss on epoch {epoch}  is {loss} and training accuracy {accuracy}".format(epoch=epoch,loss=(total_loss/num_batches),accuracy=(total_acc/num_batches)))
    return total_loss , total_acc

In [None]:
def test(model , testloader , optimizer , criterion , epoch):
    model.eval()  # Set the model to evaluation mode

    test_loss = 0.0
    test_acc = 0.0
    all_preds = []
    all_y = []

    with torch.no_grad():  # Disable gradient calculation during evaluation
        for input_word_dim, y in testloader:
            batch_size = input_word_dim.shape[0]

            preds = model(input_word_dim.view([1,batch_size,300]))
            y = y.type(torch.long)

            loss = criterion(preds, y)

            # Accumulate loss
            test_loss += loss.item()

            # Calculate accuracy
            preds = torch.argmax(preds, dim=1)
            correct = (preds == y).sum().item()
            test_acc += correct / float(batch_size)

            # Store predictions and ground truth labels
            all_preds.extend(preds.tolist())
            all_y.extend(y.tolist())

    # Calculate average loss and accuracy
    avg_loss = test_loss / len(testloader)
    avg_acc = test_acc / len(testloader)

    print("test loss on epoch {epoch} is {loss} and test accuracy {accuracy}".format(epoch=epoch, loss=avg_loss, accuracy=avg_acc))

    return all_preds, all_y, avg_loss, avg_acc
'''
    test_loss = 0.0
    test_acc=0.0
    all_preds =np.zeros(0)
    all_y =  np.zeros(0)
    for input_word_dim,y in testloader:
        batch_size = input_word_dim.shape[0]

        preds = model(input_word_dim.view([1,batch_size,300]))
        y = y.type(torch.long)

        loss = criterion(preds, y)
        #print("Loss {}".format(loss))
        #print(y)
        preds = torch.argmax(preds,dim=1)
        #print(preds)
        acc = sum(preds == y) / float(batch_size)
        #acc=model_accuracy(preds, y)


        all_preds = np.append(all_preds,np.array(preds))
        all_y = np.append(all_y,np.array(y))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        test_loss+=loss.item()
        test_acc+=acc.item()

    print("test loss on epoch {epoch}  is {loss} and test accuracy {accuracy}".format(epoch=epoch,loss=(test_loss/len(testloader)),accuracy=(test_acc/len(testloader))))
    return all_preds , all_y , test_loss , test_acc
'''



'\n    test_loss = 0.0\n    test_acc=0.0\n    all_preds =np.zeros(0)\n    all_y =  np.zeros(0)\n    for input_word_dim,y in testloader:\n        batch_size = input_word_dim.shape[0]\n\n        preds = model(input_word_dim.view([1,batch_size,300]))\n        y = y.type(torch.long)\n\n        loss = criterion(preds, y)\n        #print("Loss {}".format(loss))\n        #print(y)\n        preds = torch.argmax(preds,dim=1)\n        #print(preds)\n        acc = sum(preds == y) / float(batch_size)\n        #acc=model_accuracy(preds, y)\n\n\n        all_preds = np.append(all_preds,np.array(preds))\n        all_y = np.append(all_y,np.array(y))\n\n        optimizer.zero_grad()\n        loss.backward()\n        optimizer.step()\n\n        test_loss+=loss.item()\n        test_acc+=acc.item() \n\n    print("test loss on epoch {epoch}  is {loss} and test accuracy {accuracy}".format(epoch=epoch,loss=(test_loss/len(testloader)),accuracy=(test_acc/len(testloader))))\n    return all_preds , all_y , test_l

In [None]:
def train_model(model,dataloader,testloader,epochs,optimizer,criterion):
    """Train for ``epochs`` epochs, evaluating on ``testloader`` after each one.

    Args:
        model: network to train.
        dataloader: training batches (sized iterable).
        testloader: evaluation batches used after every epoch.
        epochs (int): number of training epochs.
        optimizer: optimizer for ``model``'s parameters.
        criterion: loss function.

    Returns:
        tuple: ``(train_loss_list, test_loss_list, train_acc_list,
        test_acc_list, all_preds, all_y, epoch_list)``. The four metric lists
        hold one per-batch mean per epoch; ``all_preds`` / ``all_y`` are the
        predictions and labels from the last evaluation pass (empty lists
        when ``epochs`` is 0).
    """
    epoch_list = []
    train_loss_list = []
    test_loss_list = []
    train_acc_list = []
    test_acc_list = []
    # Defined up front so the return statement works even when epochs == 0.
    all_preds, all_y = [], []

    num_train_batches = len(dataloader)

    for epoch in range(epochs):
        # train_part returns sums over batches, so normalize here.
        total_loss , total_acc = train_part(model , dataloader , optimizer , criterion , epoch)
        train_acc_list.append(total_acc/num_train_batches)
        train_loss_list.append(total_loss/num_train_batches)

        # test() already returns per-batch averages; dividing by
        # len(testloader) again would shrink the evaluation curves.
        all_preds , all_y , test_loss , test_acc = test(model , testloader , optimizer , criterion , epoch)
        test_acc_list.append(test_acc)
        test_loss_list.append(test_loss)

        epoch_list.append(epoch)

    return(train_loss_list,test_loss_list,train_acc_list,test_acc_list,all_preds,all_y,epoch_list)


In [None]:
import torch.optim as optim

# output_dim and pretrained_embeddings are accepted by SRL_LSTM but take no
# part in its forward pass.
model = SRL_LSTM(embeddings_dim=EMBEDDING_DIM,hidden_dim=NUM_HIDDEN_NODES,output_dim =NUM_OUTPUT_NODES,num_class=NUM_CLASSES,pretrained_embeddings=None)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
# The model's forward already ends in log_softmax; CrossEntropyLoss applies
# log-softmax once more, which leaves log-probabilities unchanged.
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for `epochs` epochs, evaluating on the validation loader after each one.
# `preds` / `Y` are the predictions and true labels from the last evaluation
# pass over the validation loader.
train_loss, val_loss, train_acc, val_acc,preds,Y,epoch_list = train_model(model,dataloader,valloader,epochs,optimizer,criterion)

In [None]:
# Final evaluation on the held-out test split; the last argument (epoch) only
# appears in the printed message.
test(model , testloader , optimizer , criterion , 1)

In [None]:
import matplotlib.pyplot as plt

# Figure 1: per-epoch accuracy on the training and validation loaders.
plt.figure(figsize=(10,5))
plt.title("Training and Validation Accuracy")
plt.plot(epoch_list, train_acc)
plt.plot(epoch_list,val_acc)
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(['Train Accuracy','Val Accuracy'])
#plt.savefig('/NN_train_val_accuracy_bilstm_50e.png')

# Figure 2: per-epoch loss on the training and validation loaders.
plt.figure(figsize=(10,5))
plt.title("Training and Validation Loss")
plt.plot(epoch_list, train_loss)
plt.plot(epoch_list,val_loss)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(['Train Loss','Val Loss'])
#plt.savefig('./NN_train_val_loss_bilstm_50e.png')

In [None]:
import os

# Y / preds are what train_model(...) returned: the labels and predictions
# from the last-epoch pass over the validation loader, not the test split.
class_report = pd.DataFrame(classification_report(Y, preds,output_dict=True)).transpose()
# to_csv does not create missing directories, so make sure ./outputs exists.
os.makedirs("./outputs", exist_ok=True)
class_report.to_csv("./outputs/classification_report_LSTM.csv")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
