In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV under MyDrive can be read by path.
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd

# Location of the dataset CSV on the mounted Google Drive.
file_path =  "/content/drive/MyDrive/final_data.csv"

# Load the full CSV into a DataFrame; later cells reuse both `file_path` and `data`.
data = pd.read_csv(file_path)

# Preview the first rows to check the column layout.
print(data.head())

      dim_0     dim_1     dim_2     dim_3     dim_4     dim_5     dim_6  \
0 -0.005985  0.015627  0.007076  0.006575  0.014297  0.007125 -0.000785   
1  0.021908 -0.084995 -0.011690 -0.048774 -0.052874 -0.018173  0.058139   
2  0.014475 -0.069880 -0.003023  0.022581 -0.021881  0.012437  0.103112   
3 -0.019438 -0.003835 -0.027385  0.019167 -0.001544  0.027654 -0.007453   
4  0.000870  0.018253  0.025201 -0.015667  0.002589 -0.026993 -0.012401   

      dim_7     dim_8     dim_9  ...   dim_298   dim_299  Label     chunk  \
0 -0.009658 -0.001066 -0.043356  ...  0.008799 -0.023102      1       JJP   
1 -0.065300 -0.007800 -0.011115  ...  0.021349  0.001630      0       VGF   
2 -0.104538 -0.024584  0.046685  ... -0.050404 -0.116184      0       CCP   
3 -0.041042 -0.011774 -0.011934  ...  0.024064 -0.017355      1        NP   
4 -0.034212  0.000570  0.025318  ...  0.004674  0.013638      0  NULL__NP   

   postposition  head-POS  dependency-head  dependency       srl  predicate  
0       

**DATASET**
- The dataset was retrieved from a published paper; it contains 300-dimensional word embeddings (`dim_0` … `dim_299`).
- In addition, the dataset contains columns such as `postposition`, `head-POS`, `srl`, `predicate`, etc.
- Of these, we only need the embeddings as features for the classification; the label is the `srl` column.
- So, to prepare the data for this project, we use only these attributes.

In [None]:
# Pull out the semantic-role column; it is encoded below and used as the target.
labels = data['srl']
print(labels)

0        ARG2-ATR
1               0
2               0
3        ARGM-TMP
4               0
           ...   
14541           0
14542        ARG1
14543           0
14544           0
14545           0
Name: srl, Length: 14546, dtype: object


In [None]:
from sklearn.preprocessing import LabelEncoder
def encode_labels(data):
    """Map the label values in ``data`` to integer codes.

    Parameters
    ----------
    data : array-like of shape (n_samples,)
        Raw label values.

    Returns
    -------
    tuple
        ``(codes, class_names, encoder)``: the encoded labels produced by
        ``LabelEncoder``, a list copy of the encoder's ``classes_``, and the
        fitted encoder itself (usable for ``inverse_transform``).
    """
    encoder = LabelEncoder()
    # fit_transform learns the classes and encodes the input in one call.
    codes = encoder.fit_transform(data)
    return codes, list(encoder.classes_), encoder

# Encode the labels and keep the fitted encoder (`decoder`) so predictions can be
# mapped back to label names later.
encoded_labels, uniq_labels, decoder = encode_labels(labels)
# NOTE(review): this overwrites the raw string labels inside `data`; re-running the
# earlier `labels = data['srl']` cell without reloading the CSV would then feed
# integers to the encoder — confirm this in-place overwrite is intended.
data['srl'] = encoded_labels
print(uniq_labels)
# print(len(encoded_labels))

['0', 'ARG-UNDEF', 'ARG0', 'ARG1', 'ARG2', 'ARG2-ATR', 'ARG2-GOL', 'ARG2-LOC', 'ARG2-SOU', 'ARG3', 'ARGM-ADV', 'ARGM-CAU', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-EXT', 'ARGM-LOC', 'ARGM-MNR', 'ARGM-MNS', 'ARGM-MOD', 'ARGM-NEG', 'ARGM-PRP', 'ARGM-PRX', 'ARGM-TMP']


**LABEL ENCODER**
- The labels must be encoded, so we use `LabelEncoder()` to convert them into numerical values and store the encoded data.
- `fit(data)` learns the set of unique labels from the data, `transform(data)` maps each label to its integer code, and the unique labels are then read back from `classes_`.
- From here on, the labels are stored in their encoded form instead of as raw strings.

In [None]:
# Every non-embedding column; removing them leaves only the dim_* features.
redundant_cols = [
    'Label', 'chunk', 'postposition', 'head-POS',
    'dependency-head', 'dependency', 'srl', 'predicate',
]
x = data.drop(columns=redundant_cols)

print(x.shape)

(14546, 300)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, encoded_labels, test_size=0.2, random_state=42)

print(X_train.shape,X_test.shape)

(11636, 300) (2910, 300)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline; the solver's iteration cap is set explicitly to 2000.
lr = LogisticRegression(max_iter = 2000)

# Fit on the training embeddings and their encoded SRL labels.
lr.fit(X_train, y_train)

# Predict encoded labels for the held-out rows.
y_pred = lr.predict(X_test)

print(y_pred)


[0 0 0 ... 0 0 0]


## SRL USING THE LOGISTIC REGRESSION
- Here we use logistic regression for the classification.
- We fit the model on the training data and training labels, then run it on the test data to get predictions.
- We then compare the predictions with the actual labels and report the accuracy.

- In this case we get an accuracy of about 62%. Note that class `0` accounts for 1755 of the 2910 test samples (about 60%), so this is only slightly better than always predicting `0`.

In [None]:
# Map the integer codes back to their original label strings for readable reports.
y_pred_labels = decoder.inverse_transform(y_pred)
y_test_labels = decoder.inverse_transform(y_test)

In [None]:
from sklearn.metrics import classification_report

# Several classes never appear among the predictions, which makes their precision
# undefined. zero_division=0 reports those scores as 0.0 explicitly instead of
# emitting a warning for each averaged metric.
print(classification_report(y_test_labels, y_pred_labels, zero_division=0))


              precision    recall  f1-score   support

           0       0.64      0.94      0.76      1755
        ARG0       0.51      0.16      0.25       201
        ARG1       0.57      0.28      0.37       432
        ARG2       0.00      0.00      0.00        38
    ARG2-ATR       0.29      0.03      0.06        63
    ARG2-GOL       0.00      0.00      0.00         9
    ARG2-LOC       0.00      0.00      0.00         6
    ARG2-SOU       0.00      0.00      0.00         5
        ARG3       0.00      0.00      0.00         2
    ARGM-ADV       0.00      0.00      0.00        22
    ARGM-CAU       0.50      0.08      0.13        13
    ARGM-DIR       0.00      0.00      0.00         3
    ARGM-DIS       0.00      0.00      0.00        25
    ARGM-EXT       0.00      0.00      0.00        17
    ARGM-LOC       0.33      0.04      0.07       133
    ARGM-MNR       0.67      0.02      0.04        91
    ARGM-MNS       0.00      0.00      0.00         7
    ARGM-MOD       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test_labels, y_pred_labels)

print("Accuracy when used logistic regression model",accuracy)


Accuracy when used logistic regression model 0.6257731958762887


- Instead of logistic regression (or another classical classifier), we can use a neural network for this task.

### SRL using Neural Nets

In [None]:
import pandas as pd

def encode_labels(data):
    """Encode label values as integers with a freshly fitted ``LabelEncoder``.

    Parameters
    ----------
    data : array-like of shape (n_samples,)
        Raw label values.

    Returns
    -------
    tuple
        ``(codes, class_names, encoder)``: the encoded labels, a list copy of
        the encoder's ``classes_``, and the fitted encoder.
    """
    # NOTE(review): an identical `encode_labels` is defined earlier in this
    # notebook; this definition shadows it.
    encoder = LabelEncoder()
    codes = encoder.fit_transform(data)
    class_names = list(encoder.classes_)
    return codes, class_names, encoder

# Reload the CSV so this section starts again from the raw string labels.
data = pd.read_csv(file_path)

# Replace the string SRL tags with integer codes; keep the class names and the
# fitted encoder for decoding later.
data['srl'] , classes , label_encoder = encode_labels(data['srl'])

# Features are every column except the target. The other annotation columns stay
# in for now and are dropped inside CustomDataset.
x_data = data.drop(['srl'], axis=1)
y_data = data['srl']
# Same 80/20 split and seed as the logistic-regression section.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

print("Train shape :", X_train.shape, y_train.shape)
print("Test shape :", X_test.shape, y_test.shape)



In [None]:
def concat(x_data , y_data):
  """Join features and labels side by side into one DataFrame.

  Args:
    x_data: DataFrame of feature columns.
    y_data: Series (or DataFrame) of labels to append as extra column(s).

  Returns:
    The column-wise ``pd.concat`` of ``x_data`` and ``y_data``.
  """
  pieces = [x_data, y_data]
  combined = pd.concat(pieces, axis=1)
  return combined


# Re-attach the encoded `srl` column to each split so every frame carries both
# features and labels, which is the layout CustomDataset reads.
data_train = concat(X_train , y_train)
data_test = concat(X_test , y_test)

In [None]:
import torch
import torch.utils.data as data_utils
from torch.utils.data import Dataset
import numpy as np

class CustomDataset(Dataset):
  """Torch dataset pairing each row's feature vector with its ``srl`` label.

  Features are every column of the input frame except the annotation columns
  listed in ``__init__``; labels come from the ``srl`` column. Both are stored
  as tensors built with ``torch.Tensor``.
  """

  def __init__(self , data):
    """Build the feature and label tensors.

    Args:
      data: DataFrame holding the feature columns plus all eight annotation
        columns named below (``drop`` raises if any of them is missing).
    """
    annotation_cols = [
        'Label', 'chunk', 'postposition', 'head-POS',
        'dependency-head', 'dependency', 'srl', 'predicate',
    ]
    # Strip the annotation columns; what is left is the feature matrix.
    features = data.drop(columns=annotation_cols)
    targets = data['srl']

    self.emb = torch.Tensor(np.array(features))
    # NOTE(review): torch.Tensor produces float labels; presumably they are cast
    # to integer class ids before a classification loss — confirm in the
    # training loop.
    self.label = torch.Tensor(targets.to_numpy())

  def __len__(self):
    """Return the number of samples (rows of the feature tensor)."""
    return self.emb.shape[0]

  def __getitem__(self, index):
    """Return the ``(features, label)`` pair stored at ``index``."""
    return self.emb[index], self.label[index]


In [None]:
# Wrap each split in a CustomDataset so it can be indexed as (features, label) pairs.
train_dataset = CustomDataset(data_train)
test_dataset = CustomDataset(data_test)