In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/Colab Notebooks/NLP_Project/

/content/drive/MyDrive/Colab Notebooks/NLP_Project


In [3]:
%ls

snli_1.0_dev.csv  snli_1.0_test.csv  snli_1.0_train.csv


In [4]:
import pandas as pd

In [5]:
!pip install transformers



In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
# import warnings
# warnings.filterwarnings('ignore')

Exploring the dataset:

In [7]:
train_cols = ['gold_label', 'sentence1', 'sentence2']
test_cols = ['sentence1', 'sentence2']
train_df = pd.read_csv('snli_1.0_train.csv', engine='python', usecols=train_cols)
test_df = pd.read_csv('snli_1.0_test.csv', engine='python', usecols=test_cols)

In [8]:
test_df.info()
# No missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence1  10000 non-null  object
 1   sentence2  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [9]:
train_df.info()

# There are missing values, so we need to drop those rows.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550152 entries, 0 to 550151
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   gold_label  550152 non-null  object
 1   sentence1   550152 non-null  object
 2   sentence2   550146 non-null  object
dtypes: object(3)
memory usage: 12.6+ MB


In [10]:
train_df = train_df[~train_df['sentence2'].isnull()]

In [11]:
train_df.info()

# Problem fixed! 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550146 entries, 0 to 550151
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   gold_label  550146 non-null  object
 1   sentence1   550146 non-null  object
 2   sentence2   550146 non-null  object
dtypes: object(3)
memory usage: 16.8+ MB


In [12]:
# Let's check if there are weird labels:
train_df.gold_label.value_counts()

# There are 785 rows with undifined category, for convenience, let's drop them too.

entailment       183414
contradiction    183185
neutral          182762
-                   785
Name: gold_label, dtype: int64

In [13]:
train_df = train_df[(train_df.gold_label != "-")]

In [14]:
# check:
train_df.gold_label.value_counts()

# Problem fixed! 

entailment       183414
contradiction    183185
neutral          182762
Name: gold_label, dtype: int64

In [15]:
print(train_df.shape)
print(test_df.shape)

(549361, 3)
(10000, 2)


In [16]:
train_df.head()

Unnamed: 0,gold_label,sentence1,sentence2
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,neutral,Children smiling and waving at camera,They are smiling at their parents
4,entailment,Children smiling and waving at camera,There are children present


In [17]:
# For performance reasons, we'll only use 3,000 sentences from the dataset

batch_1 = train_df[:3000]

In [18]:
batch_1.shape

(3000, 3)

In [19]:
batch_1['gold_label'].value_counts()

# We can see the distribution of the label is quite similar.

entailment       1005
contradiction    1000
neutral           995
Name: gold_label, dtype: int64

Loading the Pre-trained BERT model

In [20]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Preparing the Dataset

In [21]:
# Tokenizing the sentences:

s1_tokenized = batch_1['sentence1'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
s2_tokenized = batch_1['sentence2'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [22]:
# Padding:

# First let's find the max length of sentence1 and sentence2 columns:

# Sentence1
max_len_s1 = 0
for i in s1_tokenized.values:
    if len(i) > max_len_s1:
        max_len_s1 = len(i)

# Sentence2
max_len_s2 = 0
for i in s2_tokenized.values:
    if len(i) > max_len_s2:
        max_len_s2 = len(i)

# We will take the maximum value"
max_len = max(max_len_s1, max_len_s2)
print("Max Length:", max_len)

Max Length: 64


In [23]:
# And do the padding:

padded_s1 = np.array([i + [0]*(max_len-len(i)) for i in s1_tokenized.values])
padded_s2 = np.array([i + [0]*(max_len-len(i)) for i in s2_tokenized.values])

In [24]:
# Check:
np.array(padded_s1).shape

# 3,000 is the number of rows I chose for the batch and 52 is the max legth
# Now each sentence padded to be in length of 52.

(3000, 64)

In [25]:
# Masking
# If we directly send padded to BERT, that would slightly confuse it. 
# We need to create another variable to tell it to ignore (mask) the padding we've added when it's processing its input. 
# That's what attention_mask is:

attention_mask = np.where(padded_s1 != 0, 1, 0)
attention_mask.shape

(3000, 64)

Modeling

In [26]:
input_ids_s1 = torch.tensor(padded_s1)
input_ids_s2 = torch.tensor(padded_s2)    
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states_s1 = model(input_ids_s1, attention_mask=attention_mask)

In [27]:
with torch.no_grad():
    last_hidden_states_s2 = model(input_ids_s2, attention_mask=attention_mask)

In [28]:
print(input_ids_s1)

tensor([[ 101, 1037, 2711,  ...,    0,    0,    0],
        [ 101, 1037, 2711,  ...,    0,    0,    0],
        [ 101, 1037, 2711,  ...,    0,    0,    0],
        ...,
        [ 101, 2093, 3337,  ...,    0,    0,    0],
        [ 101, 1037, 2210,  ...,    0,    0,    0],
        [ 101, 1037, 2210,  ...,    0,    0,    0]])


In [29]:
features_s1 = last_hidden_states_s1[0][:,0,:].numpy()
features_s2 = last_hidden_states_s2[0][:,0,:].numpy()

In [30]:
features_s1.shape

(3000, 768)

In [31]:
features = np.concatenate((features_s1, features_s2), axis=1)

In [32]:
features.shape

(3000, 1536)

In [33]:
labels = batch_1.gold_label

Creating Logistic regression:

In [34]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [35]:
# Searching for the best value of the C parameter, which determines regularization strength.

# parameters = {'C': np.linspace(0.0001, 100, 20)}
# grid_search = GridSearchCV(LogisticRegression(), parameters)
# grid_search.fit(train_features, train_labels)

# print('best parameters: ', grid_search.best_params_)
# print('best scrores: ', grid_search.best_score_)

In [36]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
# Evaluating model's results:
lr_clf.score(test_features, test_labels)

0.49733333333333335