In [1]:
import pandas as pd
import numpy as np
import datasets
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
train_data_path = "./data/English dataset/train.jsonl"
test_data_path = "./data/English dataset/test.jsonl"

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing

In [2]:
def preprocess_text(text):
	# Tokenize the text into words
	words = word_tokenize(text.lower())  # Convert text to lowercase

	# Remove punctuation
	table = str.maketrans('', '', string.punctuation)
	words = [word.translate(table) for word in words if word.isalpha()]

	# Remove stopwords
	stop_words = set(stopwords.words('english'))
	words = [word for word in words if word not in stop_words]

	# Lemmatization
	lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

	# Join the words back into a string
	preprocessed_text = ' '.join(lemmatized_words)
	return preprocessed_text

In [27]:
train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
test_data = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])

label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}
train_data["label"] = train_data["label"].map(label_map)
test_data["label"] = test_data["label"].map(label_map)

train_data = train_data.drop("doc_id", axis=1)
train_data = train_data.drop("key", axis=1)
test_data = test_data.drop("doc_id", axis=1)
test_data = test_data.drop("key", axis=1)

In [51]:
longest_premise = max(train_data['premise'].apply(len).max(), test_data['premise'].apply(len).max()) # Note: irl nebi imeli max test data, ampak bi rabil truncatat
longest_hypotises = max(train_data['hypothesis'].apply(len).max(), test_data['hypothesis'].apply(len).max())
longest_sentance = max(longest_premise, longest_hypotises)
print("Longest premise: ", longest_premise)
print("Longest hypothesis: ", longest_hypotises)

print("---------------------------------")
mean = np.mean(train_data['premise'].apply(len))
std = np.std(train_data['premise'].apply(len))

print("mean: ", mean)
print("+1 std: ", mean+std)
print("+2 std: ", mean+2*std)
print("+3 std: ", mean+3*std)

Longest premise:  3098
Longest hypothesis:  162
---------------------------------
mean:  296.27826449728826
+1 std:  651.2505192635402
+2 std:  1006.2227740297922
+3 std:  1361.1950287960442


In [45]:
train_data['premise'].count()

np.int64(7191)

In [46]:
(train_data['premise'].apply(len) > 128).sum()

np.int64(4239)

# Traditional ML

In [40]:
vectorizer_premise = TfidfVectorizer()
vectorizer_hypothesis = TfidfVectorizer()

train_data_vectorised = train_data.copy()

X_premise= vectorizer_premise.fit_transform(train_data["premise"])
X_hypothesis = vectorizer_hypothesis.fit_transform(train_data["hypothesis"])
train_data_vectorised = hstack([X_premise, X_hypothesis])

Y_premise = vectorizer_premise.transform(test_data["premise"])
Y_hypothesis = vectorizer_hypothesis.transform(test_data["hypothesis"])
test_data_vectorised = hstack([Y_premise, Y_hypothesis])

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


logreg_model = LogisticRegression(max_iter=5000)

logreg_model.fit(train_data_vectorised, train_data["label"])
predictions = logreg_model.predict(test_data_vectorised)

print(classification_report(test_data["label"], predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1871
           1       0.81      0.75      0.78       220

    accuracy                           0.96      2091
   macro avg       0.89      0.86      0.88      2091
weighted avg       0.95      0.96      0.95      2091



In [30]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=67
)

rf_model.fit(train_data_vectorised, train_data["label"])

predictions = rf_model.predict(test_data_vectorised)

print(classification_report(test_data["label"], predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1871
           1       0.90      0.74      0.81       220

    accuracy                           0.96      2091
   macro avg       0.93      0.86      0.89      2091
weighted avg       0.96      0.96      0.96      2091



In [31]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svm_model = SVC(
	kernel='linear',
	C=1.0,
	random_state=67
)

svm_model.fit(train_data_vectorised, train_data["label"])
predictions = svm_model.predict(test_data_vectorised)

print(classification_report(test_data["label"], predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1871
           1       0.84      0.79      0.81       220

    accuracy                           0.96      2091
   macro avg       0.91      0.88      0.90      2091
weighted avg       0.96      0.96      0.96      2091



# Transformer-Based Classifier

In [59]:
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
	t =  tokenizer(examples["premise"], examples["hypothesis"], truncation="only_first", stride=64, return_overflowing_tokens=True, padding='max_length', max_length=128)
	t["labels"] = examples["label"]
	return t

In [69]:
dataset = Dataset.from_pandas(train_data)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 7191/7191 [00:18<00:00, 386.50 examples/s]


In [71]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
	output_dir="./artifacts",
	learning_rate=2e-5,
	per_device_train_batch_size=16,
	num_train_epochs=2,
	weight_decay=0.01,
	save_strategy="steps",
	save_steps=500, 
	save_total_limit=3
)

trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=tokenized_dataset,
)

trainer.train(resume_from_checkpoint=True)
tokenizer.save_pretrained("./trained_model_ex3")
trainer.save_model("./trained_model_ex3")



Step,Training Loss


KeyboardInterrupt: 

In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./trained_model_ex3", tokenizer="./trained_model_ex3")

Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.726703405380249}]


In [67]:
test_dataset = Dataset.from_pandas(test_data)
tokenized_dataset = test_dataset.map(preprocess_function, batched=True)

result = classifier(tokenized_dataset)
print(result)

Map: 100%|██████████| 2091/2091 [00:05<00:00, 383.00 examples/s]


ValueError: text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).

In [68]:
from transformers import Trainer

trainer = Trainer(model=model)  # no need for args for evaluation
predictions = trainer.predict(tokenized_dataset)

# Get predicted labels
preds = predictions.predictions.argmax(-1)
print(preds)



[0 0 0 ... 0 0 0]
