In [9]:
!pip install nltk==3.8.1 sentence-transformers

[0m

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, BatchNormalization, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D, Input, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.preprocessing import sequence, text
import torch
import torch.nn as nn
import torch.optim as optim
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from pathlib import Path
import random
from collections import defaultdict
from itertools import chain, groupby
from typing import Any, List, Optional, Union
import joblib
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses

In [11]:
def read_csv(name):
    """Load a CSV file into a DataFrame, discarding a saved index column if present.

    Args:
        name: Path (or any source accepted by ``pd.read_csv``) of the CSV file.

    Returns:
        The loaded DataFrame without the ``'Unnamed: 0'`` column.
    """
    df = pd.read_csv(name)
    # errors='ignore' keeps files that were written without an index column loadable
    # instead of failing with a KeyError.
    return df.drop(columns=['Unnamed: 0'], errors='ignore')

def convert_truth_values(df):
    """Collapse the fine-grained verdicts in ``truth_value`` into two classes.

    The frame is modified in place and also returned. Values that are not
    listed in the mapping are left untouched.
    """
    relabel = {
        'tom_ruling_pof': 'meter-false',
        'meter-half-true': 'meter-true',
        'meter-mostly-true': 'meter-true',
        'meter-mostly-false': 'meter-false',
    }
    for old_label, new_label in relabel.items():
        df.loc[df['truth_value'] == old_label, 'truth_value'] = new_label
    return df

def drop_na(df):
    """Return ``df`` without the rows whose ``claim`` value is missing."""
    return df.dropna(subset=['claim'])

def label_encode(df, column_name):
    """Replace the values of ``df[column_name]`` with integer codes.

    A new ``LabelEncoder`` is fitted on the column; the column is overwritten
    on ``df`` itself and the same frame is returned. The encoder is not kept.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values
    return df

stemmer = SnowballStemmer("english")
def stemm_text(text):
    """Stem every space-separated token of ``text`` with the shared ``stemmer``.

    Splitting and joining both use a single space, so runs of spaces in the
    input produce empty tokens that are passed through the stemmer as well.
    """
    stemmed_tokens = map(stemmer.stem, text.split(' '))
    return ' '.join(stemmed_tokens)

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Lemmatize every space-separated token of ``text`` with the shared ``lemmatizer``.

    Tokens are obtained by splitting on a single space and are re-joined with
    a single space after lemmatization.
    """
    lemmas = map(lemmatizer.lemmatize, text.split(' '))
    return ' '.join(lemmas)

def replace_newlines(series):
    """Normalise raw text: hyphens/newlines to spaces, punctuation removed, lower-cased.

    Steps, in order: replace ``-`` with a space, delete every character that is
    neither a word character nor whitespace, replace newlines with a space and
    lower-case the result.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series with the cleaned strings.
    """
    return (
        series.str.replace('-', ' ', regex=False)
        # regex must be requested explicitly: since pandas 2.0 the default is a
        # literal match, which would silently leave all punctuation in place.
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace('\n', ' ', regex=False)
        .str.lower()
    )

def remove_digits(series):
    """Drop every whitespace-separated token that consists only of digits.

    Tokens mixing digits with other characters (e.g. ``a1``) are kept. The
    surviving tokens are re-joined with single spaces.
    """
    def _without_numeric_tokens(sentence):
        kept = []
        for token in sentence.split():
            if token.isdigit():
                continue
            kept.append(token)
        return ' '.join(kept)

    return series.apply(_without_numeric_tokens)

def remove_stopwords(series):
    """Lower-case each text and remove English stop words from it.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series in which every string is lower-cased, stripped of the
        words in NLTK's English stop-word list, and re-joined with single
        spaces.
    """
    # A set gives constant-time membership tests instead of scanning a list per token.
    stop = frozenset(stopwords.words('english'))

    def _drop_stopwords(sentence):
        # Lower-case BEFORE the membership test so capitalised stop words
        # ("The", "Is") are removed as well.
        lowered = (token.lower() for token in sentence.split())
        return ' '.join(token for token in lowered if token not in stop)

    return series.apply(_drop_stopwords)

def _train_test_split(X, y):
    return train_test_split(
        X, y, stratify=y, random_state=42, test_size=0.4, shuffle=True)

def count_vectorize(data):
    """Fit a default ``CountVectorizer`` on ``data``.

    Returns:
        ``(matrix, vectorizer)``: the result of ``fit_transform(data)`` and the
        fitted vectorizer, so further data can be transformed with it.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(data)
    return counts, vectorizer

def tf_idf(data, max_features=None):
    """Fit a ``TfidfVectorizer`` on ``data``.

    Args:
        data: Iterable of documents.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        ``(matrix, vectorizer)``: the result of ``fit_transform(data)`` and the
        fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    return vectorizer.fit_transform(data), vectorizer

def run_ml_model(classifier, X_train, y_train, X_test, y_test, classifier_name='ML Model'):
    """Fit ``classifier``, predict the test set and print the test accuracy.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict_proba``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        classifier_name: Label used in the printed report line.

    Returns:
        None. The accuracy is only printed.
    """
    classifier.fit(X_train, y_train)
    probabilities = np.asarray(classifier.predict_proba(X_test))
    winning_columns = probabilities.argmax(axis=1)

    # predict_proba columns are ordered like classifier.classes_, so the column
    # index is only the label itself when the labels happen to be 0..k-1.
    classes = getattr(classifier, 'classes_', None)
    predictions = winning_columns if classes is None else np.asarray(classes)[winning_columns]

    accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
    print(f'{classifier_name} accuracy - {accuracy}')

def set_torch_seed(val=42):
    """Seed PyTorch's global random number generator with ``val``."""
    torch.random.manual_seed(val)

def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

def tokenize_and_generate_sequences(X_train, y_train, X_test, y_test,
                                    num_words=1000, maxlen=60, device=None):
    """Turn text splits into padded token-id tensors and labels into float tensors.

    The tokenizer is fitted on the training texts only and then applied to
    both splits.

    Args:
        X_train, X_test: Iterables of texts.
        y_train, y_test: Label containers exposing ``.values`` (e.g. pandas Series).
        num_words: Vocabulary size passed to the tokenizer.
        maxlen: Length every sequence is padded/truncated to.
        device: Target ``torch.device``. Defaults to CUDA when available,
            otherwise CPU, so the function no longer depends on a notebook
            global being defined first.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as tensors on ``device``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fit on the training texts only so no test vocabulary leaks into the tokenizer.
    tokenizer = text.Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(X_train)

    def _to_padded_tensor(texts):
        token_ids = tokenizer.texts_to_sequences(texts)
        return torch.tensor(sequence.pad_sequences(token_ids, maxlen=maxlen)).to(device)

    def _to_label_tensor(labels):
        return torch.tensor(labels.values).float().to(device)

    return (
        _to_padded_tensor(X_train),
        _to_label_tensor(y_train),
        _to_padded_tensor(X_test),
        _to_label_tensor(y_test),
    )


In [12]:
# Load the raw claims and run the cleaning steps in order; every step rebinds `df`.
df = read_csv('/kaggle/input/ru-ukr-onlyv4/ru-ukr-onlyv4.csv')
# Merge the fine-grained verdicts into 'meter-true' / 'meter-false'.
df = convert_truth_values(df)
# Rows without a claim text cannot be used.
df = drop_na(df)
# Replace the string labels in 'truth_value' with integer codes.
df = label_encode(df, 'truth_value')
# Text normalisation: punctuation/hyphens/newlines, then numeric tokens, then stop words.
df['claim'] = replace_newlines(df['claim'])
df['claim'] = remove_digits(df['claim'])
df['claim'] = remove_stopwords(df['claim'])
df.head()



Unnamed: 0,claim,truth_value
0,provocation disinformation overview,0
1,ukraine theft homicide levels rose due power o...,0
2,ukrainians beat two berlin residents speaking ...,0
3,quote paul goebbels banderites,0
4,culture good neighborliness course ukrainian s...,0


In [13]:
# Stratified 60/40 split of the cleaned claims and their encoded labels.
x_train, x_test, y_train, y_test = _train_test_split(df['claim'], df['truth_value'])
# Give all four splits a fresh 0..n-1 index so features and labels stay aligned.
x_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
x_test.reset_index(drop=True, inplace=True)
# The original reset x_test twice and never reset y_test.
y_test.reset_index(drop=True, inplace=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((1609,), (1609,), (1074,), (1074,))

In [14]:
# Bag-of-words features: fit the vocabulary on the training claims only,
# then reuse the fitted vectorizer to transform the test claims.
x_train_counts, count_vectorizer = count_vectorize(x_train)
x_test_counts = count_vectorizer.transform(x_test)
x_train_counts.shape, x_test_counts.shape

((1609, 4501), (1074, 4501))

In [15]:
# TF-IDF features: fit on the training claims only (no max_features cap),
# then reuse the fitted vectorizer to transform the test claims.
x_train_tfidf, tfidf_vectorizer = tf_idf(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)
x_train_tfidf.shape, x_test_tfidf.shape

((1609, 4501), (1074, 4501))

In [16]:
# Longest training claim, measured in whitespace-separated tokens.
# Used as a guide when choosing the padding length for the token sequences.
x_train.apply(lambda x: len(str(x).split())).max()

49

# Machine Learning methods

In [17]:
# Fit every classical baseline on both sparse representations and print its test accuracy.
# Estimators come from zero-argument factories so each run starts from an unfitted model.
baseline_factories = [
    ('LogisticRegression', lambda: LogisticRegression(C=1.0, multi_class='ovr', solver='liblinear')),
    ('MultinomialNB', lambda: MultinomialNB()),
    ('SVC', lambda: SVC(C=1.0, probability=True)),
    ('RandomForestClassifier', lambda: RandomForestClassifier()),
]
feature_sets = [
    ('Bag of words', x_train_counts, x_test_counts),
    ('TF IDF', x_train_tfidf, x_test_tfidf),
]
for model_name, make_model in baseline_factories:
    for feature_name, train_matrix, test_matrix in feature_sets:
        run_ml_model(make_model(), train_matrix, y_train, test_matrix, y_test,
                     f'{model_name} with {feature_name}')

LogisticRegression with Bag of words accuracy - 0.8780260707635009
LogisticRegression with TF IDF accuracy - 0.8696461824953445
MultinomialNB with Bag of words accuracy - 0.8770949720670391
MultinomialNB with TF IDF accuracy - 0.8659217877094972
SVC with Bag of words accuracy - 0.8864059590316573
SVC with TF IDF accuracy - 0.88268156424581
RandomForestClassifier with Bag of words accuracy - 0.8594040968342644
RandomForestClassifier with TF IDF accuracy - 0.8361266294227188


# Deep Learning Models

## Architecture 1
For inputs given as tokenized sentences (padded sequences of integer token ids).

In [18]:
class BiLSTM1(nn.Module):
    """Two stacked bidirectional LSTMs over learned token embeddings with a sigmoid output.

    Args:
        num_words: Number of rows in the embedding table.
        embed_size: Dimension of each token embedding.
        hidden_size: Hidden size of each LSTM direction.
        output_size: Number of output units.
        dropout_rate: Dropout probability applied to the embeddings.
    """

    def __init__(self, num_words, embed_size, hidden_size, output_size, dropout_rate):
        super().__init__()
        # A bidirectional LSTM concatenates the forward and backward states.
        bidirectional_width = 2 * hidden_size
        self.embedding = nn.Embedding(num_words, embed_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.bilstm1 = nn.LSTM(embed_size, hidden_size, batch_first=True, bidirectional=True)
        self.bilstm2 = nn.LSTM(bidirectional_width, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(bidirectional_width, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token-id sequences to sigmoid outputs of shape (batch, output_size)."""
        embedded = self.dropout(self.embedding(x))
        first_pass, _ = self.bilstm1(embedded)
        second_pass, _ = self.bilstm2(first_pass)
        # Only the output at the final time step feeds the classifier head.
        final_step = second_pass[:, -1, :]
        return self.sigmoid(self.fc(final_step))

## Architecture 2
Can be used with TF-IDF or bag-of-words representations

In [19]:
class BiLSTM2(nn.Module):
    """Two stacked bidirectional LSTMs over dense feature vectors with a sigmoid output.

    Args:
        input_size: Number of features per sample (per time step).
        hidden_size: Hidden size of each LSTM direction.
        output_size: Number of output units.
        dropout_rate: Dropout probability applied between the two LSTMs.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.bilstm1 = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.bilstm2 = nn.LSTM(hidden_size * 2, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (batch, output_size).

        ``x`` may be ``(batch, features)`` or ``(batch, seq_len, features)``.
        """
        if x.dim() == 2:
            # nn.LSTM reads a 2-D tensor as ONE unbatched sequence, which would
            # make every sample a time step and leak information between samples.
            # Treat each row as its own length-1 sequence instead.
            x = x.unsqueeze(1)
        x, _ = self.bilstm1(x)
        x = self.dropout(x)
        x, _ = self.bilstm2(x)
        x = self.fc(x[:, -1, :])
        return self.sigmoid(x)

In [20]:
def train_test(model, criterion, optimizer, epochs, x_train, y_train, x_test, y_test, convert_to='long'):
    """Train ``model`` full-batch for ``epochs`` steps, then score it on the test split.

    Args:
        model: ``nn.Module`` producing one probability per sample.
        criterion: Loss taking ``(predictions, targets)``.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of full-batch updates.
        x_train, x_test: Input tensors.
        y_train, y_test: Label tensors (converted to float).
        convert_to: ``'long'`` or ``'float'`` casts the inputs accordingly;
            any other value leaves their dtype unchanged.

    Returns:
        ``(train_accuracy, test_accuracy)``. ``train_accuracy`` is the mean of
        the per-epoch training accuracies (0.0 when ``epochs`` is 0).
    """
    # Use the device the model lives on instead of a notebook-global `device`,
    # so every tensor is guaranteed to sit next to the weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    def _prepare_inputs(tensor):
        if convert_to == 'long':
            tensor = tensor.long()
        elif convert_to == 'float':
            tensor = tensor.float()
        return tensor.to(device)

    def _predict_labels(inputs):
        with torch.no_grad():
            probabilities = model(inputs)
        # reshape(-1) rather than squeeze(): keeps a 1-D result for a single sample.
        return (probabilities > 0.5).int().reshape(-1).cpu().numpy()

    x_train, x_test = _prepare_inputs(x_train), _prepare_inputs(x_test)
    y_train = y_train.float().to(device)
    y_test = y_test.float().to(device)

    model.train()
    train_accuracy = 0.0
    for _ in range(epochs):
        optimizer.zero_grad()
        outputs = model(x_train)
        loss = criterion(outputs.reshape(-1), y_train)
        loss.backward()
        optimizer.step()

        # NOTE: scored while the model is still in train mode (dropout active),
        # exactly as before, so the reported numbers stay comparable.
        train_accuracy += metrics.accuracy_score(y_train.cpu(), _predict_labels(x_train))

    if epochs > 0:  # guard: the unconditional division crashed for epochs == 0
        train_accuracy /= epochs

    # Evaluate the model
    model.eval()
    test_accuracy = metrics.accuracy_score(y_test.cpu(), _predict_labels(x_test))
    print(f"Total Epochs: {epochs}, Train Accuracy: {train_accuracy} Test Accuracy: {test_accuracy}")
    return train_accuracy, test_accuracy

In [21]:
# Seed PyTorch (default seed 42) and pick the compute device (CUDA if available, else CPU).
# `device` is a notebook global read by the tokenisation and training helpers.
set_torch_seed()
device = get_device()

In [22]:
# Convert the cleaned claims into padded token-id tensors and the labels into float tensors.
x_train_tk, y_train_tk, x_test_tk, y_test_tk = tokenize_and_generate_sequences(x_train, y_train, x_test, y_test)
x_train_tk.shape, y_train_tk.shape, x_test_tk.shape, y_test_tk.shape

(torch.Size([1609, 60]),
 torch.Size([1609]),
 torch.Size([1074, 60]),
 torch.Size([1074]))

In [24]:
# Epoch sweep on the raw tokenised claims: for every budget e in 1..te a brand-new
# model is trained from scratch for e epochs, so each printed line is an independent run.
te = 25
for e in range(1, te+1):
    # Fresh model, loss and optimizer on every pass.
    model = BiLSTM1(num_words=1000, embed_size=60, hidden_size=64, output_size=1, dropout_rate=0.2).to(device)
    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    train_test(model, criterion, optimizer, e, x_train_tk, y_train_tk, x_test_tk, y_test_tk)

Total Epochs: 1, Train Accuracy: 0.6432566811684276 Test Accuracy: 0.6787709497206704
Total Epochs: 2, Train Accuracy: 0.6190180236171535 Test Accuracy: 0.6024208566108007
Total Epochs: 3, Train Accuracy: 0.6668738346799254 Test Accuracy: 0.6824953445065177
Total Epochs: 4, Train Accuracy: 0.6884711000621504 Test Accuracy: 0.7318435754189944
Total Epochs: 5, Train Accuracy: 0.6899937849596023 Test Accuracy: 0.7150837988826816
Total Epochs: 6, Train Accuracy: 0.6935985083903046 Test Accuracy: 0.7150837988826816
Total Epochs: 7, Train Accuracy: 0.6969723874633756 Test Accuracy: 0.7579143389199255
Total Epochs: 8, Train Accuracy: 0.7277812305779987 Test Accuracy: 0.7364990689013036
Total Epochs: 9, Train Accuracy: 0.7159035978178303 Test Accuracy: 0.74487895716946
Total Epochs: 10, Train Accuracy: 0.7625233064014916 Test Accuracy: 0.7905027932960894
Total Epochs: 11, Train Accuracy: 0.7562574156732018 Test Accuracy: 0.792364990689013
Total Epochs: 12, Train Accuracy: 0.7737207375181271 Te

In [25]:
# Stemming treats every claim independently, so each split is stemmed directly.
# Concatenating the splits and cutting the result into train-sized chunks would
# mis-split whenever the test split is longer than the training split.
x_train_stemmed = x_train.apply(stemm_text)
x_test_stemmed = x_test.apply(stemm_text)
x_train_stemmed.shape, x_test_stemmed.shape

((1609,), (1074,))

In [26]:
# Tokenise and pad the stemmed claims. y_train_tk / y_test_tk are rebound here
# from the same y_train / y_test label series as before.
x_train_stemmed_tk, y_train_tk, x_test_stemmed_tk, y_test_tk = tokenize_and_generate_sequences(x_train_stemmed, y_train, x_test_stemmed, y_test)
x_train_stemmed_tk.shape, y_train_tk.shape, x_test_stemmed_tk.shape, y_test_tk.shape

(torch.Size([1609, 60]),
 torch.Size([1609]),
 torch.Size([1074, 60]),
 torch.Size([1074]))

In [27]:
# Epoch sweep on the stemmed claims: for every budget e in 1..te a brand-new
# model is trained from scratch for e epochs, so each printed line is an independent run.
te = 25
for e in range(1, te+1):
    # Fresh model, loss and optimizer on every pass.
    model = BiLSTM1(num_words=1000, embed_size=60, hidden_size=64, output_size=1, dropout_rate=0.2).to(device)
    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    train_test(model, criterion, optimizer, e, x_train_stemmed_tk, y_train_tk, x_test_stemmed_tk, y_test_tk)

Total Epochs: 1, Train Accuracy: 0.6687383467992541 Test Accuracy: 0.6880819366852886
Total Epochs: 2, Train Accuracy: 0.6733996270975762 Test Accuracy: 0.6312849162011173
Total Epochs: 3, Train Accuracy: 0.6728817070644292 Test Accuracy: 0.7104283054003724
Total Epochs: 4, Train Accuracy: 0.6452765692977005 Test Accuracy: 0.6266294227188082
Total Epochs: 5, Train Accuracy: 0.7006836544437538 Test Accuracy: 0.7383612662942272
Total Epochs: 6, Train Accuracy: 0.725088046405635 Test Accuracy: 0.7216014897579144
Total Epochs: 7, Train Accuracy: 0.7534404687916185 Test Accuracy: 0.7597765363128491
Total Epochs: 8, Train Accuracy: 0.7417650714729646 Test Accuracy: 0.7588454376163873
Total Epochs: 9, Train Accuracy: 0.735377391064153 Test Accuracy: 0.7420856610800745
Total Epochs: 10, Train Accuracy: 0.7438781852082039 Test Accuracy: 0.7625698324022346
Total Epochs: 11, Train Accuracy: 0.7506073789479631 Test Accuracy: 0.7802607076350093
Total Epochs: 12, Train Accuracy: 0.7544541122850633 T

In [28]:
# Requires the NLTK WordNet corpus: run nltk.download('wordnet') once if it is missing.
# Lemmatisation treats every claim independently, so each split is processed directly.
# Concatenating the splits and cutting the result into train-sized chunks would
# mis-split whenever the test split is longer than the training split.
x_train_lemmatize = x_train.apply(lemmatize_text)
x_test_lemmatize = x_test.apply(lemmatize_text)
x_train_lemmatize.shape, x_test_lemmatize.shape

((1609,), (1074,))

In [29]:
# Tokenise and pad the lemmatised claims. y_train_tk / y_test_tk are rebound here
# from the same y_train / y_test label series as before.
x_train_lemmatize_tk, y_train_tk, x_test_lemmatize_tk, y_test_tk = tokenize_and_generate_sequences(x_train_lemmatize, y_train, x_test_lemmatize, y_test)
x_train_lemmatize_tk.shape, y_train_tk.shape, x_test_lemmatize_tk.shape, y_test_tk.shape

(torch.Size([1609, 60]),
 torch.Size([1609]),
 torch.Size([1074, 60]),
 torch.Size([1074]))

In [30]:
# Epoch sweep on the lemmatised claims, here up to 50 epochs: for every budget e in 1..te
# a brand-new model is trained from scratch for e epochs, so each printed line is an independent run.
te = 50
for e in range(1, te+1):
    # Fresh model, loss and optimizer on every pass.
    model = BiLSTM1(num_words=1000, embed_size=60, hidden_size=64, output_size=1, dropout_rate=0.2).to(device)
    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    # 'long' is also train_test's default for convert_to; it is only spelled out here.
    train_test(model, criterion, optimizer, e, x_train_lemmatize_tk, y_train_tk, x_test_lemmatize_tk, y_test_tk, 'long')

Total Epochs: 1, Train Accuracy: 0.6556867619639528 Test Accuracy: 0.6834264432029795
Total Epochs: 2, Train Accuracy: 0.6793039154754505 Test Accuracy: 0.6517690875232774
Total Epochs: 3, Train Accuracy: 0.62958359229335 Test Accuracy: 0.6722532588454376
Total Epochs: 4, Train Accuracy: 0.699658172778123 Test Accuracy: 0.6973929236499069
Total Epochs: 5, Train Accuracy: 0.6944686140459913 Test Accuracy: 0.7188081936685289
Total Epochs: 6, Train Accuracy: 0.722084110213383 Test Accuracy: 0.7337057728119181
Total Epochs: 7, Train Accuracy: 0.7164165852792329 Test Accuracy: 0.7607076350093109
Total Epochs: 8, Train Accuracy: 0.7160503418272218 Test Accuracy: 0.7700186219739292
Total Epochs: 9, Train Accuracy: 0.7400041433602652 Test Accuracy: 0.7625698324022346
Total Epochs: 10, Train Accuracy: 0.7456183965195774 Test Accuracy: 0.7830540037243948
Total Epochs: 11, Train Accuracy: 0.7797615684501948 Test Accuracy: 0.813780260707635
Total Epochs: 12, Train Accuracy: 0.7784856018230785 Test

## Architecture 3
### SetFitClassifier transfer learning and few shot prediction

In [None]:
from pathlib import Path
import random
from collections import defaultdict
from itertools import chain, groupby
from typing import Any, List, Optional, Union

import joblib
import numpy as np
import torch
from sentence_transformers import InputExample, SentenceTransformer, losses
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader

StrOrPath = Union[Path, str]


def generate_sentence_pair_batch(
    sentences: List[str], labels: List[float]
) -> "List[InputExample]":
    """Build one positive and one negative training pair for every sentence.

    For each sentence, a positive pair (label ``1.0``) couples it with a random
    sentence of the same label and a negative pair (label ``0.0``) couples it
    with a random sentence of any other label.

    Args:
        sentences: Input texts.
        labels: Label of each text, aligned with ``sentences``.

    Returns:
        A list with two ``InputExample`` objects per input sentence, in input
        order (positive pair first). Empty input yields an empty list.

    Raises:
        IndexError: If the data contains only a single label, because no
            negative partner can be drawn.
    """
    # Group the sentences by label, keeping their input order.
    sent_lookup = defaultdict(list)
    for sentence, label in zip(sentences, labels):
        sent_lookup[label].append(sentence)

    # A partner different from the sentence itself only exists when the label has
    # at least two DISTINCT texts. Checking just the example count made the
    # re-draw loop below spin forever on labels whose examples are all identical.
    can_avoid_self = {
        label: len(set(group)) > 1 for label, group in sent_lookup.items()
    }

    # For every label, all sentences that carry a different label.
    neg_lookup = {
        current_label: [
            sentence
            for label, group in sent_lookup.items()
            if label != current_label
            for sentence in group
        ]
        for current_label in sent_lookup
    }

    pairs = []
    for current_sentence, current_label in zip(sentences, labels):
        positive_pair = random.choice(sent_lookup[current_label])
        if can_avoid_self[current_label]:
            # Pairing a sentence with itself carries no signal, so re-draw
            # until the partner differs; this is guaranteed to terminate here.
            while positive_pair == current_sentence:
                positive_pair = random.choice(sent_lookup[current_label])

        negative_pair = random.choice(neg_lookup[current_label])
        pairs.append(InputExample(texts=[current_sentence, positive_pair], label=1.0))
        pairs.append(InputExample(texts=[current_sentence, negative_pair], label=0.0))

    return pairs


def generate_multiple_sentence_pairs(
    sentences: List[str], labels: List[float], iter: int = 1
):
    """Concatenate ``iter`` independently generated batches of sentence pairs.

    Each round calls ``generate_sentence_pair_batch(sentences, labels)`` once;
    the rounds are appended in order into a single list.
    """
    rounds = (generate_sentence_pair_batch(sentences, labels) for _ in range(iter))
    return list(chain.from_iterable(rounds))


class SetFitClassifier(BaseEstimator, ClassifierMixin):
    """Few-shot text classifier: a fine-tuned sentence encoder plus a classifier head.

    ``fit`` fine-tunes a ``SentenceTransformer`` on generated sentence pairs and
    then fits the classifier head on the resulting sentence embeddings.

    Args:
        model: Name or path passed to ``SentenceTransformer``.
        classifier_head: Zero-argument callable (e.g. an estimator class) that
            builds the head; ``LogisticRegression()`` is used when omitted.
        loss: Callable building the training loss from the sentence encoder.
        random_state: Seed applied to ``random``, NumPy and PyTorch on
            construction and to the shuffling of the training pairs.
    """

    def __init__(
        self,
        model: str,
        classifier_head: Optional[Any] = None,
        loss=losses.CosineSimilarityLoss,
        random_state: int = 1234,
    ):
        random.seed(random_state)
        np.random.seed(random_state)
        torch.manual_seed(random_state)
        self.random_state = random_state
        self.model = SentenceTransformer(model)
        if classifier_head is None:
            self.classifier_head = LogisticRegression()
        else:
            self.classifier_head = classifier_head()
        self.loss = loss(self.model)
        self.fitted = False

    @staticmethod
    def _as_sentence_list(X):
        """Return ``X`` as a plain list (strings and lists pass through unchanged).

        Containers such as a pandas Series are indexed by label rather than by
        position, so they are materialised before being handed to the encoder.
        """
        if isinstance(X, (str, list)):
            return X
        return list(X)

    def _check_fitted(self, action: str) -> None:
        """Raise ``NotFittedError`` unless ``fit`` (or ``load``) has completed."""
        if not self.fitted:
            raise NotFittedError(
                "This SetFitClassifier instance is not fitted yet."
                f" Call 'fit' with appropriate arguments before {action} this estimator."
            )

    def fit(
        self,
        X,
        y,
        data_iter: int = 5,
        train_iter: int = 1,
        batch_size: int = 16,
        warmup_steps: int = 10,
        show_progress_bar: bool = True,
    ):
        """Fine-tune the sentence encoder on generated pairs, then fit the head.

        Args:
            X: Training texts.
            y: Labels aligned with ``X``.
            data_iter: Number of pair-generation rounds over the data.
            train_iter: Number of epochs for the encoder fine-tuning.
            batch_size: Batch size of the pair DataLoader.
            warmup_steps: Warm-up steps for the encoder fine-tuning.
            show_progress_bar: Forwarded to the encoder's ``fit``.

        Returns:
            ``self``, so calls can be chained.
        """
        sentences = self._as_sentence_list(X)
        labels = list(y)
        train_examples = generate_multiple_sentence_pairs(sentences, labels, data_iter)
        train_dataloader = DataLoader(
            train_examples,
            shuffle=True,
            batch_size=batch_size,
            # The shuffling permutation is drawn on the CPU, so the generator must
            # be a CPU generator even when the model sits on a GPU; seeding it
            # makes the batch order follow random_state.
            generator=torch.Generator().manual_seed(self.random_state),
        )
        self.model.fit(
            train_objectives=[(train_dataloader, self.loss)],
            epochs=train_iter,
            warmup_steps=warmup_steps,
            show_progress_bar=show_progress_bar,
        )

        embeddings = self.model.encode(sentences)
        self.classifier_head.fit(embeddings, labels)
        self.fitted = True
        return self

    def predict(self, X, y=None):
        """Return the head's predicted labels for the texts in ``X``.

        Raises:
            NotFittedError: If the estimator has not been fitted.
        """
        self._check_fitted("using")
        X_embed = self.model.encode(self._as_sentence_list(X))
        return self.classifier_head.predict(X_embed)

    def predict_proba(self, X, y=None):
        """Return the head's class probabilities for the texts in ``X``.

        Raises:
            NotFittedError: If the estimator has not been fitted.
        """
        self._check_fitted("using")
        X_embed = self.model.encode(self._as_sentence_list(X))
        return self.classifier_head.predict_proba(X_embed)

    def save(
        self,
        path: StrOrPath,
        model_name: Optional[str] = None,
        create_model_card: bool = False,
    ):
        """Save the encoder under ``path`` and the head as ``classifier.pkl`` inside it.

        Raises:
            NotFittedError: If the estimator has not been fitted.
        """
        self._check_fitted("saving")
        self.model.save(str(path), model_name, create_model_card)
        joblib.dump(self.classifier_head, Path(path) / "classifier.pkl")

    @classmethod
    def load(cls, path: StrOrPath):
        """Rebuild a fitted classifier from a directory written by ``save``."""
        # `cls` (not the hard-coded class name) so subclasses load as themselves.
        setfit = cls(str(path))
        # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
        # only load directories from a trusted source.
        setfit.classifier_head = joblib.load(Path(path) / "classifier.pkl")
        setfit.fitted = True
        return setfit