### Libraries

%pip install gensim nltk pandas scikit-learn torch rank_bm25

Code based on https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html

In [1]:
import json
import pandas as pd
import numpy as np
import multiprocessing
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

import nltk
from nltk.tokenize import word_tokenize

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec
import collections

from rank_bm25 import BM25Okapi

# 1. Preprocess the data

In [2]:
# Load the training claims and the development claims from their JSON files
with open('../data/train-claims.json', 'r') as train_file:
    claims = json.load(train_file)

with open('../data/dev-claims.json', 'r') as dev_file:
    dev_claims = json.load(dev_file)

In [3]:
# Lower-case the 'claim_text' field of every claim, in both the train and the dev set
for claim_set in (claims, dev_claims):
    for claim_info in claim_set.values():
        claim_info['claim_text'] = claim_info['claim_text'].lower()

In [4]:
# Load the evidence passages (evidence id -> text) from disk
with open('../data/evidence.json', 'r') as evidence_file:
    evidences = json.load(evidence_file)

In [5]:
# Lower-case every evidence text, keeping the same keys
evidences = {evidence_id: text.lower() for evidence_id, text in evidences.items()}

In [6]:
# Report how many items each collection holds
print(f"Number of claims for training = {len(claims)}")
print(f"Number of claims for development = {len(dev_claims)}")
print(f"Number of evidences = {len(evidences)}")

Number of claims for training = 1228
Number of claims for development = 154
Number of evidences = 1208827


## Prepare the corpus

### Second approach

In [7]:
# Build one training document per (claim, evidence) pair.
# Key: "<claim id> - <evidence id>"; value: (claim text + " " + evidence text, claim label).
corpus = {}

# The loop variable is deliberately not called `id`: at notebook top level that name
# would shadow the builtin `id()` for every later cell.
for claim_id, claim in claims.items():
    for evidence_id in claim['evidences']:
        text = claim['claim_text'] + " " + evidences[evidence_id]
        corpus[claim_id + ' - ' + evidence_id] = (text.strip(), claim['claim_label'])

### First approach

In [8]:
# Train the model in claims and evidences
# Collect all texts from claims
#corpus = {}
#for id, claim in claims.items():
#    corpus[id] = str.strip(claim['claim_text'])  # Add claim text
#for id, evidence in evidences.items():
#    corpus[id] = str.strip(evidence) # Add evidence text

In [9]:
def tokenize_text(df, column):
    """Add a 'tokens' column holding the alphanumeric word tokens of `column`.

    Every value of ``df[column]`` is split with ``word_tokenize`` and only the
    tokens for which ``str.isalnum()`` is true are kept. The frame is modified
    in place and is also returned.
    """
    def _alnum_tokens(text):
        kept = []
        for token in word_tokenize(text):
            if token.isalnum():
                kept.append(token)
        return kept

    df['tokens'] = df[column].apply(_alnum_tokens)
    return df

In [10]:
def process_row(row, index):
    """Wrap a row's 'tokens' in a TaggedDocument whose single tag is ``row[index]``."""
    words = row['tokens']
    tag = row[index]
    return TaggedDocument(words=words, tags=[tag])

In [11]:
# Turn the corpus dict into a DataFrame: one row per "<claim> - <evidence>" key,
# with the (text, label) tuple split into the 'text' and 'label' columns
df = pd.DataFrame.from_dict(corpus, orient='index', columns=['text','label'])
# Adds the 'tokens' column (alphanumeric word tokens of 'text')
df = tokenize_text(df,'text')
# Each document is tagged with its claim label (not with a per-document id)
df['tagged'] = df.apply(lambda row: process_row(row, 'label'), axis=1)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 4122 entries, claim-1937 - evidence-442946 to claim-3093 - evidence-883158
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4122 non-null   object
 1   label   4122 non-null   object
 2   tokens  4122 non-null   object
 3   tagged  4122 non-null   object
dtypes: object(4)
memory usage: 161.0+ KB


Unnamed: 0,text,label,tokens,tagged
claim-1937 - evidence-442946,not only is there no scientific evidence that ...,DISPUTED,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-1937 - evidence-1194317,not only is there no scientific evidence that ...,DISPUTED,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-1937 - evidence-12171,not only is there no scientific evidence that ...,DISPUTED,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-126 - evidence-338219,el niño drove record highs in global temperatu...,REFUTES,"[el, niño, drove, record, highs, in, global, t...","([el, niño, drove, record, highs, in, global, ..."
claim-126 - evidence-1127398,el niño drove record highs in global temperatu...,REFUTES,"[el, niño, drove, record, highs, in, global, t...","([el, niño, drove, record, highs, in, global, ..."


In [12]:
# Keep only the tagged documents and release the intermediate frame
train_corpus = df.tagged.values
del df
# Show a small sample only (as the dev cell does); echoing the full array floods the output
train_corpus[0:5]

array([TaggedDocument(words=['not', 'only', 'is', 'there', 'no', 'scientific', 'evidence', 'that', 'co2', 'is', 'a', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'more', 'plant', 'and', 'animal', 'life', 'at', 'very', 'high', 'concentrations', '100', 'times', 'atmospheric', 'concentration', 'or', 'greater', 'carbon', 'dioxide', 'can', 'be', 'toxic', 'to', 'animal', 'life', 'so', 'raising', 'the', 'concentration', 'to', 'ppm', '1', 'or', 'higher', 'for', 'several', 'hours', 'will', 'eliminate', 'pests', 'such', 'as', 'whiteflies', 'and', 'spider', 'mites', 'in', 'a', 'greenhouse'], tags=['DISPUTED']),
       TaggedDocument(words=['not', 'only', 'is', 'there', 'no', 'scientific', 'evidence', 'that', 'co2', 'is', 'a', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'more', 'plant', 'and', 'animal', 'life', 'plants', 'can', 'grow', 'as', 'much', 'as', '50', 'percent', 'faster', 'in', 'concentration

In [13]:
# Build one development document per (claim, evidence) pair.
# Key: "<claim id> - <evidence id>"; value: (claim text + " " + evidence text, claim label).
corpus = {}

# The loop variable is deliberately not called `id`: at notebook top level that name
# would shadow the builtin `id()` for every later cell.
for claim_id, claim in dev_claims.items():
    for evidence_id in claim['evidences']:
        text = claim['claim_text'] + " " + evidences[evidence_id]
        corpus[claim_id + ' - ' + evidence_id] = (text.strip(), claim['claim_label'])

In [14]:
# Turn the dev corpus dict into a DataFrame: one row per "<claim> - <evidence>" key,
# with the (text, label) tuple split into the 'text' and 'label' columns
dev_df = pd.DataFrame.from_dict(corpus, orient='index', columns=['text','label'])
# Adds the 'tokens' column (alphanumeric word tokens of 'text')
dev_df = tokenize_text(dev_df,'text')
# Each document is tagged with its claim label (not with a per-document id)
dev_df['tagged'] = dev_df.apply(lambda row: process_row(row, 'label'), axis=1)
dev_df.info()
dev_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 491 entries, claim-752 - evidence-67732 to claim-1021 - evidence-1175280
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    491 non-null    object
 1   label   491 non-null    object
 2   tokens  491 non-null    object
 3   tagged  491 non-null    object
dtypes: object(4)
memory usage: 19.2+ KB


Unnamed: 0,text,label,tokens,tagged
claim-752 - evidence-67732,[south australia] has the most expensive elect...,SUPPORTS,"[south, australia, has, the, most, expensive, ...","([south, australia, has, the, most, expensive,..."
claim-752 - evidence-572512,[south australia] has the most expensive elect...,SUPPORTS,"[south, australia, has, the, most, expensive, ...","([south, australia, has, the, most, expensive,..."
claim-375 - evidence-996421,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[when, 3, per, cent, of, total, annual, global...","([when, 3, per, cent, of, total, annual, globa..."
claim-375 - evidence-1080858,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[when, 3, per, cent, of, total, annual, global...","([when, 3, per, cent, of, total, annual, globa..."
claim-375 - evidence-208053,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[when, 3, per, cent, of, total, annual, global...","([when, 3, per, cent, of, total, annual, globa..."


In [15]:
# Keep only the tagged dev documents, drop the frame, and preview the first five
dev_corpus = dev_df['tagged'].values
del dev_df
dev_corpus[:5]

array([TaggedDocument(words=['south', 'australia', 'has', 'the', 'most', 'expensive', 'electricity', 'in', 'the', 'world', 'citation', 'needed', 'south', 'australia', 'has', 'the', 'highest', 'retail', 'price', 'for', 'electricity', 'in', 'the', 'country'], tags=['SUPPORTS']),
       TaggedDocument(words=['south', 'australia', 'has', 'the', 'most', 'expensive', 'electricity', 'in', 'the', 'world', 'south', 'australia', 'has', 'the', 'highest', 'power', 'prices', 'in', 'the', 'world'], tags=['SUPPORTS']),
       TaggedDocument(words=['when', '3', 'per', 'cent', 'of', 'total', 'annual', 'global', 'emissions', 'of', 'carbon', 'dioxide', 'are', 'from', 'humans', 'and', 'australia', 'per', 'cent', 'of', 'this', '3', 'per', 'cent', 'then', 'no', 'amount', 'of', 'emissions', 'here', 'will', 'have', 'any', 'effect', 'on', 'global', 'climate', 'the', '2011', 'unep', 'green', 'economy', 'report', 'states', 'that', 'a', 'agricultural', 'operations', 'excluding', 'land', 'use', 'changes', 'produce

## Train Model
https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [16]:
# CPU count from multiprocessing.cpu_count(); passed later as Doc2Vec `workers`
# and as LogisticRegression `n_jobs`
cores = multiprocessing.cpu_count()
cores

20

In [17]:
# Configure the Doc2Vec model; the vocabulary is built and training is run in later cells
# NOTE(review): no `seed` is passed and `workers=cores` uses several workers, so results are
# presumably not reproducible from run to run — confirm whether that matters here
model = Doc2Vec(dm=1, vector_size=50, window=5, min_count=1, workers=cores, epochs=50)

In [18]:
# Build the model vocabulary from the tagged training documents
model.build_vocab(train_corpus)

In [19]:
from datetime import datetime

class EpochLogger(CallbackAny2Vec):
    """Training callback that prints the duration of every epoch."""

    def __init__(self):
        # Counter of finished epochs, and the timestamp of the last signal
        # (object creation, or the end of the previous epoch)
        self.epoch = 0
        self.last_signal = datetime.now()

    def on_epoch_end(self, model):
        """Print the epoch number with the time since the last signal, then reset the timer."""
        elapsed = datetime.now() - self.last_signal
        print(f"Epoch #{self.epoch} - Duration: {elapsed}")
        self.epoch = self.epoch + 1
        self.last_signal = datetime.now()

In [20]:
# Train for model.epochs epochs over the tagged training documents;
# the callback prints the duration of each epoch
epoch_logger = EpochLogger()
model.train(train_corpus, total_examples=model.corpus_count,  epochs=model.epochs, callbacks=[epoch_logger])

Epoch #0 - Duration: 0:00:00.423078
Epoch #1 - Duration: 0:00:00.455902
Epoch #2 - Duration: 0:00:00.433977
Epoch #3 - Duration: 0:00:00.436833
Epoch #4 - Duration: 0:00:00.417160
Epoch #5 - Duration: 0:00:00.422769
Epoch #6 - Duration: 0:00:00.423467
Epoch #7 - Duration: 0:00:00.426159
Epoch #8 - Duration: 0:00:00.417526
Epoch #9 - Duration: 0:00:00.421000
Epoch #10 - Duration: 0:00:00.431445
Epoch #11 - Duration: 0:00:00.412789
Epoch #12 - Duration: 0:00:00.427250
Epoch #13 - Duration: 0:00:00.440512
Epoch #14 - Duration: 0:00:00.430008
Epoch #15 - Duration: 0:00:00.423950
Epoch #16 - Duration: 0:00:00.423063
Epoch #17 - Duration: 0:00:00.425558
Epoch #18 - Duration: 0:00:00.423047
Epoch #19 - Duration: 0:00:00.482626
Epoch #20 - Duration: 0:00:00.414231
Epoch #21 - Duration: 0:00:00.413847
Epoch #22 - Duration: 0:00:00.431119
Epoch #23 - Duration: 0:00:00.415751
Epoch #24 - Duration: 0:00:00.411826
Epoch #25 - Duration: 0:00:00.418572
Epoch #26 - Duration: 0:00:00.419590
Epoch #27 -

In [21]:
# Persist the trained model to 'Doc2Vec.model' in the working directory
model.save("Doc2Vec.model")

In [22]:
#model = Doc2Vec.load("Doc2Vec.model")

## Assessing the model

In [23]:
# One row per evidence passage: raw text, alphanumeric tokens, and the inferred Doc2Vec vector
evidences_df = tokenize_text(
    pd.DataFrame.from_dict(evidences, orient='index', columns=['text']),
    'text',
)
evidences_df['inferred'] = evidences_df['tokens'].apply(model.infer_vector)
evidences_df

Unnamed: 0,text,tokens,inferred
evidence-0,"john bennet lawes, english entrepreneur and ag...","[john, bennet, lawes, english, entrepreneur, a...","[-0.23991504, 0.51219636, 0.05094586, 0.195069..."
evidence-1,lindberg began his professional career at the ...,"[lindberg, began, his, professional, career, a...","[-1.3084338, -0.12524424, -0.95123047, 0.55854..."
evidence-2,``boston (ladies of cambridge)'' by vampire we...,"[boston, ladies, of, cambridge, by, vampire, w...","[-0.2850507, -0.10411729, -0.35949567, 0.23621..."
evidence-3,"gerald francis goyer (born october 20, 1936) w...","[gerald, francis, goyer, born, october, 20, 19...","[-0.5594862, -0.7934478, 0.57019496, -0.384542..."
evidence-4,he detected abnormalities of oxytocinergic fun...,"[he, detected, abnormalities, of, oxytocinergi...","[-0.04946132, 0.110815756, -0.44829205, -0.211..."
...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...","[0.0019220194, 0.084965706, -0.4861797, 0.3197..."
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, 6110, volda]","[0.0010043998, -0.026967091, -0.020551557, 0.1..."
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...","[-0.3721467, 0.26059905, 0.22694564, -0.447193..."
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...","[0.24094175, -0.4339357, 0.10777598, 0.4715772..."


In [33]:
# Cache the evidence frame (text, tokens, inferred vectors) on disk;
# the commented line reloads it instead of recomputing
evidences_df.to_pickle('evidences_df.pkl')
#evidences_df = pd.read_pickle('evidences_df.pkl')

In [24]:
# One row per training claim, extended with its tokens and its inferred Doc2Vec vector
claims_df = tokenize_text(
    pd.DataFrame.from_dict(claims, orient='index'),
    'claim_text',
)
claims_df['inferred'] = claims_df['tokens'].apply(model.infer_vector)
claims_df

Unnamed: 0,claim_text,claim_label,evidences,tokens,inferred
claim-1937,not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1...","[not, only, is, there, no, scientific, evidenc...","[-0.80932164, -0.08572763, -0.3182288, -0.2329..."
claim-126,el niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]","[el, niño, drove, record, highs, in, global, t...","[-0.3487168, -0.09100804, -0.65324616, -0.2439..."
claim-2510,"in 1946, pdo switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]","[in, 1946, pdo, switched, to, a, cool, phase]","[-0.11134084, -0.29519156, -0.43432373, 0.0635..."
claim-2021,weather channel co-founder john coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5...","[weather, channel, john, coleman, provided, ev...","[-0.12073334, -0.11993397, -0.32204187, 0.2162..."
claim-2449,"""january 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72...","[january, 2008, capped, a, 12, month, period, ...","[-0.8894794, 0.098310165, -0.9383252, 0.193941..."
...,...,...,...,...,...
claim-1504,climate scientists say that aspects of the cas...,SUPPORTS,"[evidence-1055682, evidence-1047356, evidence-...","[climate, scientists, say, that, aspects, of, ...","[-0.3415583, -0.087080225, -0.3509223, 0.34327..."
claim-243,"in its 5th assessment report in 2013, the ipcc...",SUPPORTS,[evidence-916755],"[in, its, 5th, assessment, report, in, 2013, t...","[0.21890192, 0.038463153, -1.2138172, -0.93325..."
claim-2302,"since the mid 1970s, global temperatures have ...",NOT_ENOUGH_INFO,"[evidence-403673, evidence-889933, evidence-11...","[since, the, mid, 1970s, global, temperatures,...","[-0.8260765, -0.086758286, -0.24591607, 0.3731..."
claim-502,but abnormal temperature spikes in february an...,NOT_ENOUGH_INFO,"[evidence-97375, evidence-562427, evidence-521...","[but, abnormal, temperature, spikes, in, febru...","[-1.3430239, -0.19148764, -0.87164366, 0.09012..."


In [None]:
# Cache the claims frame on disk; the commented line reloads it instead of recomputing
claims_df.to_pickle('claims_df.pkl')
#claims_df = pd.read_pickle('claims_df.pkl')

### Implement BM25 Retrieval

In [25]:
# Initialize the BM25 index over the token lists of all evidence passages
bm25 = BM25Okapi(evidences_df['tokens'].tolist())

In [26]:
# Calculate BM25 scores for each claim
def calculate_bm25_scores(query_tokens):
    """Return ``bm25.get_scores(query_tokens)`` using the notebook-level `bm25` index."""
    scores = bm25.get_scores(query_tokens)
    return scores

In [27]:
print("Generating the BM25 scores")
# Compute BM25 scores for every training claim
# (disabled here: the scores are read from 'bm25_scores.pkl' a few cells below)
#bm25_scores = claims_df['tokens'].apply(calculate_bm25_scores)

Generating the BM25 scores


In [28]:
# Pull the inferred vectors out of both frames as plain Python lists
claim_vectors = claims_df['inferred'].tolist()
evidence_vectors = evidences_df['inferred'].tolist()

print("Generating the similarities")
# Cosine similarity of every claim vector (rows) against every evidence vector (columns)
doc2vec_similarities = cosine_similarity(X=claim_vectors, Y=evidence_vectors)

Generating the similarities


In [29]:
# Load the cached BM25 scores when the cache exists; otherwise compute them (slow) and
# write the cache, so the notebook also runs where 'bm25_scores.pkl' was never created.
# NOTE(review): unpickling can execute arbitrary code — only load a cache this notebook wrote.
try:
    bm25_scores = pd.read_pickle('bm25_scores.pkl')
except FileNotFoundError:
    bm25_scores = claims_df['tokens'].apply(calculate_bm25_scores)
    bm25_scores.to_pickle('bm25_scores.pkl')

In [30]:
# Persist the claim-vs-evidence cosine similarities; the commented lines reload them instead
with open('doc2vec_similarities.pkl', 'wb') as sim_file:
    pickle.dump(doc2vec_similarities, sim_file)
#with open('doc2vec_similarities.pkl', 'rb') as sim_file:
#    doc2vec_similarities = pickle.load(sim_file)

In [31]:
# Min-max normalization helper (applied to document lengths and to score arrays)
def normalize(scores):
    """Min-max scale `scores` into the range [0, 1].

    Parameters
    ----------
    scores : array-like
        Numeric values (list, tuple or NumPy array of any shape).

    Returns
    -------
    numpy.ndarray
        Array of the same shape holding ``(x - min) / (max - min)``.
        When all values are identical the range is zero, so an all-zero array
        is returned instead of dividing by zero (which would produce NaN).
        An empty input yields an empty array.
    """
    values = np.asarray(scores)
    if values.size == 0:
        return values
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant input: every element sits at the minimum
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / span

In [32]:
# Token count of every evidence passage, min-max normalized and then weighted by 0.2
document_lengths = list(map(len, evidences_df['tokens']))
normalized_lengths = normalize(document_lengths)
p_normalized_lengths = normalized_lengths * 0.2

In [None]:
# Min-max normalize each entry of bm25_scores independently (one normalize() call per entry)
n_bm25_scores = [normalize(doc) for doc in bm25_scores]

In [None]:
# Min-max normalize the whole similarity matrix in one call (a single global min and max,
# unlike the per-entry normalization applied to the BM25 scores)
n_doc2vec_similarities = normalize(doc2vec_similarities)

In [None]:
# Apply the 0.4 weight to every normalized BM25 entry; the result is a Python list
p_bm25_scores = [0.4 * doc for doc in n_bm25_scores]

In [None]:
# Apply the 0.4 weight to every row of the normalized similarities; the result is a Python list
p_doc2vec_similarities = [0.4 * doc for doc in n_doc2vec_similarities]

In [None]:
# Drop the unweighted intermediates and the BM25 index; only the weighted
# p_* variables are used by the following cells
del document_lengths, normalized_lengths
del bm25, n_bm25_scores
del doc2vec_similarities, n_doc2vec_similarities

In [None]:
# Sanity check: number of weighted length values (one per evidence passage)
len(p_normalized_lengths)

In [None]:
# Add the weighted length term to each row of the weighted Doc2Vec similarities.
# NOTE(review): `combined_similarities` is not read by any later visible cell.
combined_similarities = [(sim + p_normalized_lengths) for sim in p_doc2vec_similarities]

In [None]:
# Combine scores with weights.
# p_bm25_scores and p_doc2vec_similarities are Python lists, so a bare `+` between them
# concatenates the two lists instead of adding the scores. Converting both to arrays makes
# the sum element-wise; p_normalized_lengths (one value per evidence) then broadcasts
# across the claim rows.
combined_scores = (
    np.asarray(p_bm25_scores)
    + np.asarray(p_doc2vec_similarities)
    + p_normalized_lengths
)

In [None]:
# Rank evidences: for every claim (row), evidence indices ordered from highest to lowest score.
# argsort sorts along the last axis, so the reversal has to be applied to that same axis;
# a bare [::-1] on a 2-D result would flip the order of the claim rows instead.
ranked_indices = np.argsort(combined_scores, axis=-1)[..., ::-1]

In [None]:

# Prepare to collect top values, their indices, names
top_values_per_column = []
top_indices_per_column = []
top_evidence_names_per_column = []

# Evidence-by-claim similarity matrix (rows: evidences, columns: training claims).
# No visible cell defined `similarity_matrix`, so build it here with the same
# orientation the dev-set matrix uses.
similarity_matrix = cosine_similarity(evidence_vectors, claim_vectors)

for i in range(similarity_matrix.shape[1]):  # Iterate over columns (one per claim)
    column = similarity_matrix[:, i]
    # Positions of the 5 largest similarities, then ordered from highest to lowest
    indices = np.argpartition(column, -5)[-5:]
    sorted_indices = indices[np.argsort(column[indices])][::-1]
    sorted_indices = sorted_indices.astype(int)
    # Not named `evidences`: that would overwrite the evidence dict loaded earlier
    evidence_names = [evidences_df.index[index] for index in sorted_indices]

    top_indices_per_column.append(sorted_indices)
    top_values_per_column.append(column[sorted_indices])
    top_evidence_names_per_column.append(evidence_names)

In [None]:
# Map every training claim id to the evidence names retrieved for it
predictions = {
    claims_df.index[position]: names
    for position, names in enumerate(top_evidence_names_per_column)
}

# One column per rank (0-4), plus a 'predictions' column holding the five names as a list
doc2vec = pd.DataFrame.from_dict(predictions, orient='index')
doc2vec['predictions'] = doc2vec[[0, 1, 2, 3, 4]].agg(list, axis=1)

# Claims frame whose 'evidences' column now holds the predictions;
# the original evidences and the intermediate columns are dropped
train_df_doc2vec = (
    claims_df
    .copy()
    .drop(columns=['evidences', 'tokens', 'inferred'])
    .assign(predictions=doc2vec['predictions'])
    .rename(columns={'predictions': 'evidences'})
)

# Save the DataFrame to a Pickle file
train_df_doc2vec.to_pickle('dfDoc2VecPredictions.pkl')
train_df_doc2vec.head()

In [None]:
# Export the DataFrame to a JSON file keyed by claim id (orient='index')
train_df_doc2vec.to_json('../data/train_claims_doc2vec.json', orient='index')

In [None]:
# One row per dev claim, extended with its tokens and its inferred Doc2Vec vector
dev_df = tokenize_text(
    pd.DataFrame.from_dict(dev_claims, orient='index'),
    'claim_text',
)
dev_df['inferred'] = dev_df['tokens'].apply(model.infer_vector)
dev_df

In [None]:
# Cosine similarity with evidences as rows and dev claims as columns
claim_dev_vectors = dev_df['inferred'].tolist()
similarity_dev_matrix = cosine_similarity(X=evidence_vectors, Y=claim_dev_vectors)

In [None]:
# Preview the evidence-by-dev-claim similarity matrix
similarity_dev_matrix

In [None]:
# Prepare to collect top values, their indices, names
dev_top_values_per_column = []
dev_top_indices_per_column = []
dev_top_evidence_names_per_column = []


for i in range(similarity_dev_matrix.shape[1]):  # Iterate over columns (one per dev claim)
    column = similarity_dev_matrix[:, i]
    # Positions of the 5 largest similarities, then ordered from highest to lowest
    indices = np.argpartition(column, -5)[-5:]
    sorted_indices = indices[np.argsort(column[indices])][::-1]
    sorted_indices = sorted_indices.astype(int)
    # Not named `evidences`: that would overwrite the evidence dict loaded earlier
    evidence_names = [evidences_df.index[index] for index in sorted_indices]

    dev_top_indices_per_column.append(sorted_indices)
    dev_top_values_per_column.append(column[sorted_indices])
    dev_top_evidence_names_per_column.append(evidence_names)

In [None]:
# Map every dev claim id to the evidence names retrieved for it
dev_predictions = {
    dev_df.index[position]: names
    for position, names in enumerate(dev_top_evidence_names_per_column)
}

# One column per rank (0-4), plus a 'predictions' column holding the five names as a list
dev_doc2vec = pd.DataFrame.from_dict(dev_predictions, orient='index')
dev_doc2vec['predictions'] = dev_doc2vec[[0, 1, 2, 3, 4]].agg(list, axis=1)

# Dev claims frame whose 'evidences' column now holds the predictions;
# the original evidences and the intermediate columns are dropped
dev_df_doc2vec = (
    dev_df
    .copy()
    .drop(columns=['tokens', 'inferred', 'evidences'])
    .assign(predictions=dev_doc2vec['predictions'])
    .rename(columns={'predictions': 'evidences'})
)

# Save the DataFrame to a Pickle file
dev_df_doc2vec.to_pickle('dfdevDoc2VecDevPredictions.pkl')
dev_df_doc2vec.head()

In [None]:
# Export the DataFrame to a JSON file keyed by claim id (orient='index')
dev_df_doc2vec.to_json('../data/dev_claims_doc2vec.json', orient='index')

## Classification

In [None]:
def vec_for_learning(model, sents):
    """Return ``(targets, regressors)`` tuples for a collection of tagged documents.

    For every document the target is its first tag and the regressor is the
    vector that ``model.infer_vector`` returns for the document's words.
    """
    pairs = []
    for doc in sents:
        pairs.append((doc.tags[0], model.infer_vector(doc.words)))
    targets, regressors = zip(*pairs)
    return targets, regressors

In [None]:
# Logistic-regression classifier using `cores` parallel jobs, C=1e5 and at most 200 iterations
logreg = LogisticRegression(n_jobs=cores, C=1e5, max_iter=200)

In [None]:
# Labels (first tag) and inferred vectors for the training documents
y_train, X_train = vec_for_learning(model, train_corpus)

In [None]:
# Labels (first tag) and inferred vectors for the dev documents
y_dev, X_dev = vec_for_learning(model, dev_corpus)

In [None]:
# Fit on the training vectors, predict the dev vectors, then report the metrics
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_dev)

dev_accuracy = accuracy_score(y_dev, y_pred)
print('Testing accuracy %s' % dev_accuracy)

dev_weighted_f1 = f1_score(y_dev, y_pred, average='weighted')
print('Testing F1 score: {}'.format(dev_weighted_f1))

report = classification_report(y_dev, y_pred)
print(f"Classification Report:\n{report}")

In [None]:
# Re-read the dev claims from disk, replacing the in-memory copy whose
# 'claim_text' values were lower-cased earlier
with open('../data/dev-claims.json', 'r') as dev_file:
    dev_claims = json.load(dev_file)

In [None]:
# Run a garbage-collection pass explicitly
import gc
gc.collect()