### Libraries

%pip install gensim nltk pandas scikit-learn torch

Code based on https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html

In [1]:
import collections
import json

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity


# 1. Preprocess the data

In [2]:
# Read train claims.
# The encoding is given explicitly: the claim texts contain non-ASCII
# characters, and without it open() falls back to the platform default.
with open('../data/train-claims.json', 'r', encoding='utf-8') as f:
    claims = json.load(f)

# Read dev claims (same format and encoding as the train claims)
with open('../data/dev-claims.json', 'r', encoding='utf-8') as f:
    dev_claims = json.load(f)

In [3]:
# Normalise the 'claim_text' field to lower case, in place, for both the
# training and the development claims.
for claim_set in (claims, dev_claims):
    for record in claim_set.values():
        record['claim_text'] = record['claim_text'].lower()

In [4]:
# Read evidence.
# The encoding is given explicitly so the platform default is never used
# to decode the file.
with open('../data/evidence.json', 'r', encoding='utf-8') as f:
    evidences = json.load(f)

In [5]:
# Lower-case every evidence text; the evidence ids (keys) are kept as they are.
evidences = {
    evidence_id: str.lower(evidence_text)
    for evidence_id, evidence_text in evidences.items()
}

In [6]:
# Report how many items each loaded collection holds.
print(f"Number of claims for training = {len(claims)}")
print(f"Number of claims for development = {len(dev_claims)}")
print(f"Number of evidences = {len(evidences)}")

Number of claims for training = 1228
Number of claims for development = 154
Number of evidences = 1208827


### Prepare the corpus

In [7]:
# Build one training document per (claim, gold evidence) pair: the claim text
# followed by the text of that evidence, keyed by "<claim id> - <evidence id>".
corpus = {}

# The loop variable is `claim_id` rather than `id` so that the builtin `id`
# is not shadowed for the rest of the notebook.
for claim_id, claim in claims.items():
    for evidence_id in claim['evidences']:
        pair_text = claim['claim_text'] + " " + evidences[evidence_id]
        corpus[claim_id + ' - ' + evidence_id] = pair_text.strip()

#for id, evidence in evidences.items():
#    corpus[id] = str.strip(evidence) # Add evidence text

In [8]:
# Collect all texts from claims
#corpus = {}
#for id, claim in claims.items():
#    corpus[id] = str.strip(claim['claim_text'])  # Add claim text

#for id, evidence in evidences.items():
#    corpus[id] = str.strip(evidence) # Add evidence text

In [9]:
def tokenize_text(df, column):
    """Add a 'tokens' column holding the alphanumeric word tokens of `column`.

    Each value of `df[column]` is split with `word_tokenize`, and only tokens
    for which `str.isalnum()` is true are kept.

    The frame is modified in place and also returned.
    """
    def _alnum_tokens(text):
        return [tok for tok in word_tokenize(text) if tok.isalnum()]

    df['tokens'] = df[column].apply(_alnum_tokens)
    return df

In [10]:
# Turn the corpus dict into a one-column frame: the dict keys become the
# index and the document texts go into the 'text' column.
df = pd.DataFrame.from_dict(
    corpus,
    orient='index',
    columns=['text'],
)


In [None]:
# Preview the corpus frame before the 'tokens' column is added.
df

In [11]:
# Add the 'tokens' column (alphanumeric word tokens of each document text).
df = tokenize_text(df,'text')
df

Unnamed: 0,text,tokens
claim-1937 - evidence-442946,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc..."
claim-1937 - evidence-1194317,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc..."
claim-1937 - evidence-12171,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc..."
claim-126 - evidence-338219,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t..."
claim-126 - evidence-1127398,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t..."
...,...,...
claim-502 - evidence-583187,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru..."
claim-3093 - evidence-971105,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a..."
claim-3093 - evidence-457769,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a..."
claim-3093 - evidence-298971,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a..."


In [12]:
def process_row(row, index):
    """Wrap the row's tokens in a TaggedDocument whose only tag is `index`."""
    return TaggedDocument(words=row['tokens'], tags=[index])


# Tag every document with its own index label (row.name), so each
# claim/evidence pair becomes a separately tagged training document.
df['tagged'] = df.apply(lambda record: process_row(record, record.name), axis=1)
df

Unnamed: 0,text,tokens,tagged
claim-1937 - evidence-442946,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-1937 - evidence-1194317,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-1937 - evidence-12171,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-126 - evidence-338219,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...","([el, niño, drove, record, highs, in, global, ..."
claim-126 - evidence-1127398,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...","([el, niño, drove, record, highs, in, global, ..."
...,...,...,...
claim-502 - evidence-583187,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru...","([but, abnormal, temperature, spikes, in, febr..."
claim-3093 - evidence-971105,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a...","([sending, oscillating, microwaves, from, an, ..."
claim-3093 - evidence-457769,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a...","([sending, oscillating, microwaves, from, an, ..."
claim-3093 - evidence-298971,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a...","([sending, oscillating, microwaves, from, an, ..."


In [13]:
# Pull the TaggedDocument objects out of the frame as an array; this is the
# corpus used to build the vocabulary and train the model.
train_corpus = df['tagged'].to_numpy()
train_corpus

array([TaggedDocument(words=['not', 'only', 'is', 'there', 'no', 'scientific', 'evidence', 'that', 'co2', 'is', 'a', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'more', 'plant', 'and', 'animal', 'life', 'at', 'very', 'high', 'concentrations', '100', 'times', 'atmospheric', 'concentration', 'or', 'greater', 'carbon', 'dioxide', 'can', 'be', 'toxic', 'to', 'animal', 'life', 'so', 'raising', 'the', 'concentration', 'to', 'ppm', '1', 'or', 'higher', 'for', 'several', 'hours', 'will', 'eliminate', 'pests', 'such', 'as', 'whiteflies', 'and', 'spider', 'mites', 'in', 'a', 'greenhouse'], tags=['claim-1937 - evidence-442946']),
       TaggedDocument(words=['not', 'only', 'is', 'there', 'no', 'scientific', 'evidence', 'that', 'co2', 'is', 'a', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'more', 'plant', 'and', 'animal', 'life', 'plants', 'can', 'grow', 'as', 'much', 'as', '50', 'percent', 'faster', 

## Train Model
https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [14]:
# Doc2Vec hyper-parameters, spelled out one per line.
model = Doc2Vec(
    vector_size=100,  # dimensionality of the document vectors
    min_count=2,      # minimum total frequency for a word to be kept
    epochs=50,        # number of training passes over the corpus
)

In [15]:
# Scan the tagged training documents once to build the model vocabulary.
model.build_vocab(train_corpus)

In [16]:
# Train on the same corpus; the example count and the number of epochs are
# read back from the model so they match what was configured and scanned.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [17]:
# Persist the trained model in the working directory for later reuse.
model.save("Doc2Vec.model")

In [18]:
#model = Doc2Vec.load("Doc2Vec.model")

## Assessing the model

In [19]:
# All evidences as a frame (index = evidence id), tokenised the same way as
# the training corpus and then embedded with the trained model.
evidences_df = tokenize_text(
    pd.DataFrame.from_dict(evidences, orient='index', columns=['text']),
    'text',
)
evidences_df['inferred'] = evidences_df['tokens'].apply(model.infer_vector)
evidences_df

Unnamed: 0,text,tokens,inferred
evidence-0,"john bennet lawes, english entrepreneur and ag...","[john, bennet, lawes, english, entrepreneur, a...","[-0.14050333, -0.07346727, 0.15199955, -0.2144..."
evidence-1,lindberg began his professional career at the ...,"[lindberg, began, his, professional, career, a...","[0.061883505, -0.32056716, -0.28470227, -0.651..."
evidence-2,``boston (ladies of cambridge)'' by vampire we...,"[boston, ladies, of, cambridge, by, vampire, w...","[-0.027510643, -0.06301287, 0.11331111, 0.0033..."
evidence-3,"gerald francis goyer (born october 20, 1936) w...","[gerald, francis, goyer, born, october, 20, 19...","[0.23856257, 0.24009803, -0.12047614, 0.009722..."
evidence-4,he detected abnormalities of oxytocinergic fun...,"[he, detected, abnormalities, of, oxytocinergi...","[-0.036221355, -0.044179935, 0.06933342, -0.09..."
...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...","[0.021422707, 0.095737614, 0.34775507, 0.12257..."
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, 6110, volda]","[-0.105602525, 0.076353885, 0.02551154, 0.0276..."
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...","[-0.1008926, 0.023352848, 0.037718453, -0.2017..."
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...","[0.40652102, -0.4581477, -0.46946862, 0.645823..."


In [20]:
# Training claims as a frame (index = claim id), tokenised on 'claim_text'
# and embedded with the trained model.
claims_df = tokenize_text(
    pd.DataFrame.from_dict(claims, orient='index'),
    'claim_text',
)
claims_df['inferred'] = claims_df['tokens'].apply(model.infer_vector)
claims_df

Unnamed: 0,claim_text,claim_label,evidences,tokens,inferred
claim-1937,not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1...","[not, only, is, there, no, scientific, evidenc...","[0.17607394, 0.08539891, 0.4827279, 0.00881600..."
claim-126,el niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]","[el, niño, drove, record, highs, in, global, t...","[0.030782836, 0.3517376, 0.13674931, 0.2072547..."
claim-2510,"in 1946, pdo switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]","[in, 1946, pdo, switched, to, a, cool, phase]","[-0.08643206, -0.06467303, 0.009351828, 0.0752..."
claim-2021,weather channel co-founder john coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5...","[weather, channel, john, coleman, provided, ev...","[0.017568791, 0.014454866, -0.032761347, 0.010..."
claim-2449,"""january 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72...","[january, 2008, capped, a, 12, month, period, ...","[-0.014964191, 0.15340918, 0.04626587, 0.05688..."
...,...,...,...,...,...
claim-1504,climate scientists say that aspects of the cas...,SUPPORTS,"[evidence-1055682, evidence-1047356, evidence-...","[climate, scientists, say, that, aspects, of, ...","[-0.21888562, 0.056726113, -0.034341607, -0.06..."
claim-243,"in its 5th assessment report in 2013, the ipcc...",SUPPORTS,[evidence-916755],"[in, its, 5th, assessment, report, in, 2013, t...","[-0.26896146, -0.03710625, -0.008333473, 0.418..."
claim-2302,"since the mid 1970s, global temperatures have ...",NOT_ENOUGH_INFO,"[evidence-403673, evidence-889933, evidence-11...","[since, the, mid, 1970s, global, temperatures,...","[-0.30831116, 0.1582826, -0.27701002, 0.080839..."
claim-502,but abnormal temperature spikes in february an...,NOT_ENOUGH_INFO,"[evidence-97375, evidence-562427, evidence-521...","[but, abnormal, temperature, spikes, in, febru...","[-0.29489544, 0.26832366, -0.5699372, 0.055965..."


In [None]:
# Inferred vectors as plain lists, in the row order of their frames.
# The claim list is named `claim_vectors` (not `claims_vectors`) so that it
# is the same variable the similarity computation reads.
claim_vectors = claims_df['inferred'].to_list()
evidence_vectors = evidences_df['inferred'].to_list()

In [71]:
from sklearn.metrics.pairwise import cosine_similarity

# Inferred vectors as plain lists, in the row order of their frames.
evidence_vectors = evidences_df['inferred'].to_list()
claim_vectors = claims_df['inferred'].to_list()

# Cosine similarity of every evidence (rows) against every claim (columns).
similarity_matrix = cosine_similarity(evidence_vectors, claim_vectors)


0.17030945

In [97]:
# Inspect the evidence-by-claim cosine similarity matrix.
similarity_matrix

array([[ 0.17030945,  0.0625849 ,  0.20881778, ...,  0.22033504,
         0.08594144,  0.1297683 ],
       [ 0.06386488,  0.05623803,  0.15477315, ...,  0.1461761 ,
         0.23356791,  0.16277511],
       [ 0.2234163 ,  0.20351753,  0.3626231 , ...,  0.28136963,
         0.20387682,  0.4433491 ],
       ...,
       [ 0.57291985,  0.41554514,  0.61106503, ...,  0.5559787 ,
         0.47261038,  0.55696535],
       [-0.00474121, -0.18830982, -0.06577919, ..., -0.08228283,
         0.10299604,  0.03475001],
       [ 0.05257512, -0.12327001, -0.06539656, ..., -0.01830591,
         0.01258422, -0.07759824]], dtype=float32)

In [118]:
# Number of evidences retrieved for every claim.
TOP_K = 5

# Per-claim results, in the same order as the columns of similarity_matrix.
top_values_per_column = []
top_indices_per_column = []
top_evidence_names_per_column = []

for i in range(similarity_matrix.shape[1]):  # one column per claim
    column = similarity_matrix[:, i]
    # Select the TOP_K highest scores without sorting the whole column ...
    indices = np.argpartition(column, -TOP_K)[-TOP_K:]
    # ... then order just those candidates from most to least similar.
    sorted_indices = indices[np.argsort(column[indices])][::-1]
    sorted_indices = sorted_indices.astype(int)
    # This list must not be called `evidences`: that name holds the
    # evidence-id -> text dict that other cells read.
    top_evidence_names = [evidences_df.index[index] for index in sorted_indices]

    top_indices_per_column.append(sorted_indices)
    top_values_per_column.append(column[sorted_indices])
    top_evidence_names_per_column.append(top_evidence_names)

In [119]:
# Map every training claim id to its ranked list of retrieved evidence ids.
predictions = {
    claims_df.index[position]: evidence_names
    for position, evidence_names in enumerate(top_evidence_names_per_column)
}

# One row per claim, one column per rank (column 0 holds the most similar).
train_df_doc2vec = pd.DataFrame.from_dict(predictions, orient='index')
# Save the DataFrame to a Pickle file
train_df_doc2vec.to_pickle('dfInferred.pkl')
train_df_doc2vec

Unnamed: 0,0,1,2,3,4
claim-1937,evidence-1005071,evidence-848355,evidence-668884,evidence-760688,evidence-91099
claim-126,evidence-329503,evidence-1023546,evidence-1121828,evidence-716705,evidence-150709
claim-2510,evidence-520928,evidence-303820,evidence-848736,evidence-67392,evidence-1104308
claim-2021,evidence-55191,evidence-824530,evidence-393782,evidence-994549,evidence-199441
claim-2449,evidence-814790,evidence-118577,evidence-804415,evidence-223741,evidence-1126567
...,...,...,...,...,...
claim-1504,evidence-901265,evidence-106341,evidence-998010,evidence-824530,evidence-263479
claim-243,evidence-1029211,evidence-869387,evidence-868633,evidence-969251,evidence-1063869
claim-2302,evidence-525237,evidence-641721,evidence-1087907,evidence-188820,evidence-584356
claim-502,evidence-1159446,evidence-88825,evidence-95212,evidence-907574,evidence-247484


In [None]:
# Development claims as a frame (index = claim id), tokenised on 'claim_text'
# and embedded with the trained model.
dev_df = tokenize_text(
    pd.DataFrame.from_dict(dev_claims, orient='index'),
    'claim_text',
)
dev_df['inferred'] = dev_df['tokens'].apply(model.infer_vector)
dev_df

In [None]:
# Cache the dev claims frame (including tokens and inferred vectors) as a
# pickle in the working directory.
dev_df.to_pickle('devdfInferred.pkl')
# To reload the cached frame instead of recomputing it, uncomment:
#dev_df = pd.read_pickle('devdfInferred.pkl')
dev_df

In [None]:
# Build the dev export: the claim fields plus the retrieved evidences.
dev_df_doc2vec = dev_df.copy()

# No cell in this notebook adds a 'predictions' column to dev_df, so when it
# is missing (i.e. dev_df was not restored from a pickle that already had it)
# retrieve the evidences here the same way as for the training claims: the
# 5 evidences with the highest cosine similarity, most similar first.
if 'predictions' not in dev_df_doc2vec.columns:
    dev_top_k = 5
    dev_similarity = cosine_similarity(
        evidences_df['inferred'].to_list(),
        dev_df_doc2vec['inferred'].to_list(),
    )
    dev_predictions = []
    for claim_pos in range(dev_similarity.shape[1]):  # one column per dev claim
        scores = dev_similarity[:, claim_pos]
        candidates = np.argpartition(scores, -dev_top_k)[-dev_top_k:]
        ranked = candidates[np.argsort(scores[candidates])][::-1]
        dev_predictions.append(evidences_df.index[ranked].tolist())
    dev_df_doc2vec['predictions'] = pd.Series(
        dev_predictions, index=dev_df_doc2vec.index
    )

# Drop the gold evidences and the working columns. errors='ignore' because
# 'sims' is not created anywhere in this notebook and may be absent.
dev_df_doc2vec = dev_df_doc2vec.drop(
    columns=['evidences', 'tokens', 'inferred', 'sims'], errors='ignore'
)
# The retrieved evidences take the place of the gold 'evidences' column.
dev_df_doc2vec = dev_df_doc2vec.rename(columns={'predictions': 'evidences'})
dev_df_doc2vec

In [None]:
# Export both prediction frames to JSON files under ../data.
# orient='index' writes one top-level entry per row label (the claim id).
train_df_doc2vec.to_json('../data/train_claims_doc2vec.json', orient='index')
dev_df_doc2vec.to_json('../data/dev_claims_doc2vec.json', orient='index')