### Libraries

%pip install gensim nltk pandas

Code based on https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html

In [1]:
import json
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import collections


# 1. Preprocess the data

In [2]:
# Read train claims
with open('../data/train-claims.json', 'r') as f:
    claims = json.load(f)

# Read dev claims
with open('../data/dev-claims.json', 'r') as f:
    dev_claims = json.load(f)

In [3]:
# Lowercasing the 'claim_text' field for each claim
for claim_id, claim_info in claims.items():
    claim_info['claim_text'] = claim_info['claim_text'].lower()

for claim_id, claim_info in dev_claims.items():
    claim_info['claim_text'] = claim_info['claim_text'].lower()

In [4]:
# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidences = json.load(f)

In [5]:
evidences = {i: str.lower(j) for i,j in evidences.items()}

In [6]:
print("Number of claims for training = {0}".format(len(claims)))
print("Number of claims for development = {0}".format(len(dev_claims)))
print("Number of evidences = {0}".format(len(evidences)))

Number of claims for training = 1228
Number of claims for development = 154
Number of evidences = 1208827


### Prepare the corpus

In [7]:
# Collect all texts from claims
corpus = {}

for id, claim in claims.items():
#    corpus[id] = str.strip(claim['claim_text'])  # Add claim text

    for evidence in claim['evidences']:
        text = claim['claim_text'] + " " + evidences[evidence]
        corpus[id + ' - ' + evidence] = str.strip(text)

#for id, evidence in evidences.items():
#    corpus[id] = str.strip(evidence) # Add evidence text

In [8]:
# Collect all texts from claims
#corpus = {}
#for id, claim in claims.items():
#    corpus[id] = str.strip(claim['claim_text'])  # Add claim text

#for id, evidence in evidences.items():
#    corpus[id] = str.strip(evidence) # Add evidence text

In [9]:
def tokenize_text(df, column):
    df['tokens'] = df[column].apply(lambda x: [token for token in word_tokenize(x) if token.isalnum()])
    return df

In [10]:
# Convert the list of documents into a pandas DataFrame
df = pd.DataFrame.from_dict(corpus, orient='index', columns=['text'])


In [None]:
df

In [11]:
df = tokenize_text(df,'text')
df

Unnamed: 0,text,tokens
claim-1937 - evidence-442946,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc..."
claim-1937 - evidence-1194317,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc..."
claim-1937 - evidence-12171,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc..."
claim-126 - evidence-338219,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t..."
claim-126 - evidence-1127398,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t..."
...,...,...
claim-502 - evidence-583187,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru..."
claim-3093 - evidence-971105,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a..."
claim-3093 - evidence-457769,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a..."
claim-3093 - evidence-298971,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a..."


In [12]:
# Define a function to be applied to each row
def process_row(row, index):
    return TaggedDocument(row['tokens'], tags=[index])

df['tagged'] = df.apply(lambda row: process_row(row, row.name), axis=1)
df

Unnamed: 0,text,tokens,tagged
claim-1937 - evidence-442946,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-1937 - evidence-1194317,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-1937 - evidence-12171,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...","([not, only, is, there, no, scientific, eviden..."
claim-126 - evidence-338219,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...","([el, niño, drove, record, highs, in, global, ..."
claim-126 - evidence-1127398,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...","([el, niño, drove, record, highs, in, global, ..."
...,...,...,...
claim-502 - evidence-583187,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru...","([but, abnormal, temperature, spikes, in, febr..."
claim-3093 - evidence-971105,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a...","([sending, oscillating, microwaves, from, an, ..."
claim-3093 - evidence-457769,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a...","([sending, oscillating, microwaves, from, an, ..."
claim-3093 - evidence-298971,sending oscillating microwaves from an antenna...,"[sending, oscillating, microwaves, from, an, a...","([sending, oscillating, microwaves, from, an, ..."


In [13]:
train_corpus = df.tagged.values
train_corpus

array([TaggedDocument(words=['not', 'only', 'is', 'there', 'no', 'scientific', 'evidence', 'that', 'co2', 'is', 'a', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'more', 'plant', 'and', 'animal', 'life', 'at', 'very', 'high', 'concentrations', '100', 'times', 'atmospheric', 'concentration', 'or', 'greater', 'carbon', 'dioxide', 'can', 'be', 'toxic', 'to', 'animal', 'life', 'so', 'raising', 'the', 'concentration', 'to', 'ppm', '1', 'or', 'higher', 'for', 'several', 'hours', 'will', 'eliminate', 'pests', 'such', 'as', 'whiteflies', 'and', 'spider', 'mites', 'in', 'a', 'greenhouse'], tags=['claim-1937 - evidence-442946']),
       TaggedDocument(words=['not', 'only', 'is', 'there', 'no', 'scientific', 'evidence', 'that', 'co2', 'is', 'a', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'more', 'plant', 'and', 'animal', 'life', 'plants', 'can', 'grow', 'as', 'much', 'as', '50', 'percent', 'faster', 

## Train Model
https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [14]:
model = Doc2Vec(vector_size=100, min_count=2, epochs=50)

In [15]:
model.build_vocab(train_corpus)

In [16]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [17]:
model.save("Doc2Vec.model")

In [18]:
#model.load("Doc2Vec.model")

## Assesing the model

In [19]:
evidences_df = pd.DataFrame.from_dict(evidences, orient='index', columns=['text'])
evidences_df = tokenize_text(evidences_df,'text')
evidences_df['inferred'] = evidences_df['tokens'].apply(lambda x: model.infer_vector(x))
evidences_df

Unnamed: 0,text,tokens,inferred
evidence-0,"john bennet lawes, english entrepreneur and ag...","[john, bennet, lawes, english, entrepreneur, a...","[-0.14050333, -0.07346727, 0.15199955, -0.2144..."
evidence-1,lindberg began his professional career at the ...,"[lindberg, began, his, professional, career, a...","[0.061883505, -0.32056716, -0.28470227, -0.651..."
evidence-2,``boston (ladies of cambridge)'' by vampire we...,"[boston, ladies, of, cambridge, by, vampire, w...","[-0.027510643, -0.06301287, 0.11331111, 0.0033..."
evidence-3,"gerald francis goyer (born october 20, 1936) w...","[gerald, francis, goyer, born, october, 20, 19...","[0.23856257, 0.24009803, -0.12047614, 0.009722..."
evidence-4,he detected abnormalities of oxytocinergic fun...,"[he, detected, abnormalities, of, oxytocinergi...","[-0.036221355, -0.044179935, 0.06933342, -0.09..."
...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...","[0.021422707, 0.095737614, 0.34775507, 0.12257..."
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, 6110, volda]","[-0.105602525, 0.076353885, 0.02551154, 0.0276..."
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...","[-0.1008926, 0.023352848, 0.037718453, -0.2017..."
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...","[0.40652102, -0.4581477, -0.46946862, 0.645823..."


In [20]:
claims_df = pd.DataFrame.from_dict(claims, orient='index')
claims_df = tokenize_text(claims_df,'claim_text')
claims_df['inferred'] = claims_df['tokens'].apply(lambda x: model.infer_vector(x))
claims_df

Unnamed: 0,claim_text,claim_label,evidences,tokens,inferred
claim-1937,not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1...","[not, only, is, there, no, scientific, evidenc...","[0.17607394, 0.08539891, 0.4827279, 0.00881600..."
claim-126,el niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]","[el, niño, drove, record, highs, in, global, t...","[0.030782836, 0.3517376, 0.13674931, 0.2072547..."
claim-2510,"in 1946, pdo switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]","[in, 1946, pdo, switched, to, a, cool, phase]","[-0.08643206, -0.06467303, 0.009351828, 0.0752..."
claim-2021,weather channel co-founder john coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5...","[weather, channel, john, coleman, provided, ev...","[0.017568791, 0.014454866, -0.032761347, 0.010..."
claim-2449,"""january 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72...","[january, 2008, capped, a, 12, month, period, ...","[-0.014964191, 0.15340918, 0.04626587, 0.05688..."
...,...,...,...,...,...
claim-1504,climate scientists say that aspects of the cas...,SUPPORTS,"[evidence-1055682, evidence-1047356, evidence-...","[climate, scientists, say, that, aspects, of, ...","[-0.21888562, 0.056726113, -0.034341607, -0.06..."
claim-243,"in its 5th assessment report in 2013, the ipcc...",SUPPORTS,[evidence-916755],"[in, its, 5th, assessment, report, in, 2013, t...","[-0.26896146, -0.03710625, -0.008333473, 0.418..."
claim-2302,"since the mid 1970s, global temperatures have ...",NOT_ENOUGH_INFO,"[evidence-403673, evidence-889933, evidence-11...","[since, the, mid, 1970s, global, temperatures,...","[-0.30831116, 0.1582826, -0.27701002, 0.080839..."
claim-502,but abnormal temperature spikes in february an...,NOT_ENOUGH_INFO,"[evidence-97375, evidence-562427, evidence-521...","[but, abnormal, temperature, spikes, in, febru...","[-0.29489544, 0.26832366, -0.5699372, 0.055965..."


In [24]:
def get_distance(vector1, vector2):
    similarity = Doc2Vec.similarity_unseen_docs(model, vector1, vector2)
    return similarity

In [26]:
distances = {}
for index, row in claims_df.iterrows():
    vector1 = row['tokens']
    distances[index] = []
    for index2, row2 in evidences_df.iterrows():
        vector2 = row2['tokens']
        sim = get_distance(vector1, vector2)
        if sim > 0.5:
            distances[index].append((index,sim))

KeyboardInterrupt: 

In [None]:
def get_top10_rank(inferred_vector):
    similarity_vector = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    return similarity_vector[0:10]

In [None]:
def filter_and_predict(sims):
    filtered_list = [item for item in sims if not item[0].startswith('claim')]
    prediction = [sim[0] for sim in filtered_list if sim[1] > 0.5 and sim[0]]
    return prediction

In [None]:

train_df['sims'] = train_df['inferred'].apply(lambda x: get_top10_rank(x))
train_df['predictions'] = train_df['sims'].apply(filter_and_predict)
train_df

In [None]:
dev_df = pd.DataFrame.from_dict(dev_claims, orient='index')
dev_df = tokenize_text(dev_df,'claim_text')
dev_df['inferred'] = dev_df['tokens'].apply(lambda x: model.infer_vector(x))
dev_df['sims'] = dev_df['inferred'].apply(lambda x: get_top10_rank(x))
dev_df['predictions'] = dev_df['sims'].apply(filter_and_predict)
dev_df

In [None]:
# Save the DataFrame to a Pickle file
train_df.to_pickle('dfInferred.pkl')
# Restore the DataFrame from the Pickle file
#train_df = pd.read_pickle('dfInferred.pkl')
train_df

In [None]:
# Save the DataFrame to a Pickle file
dev_df.to_pickle('devdfInferred.pkl')
# Restore the DataFrame from the Pickle file
#dev_df = pd.read_pickle('devdfInferred.pkl')
dev_df

In [None]:
train_df_doc2vec = train_df.copy()
train_df_doc2vec = train_df_doc2vec.drop('evidences', axis=1)
train_df_doc2vec = train_df_doc2vec.drop('tokens', axis=1)
train_df_doc2vec = train_df_doc2vec.drop('inferred', axis=1)
train_df_doc2vec = train_df_doc2vec.drop('sims', axis=1)
train_df_doc2vec = train_df_doc2vec.rename(columns={'predictions': 'evidences'})
train_df_doc2vec

In [None]:
dev_df_doc2vec = dev_df.copy()
dev_df_doc2vec = dev_df_doc2vec.drop('evidences', axis=1)
dev_df_doc2vec = dev_df_doc2vec.drop('tokens', axis=1)
dev_df_doc2vec = dev_df_doc2vec.drop('inferred', axis=1)
dev_df_doc2vec = dev_df_doc2vec.drop('sims', axis=1)
dev_df_doc2vec = dev_df_doc2vec.rename(columns={'predictions': 'evidences'})
dev_df_doc2vec

In [None]:
# Export the DataFrame to a JSON file
train_df_doc2vec.to_json('../data/train_claims_doc2vec.json', orient='index')
dev_df_doc2vec.to_json('../data/dev_claims_doc2vec.json', orient='index')