## Libraries

pip install gensim

pip install nltk

pip install pandas

In [1]:
import json
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import collections


## Loading Data

In [2]:
# Read train claims
with open('../data/train-claims.json', 'r') as f:
    claims = json.load(f)

In [3]:
# Lowercasing the 'claim_text' field for each claim
for claim_id, claim_info in claims.items():
    claim_info['claim_text'] = claim_info['claim_text'].lower()

In [23]:
len(claims)

1228

In [4]:
# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidences = json.load(f)


In [5]:
evidences = {i: str.lower(j) for i,j in evidences.items()}

### Prepare the corpus

In [6]:
# Collect all texts from claims
corpus = {}
for id, claim in claims.items():
    corpus[id] = str.strip(claim['claim_text'])  # Add claim text

for id, evidence in evidences.items():
    corpus[id] = str.strip(evidence) # Add evidence text

In [7]:
def tokenize_text(df):
    df['tokens'] = df['text'].apply(lambda x: [token for token in word_tokenize(x) if token.isalpha()])
    df["length"] = df.tokens.apply(len)
    return df

In [8]:
# Convert the list of documents into a pandas DataFrame
df = pd.DataFrame.from_dict(corpus, orient='index', columns=['text'])

In [9]:
df = tokenize_text(df)
df

Unnamed: 0,text,tokens,length
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17
...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...",9
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, volda]",5
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...",9
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...",46


In [10]:
# Define a function to be applied to each row
def process_row(row, index):
    return TaggedDocument(row['tokens'], tags=[index])

df['tagged'] = df.apply(lambda row: process_row(row, row.name), axis=1)
df

Unnamed: 0,text,tokens,length,tagged
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"([not, only, is, there, no, scientific, eviden..."
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"([el, niño, drove, record, highs, in, global, ..."
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"([in, pdo, switched, to, a, cool, phase], [cla..."
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"([weather, channel, john, coleman, provided, e..."
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"([january, capped, a, month, period, of, globa..."
...,...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...",9,"([also, on, the, property, is, a, contributing..."
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, volda]",5,"([class, fn, org, fyrde, volda], [evidence-120..."
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...",9,"([dragon, storm, game, a, game, and, collectib..."
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...",46,"([it, states, that, the, zeriuani, which, is, ..."


In [11]:
train_corpus = df.tagged.values

## Train Model
https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [12]:
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [13]:
model.build_vocab(train_corpus)

In [14]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [15]:
model.save("Doc2Vec.model")

In [None]:
model.load("Doc2Vec.model")

## Assesing the model

In [18]:
df['inferred'] = df['tokens'].apply(lambda x: model.infer_vector(x))
df

Unnamed: 0,text,tokens,length,tagged,inferred
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"([not, only, is, there, no, scientific, eviden...","[-0.5255818, 0.3639852, 0.792798, -0.28505632,..."
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"([el, niño, drove, record, highs, in, global, ...","[0.37021023, -0.118777975, 0.11101906, -0.0199..."
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"([in, pdo, switched, to, a, cool, phase], [cla...","[0.17473347, -0.37882805, 0.48910916, -0.17860..."
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"([weather, channel, john, coleman, provided, e...","[-0.31965876, 0.44649655, -0.21100004, -0.8012..."
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"([january, capped, a, month, period, of, globa...","[0.14436851, 1.6839336, 0.17605224, -0.0226998..."
...,...,...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...",9,"([also, on, the, property, is, a, contributing...","[0.21325698, -0.16921909, 0.5638019, 0.3699309..."
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, volda]",5,"([class, fn, org, fyrde, volda], [evidence-120...","[0.04556023, 0.088313326, 0.269229, 0.1356018,..."
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...",9,"([dragon, storm, game, a, game, and, collectib...","[-0.49490845, 0.24079002, 0.40730926, -0.332, ..."
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...",46,"([it, states, that, the, zeriuani, which, is, ...","[0.9109429, -1.4237438, -0.68798363, -0.434146..."


In [19]:
# Save the DataFrame to a Pickle file
df.to_pickle('dfInferred.pkl')

In [20]:
# Restore the DataFrame from the Pickle file
df = pd.read_pickle('dfInferred.pkl')

In [21]:
df

Unnamed: 0,text,tokens,length,tagged,inferred
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"([not, only, is, there, no, scientific, eviden...","[-0.5255818, 0.3639852, 0.792798, -0.28505632,..."
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"([el, niño, drove, record, highs, in, global, ...","[0.37021023, -0.118777975, 0.11101906, -0.0199..."
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"([in, pdo, switched, to, a, cool, phase], [cla...","[0.17473347, -0.37882805, 0.48910916, -0.17860..."
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"([weather, channel, john, coleman, provided, e...","[-0.31965876, 0.44649655, -0.21100004, -0.8012..."
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"([january, capped, a, month, period, of, globa...","[0.14436851, 1.6839336, 0.17605224, -0.0226998..."
...,...,...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...",9,"([also, on, the, property, is, a, contributing...","[0.21325698, -0.16921909, 0.5638019, 0.3699309..."
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, volda]",5,"([class, fn, org, fyrde, volda], [evidence-120...","[0.04556023, 0.088313326, 0.269229, 0.1356018,..."
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...",9,"([dragon, storm, game, a, game, and, collectib...","[-0.49490845, 0.24079002, 0.40730926, -0.332, ..."
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...",46,"([it, states, that, the, zeriuani, which, is, ...","[0.9109429, -1.4237438, -0.68798363, -0.434146..."


In [27]:
df_claims = df[:1228]
df_claims

Unnamed: 0,text,tokens,length,tagged,inferred
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"([not, only, is, there, no, scientific, eviden...","[-0.5255818, 0.3639852, 0.792798, -0.28505632,..."
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"([el, niño, drove, record, highs, in, global, ...","[0.37021023, -0.118777975, 0.11101906, -0.0199..."
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"([in, pdo, switched, to, a, cool, phase], [cla...","[0.17473347, -0.37882805, 0.48910916, -0.17860..."
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"([weather, channel, john, coleman, provided, e...","[-0.31965876, 0.44649655, -0.21100004, -0.8012..."
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"([january, capped, a, month, period, of, globa...","[0.14436851, 1.6839336, 0.17605224, -0.0226998..."
...,...,...,...,...,...
claim-1504,climate scientists say that aspects of the cas...,"[climate, scientists, say, that, aspects, of, ...",20,"([climate, scientists, say, that, aspects, of,...","[-0.4632979, 0.24924023, 0.5984457, -0.5952369..."
claim-243,"in its 5th assessment report in 2013, the ipcc...","[in, its, assessment, report, in, the, ipcc, e...",28,"([in, its, assessment, report, in, the, ipcc, ...","[0.90074253, -0.52594185, -0.10720387, -0.8110..."
claim-2302,"since the mid 1970s, global temperatures have ...","[since, the, mid, global, temperatures, have, ...",14,"([since, the, mid, global, temperatures, have,...","[0.22075339, -0.14185071, 0.20300464, -0.30081..."
claim-502,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru...",36,"([but, abnormal, temperature, spikes, in, febr...","[0.57896537, 0.056216724, 0.09394025, 0.100006..."


In [28]:
def get_top10_rank(inferred_vector):
    similarity_vector = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    return similarity_vector[0:5]

df_claims['sims'] = df_claims['inferred'].apply(lambda x: get_top10_rank(x))
df_claims

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_claims['sims'] = df_claims['inferred'].apply(lambda x: get_top10_rank(x))


Unnamed: 0,text,tokens,length,tagged,inferred,sims
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"([not, only, is, there, no, scientific, eviden...","[-0.5255818, 0.3639852, 0.792798, -0.28505632,...","[(claim-1937, 0.6857763528823853), (evidence-2..."
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"([el, niño, drove, record, highs, in, global, ...","[0.37021023, -0.118777975, 0.11101906, -0.0199...","[(claim-126, 0.8286309242248535), (evidence-11..."
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"([in, pdo, switched, to, a, cool, phase], [cla...","[0.17473347, -0.37882805, 0.48910916, -0.17860...","[(claim-2510, 0.8240987062454224), (claim-2511..."
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"([weather, channel, john, coleman, provided, e...","[-0.31965876, 0.44649655, -0.21100004, -0.8012...","[(claim-2021, 0.7865524291992188), (evidence-1..."
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"([january, capped, a, month, period, of, globa...","[0.14436851, 1.6839336, 0.17605224, -0.0226998...","[(claim-2449, 0.9232470989227295), (evidence-1..."
...,...,...,...,...,...,...
claim-1504,climate scientists say that aspects of the cas...,"[climate, scientists, say, that, aspects, of, ...",20,"([climate, scientists, say, that, aspects, of,...","[-0.4632979, 0.24924023, 0.5984457, -0.5952369...","[(claim-1504, 0.8438446521759033), (claim-95, ..."
claim-243,"in its 5th assessment report in 2013, the ipcc...","[in, its, assessment, report, in, the, ipcc, e...",28,"([in, its, assessment, report, in, the, ipcc, ...","[0.90074253, -0.52594185, -0.10720387, -0.8110...","[(claim-243, 0.748138427734375), (evidence-359..."
claim-2302,"since the mid 1970s, global temperatures have ...","[since, the, mid, global, temperatures, have, ...",14,"([since, the, mid, global, temperatures, have,...","[0.22075339, -0.14185071, 0.20300464, -0.30081...","[(evidence-1138094, 0.7415162324905396), (evid..."
claim-502,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru...",36,"([but, abnormal, temperature, spikes, in, febr...","[0.57896537, 0.056216724, 0.09394025, 0.100006...","[(claim-502, 0.9076632857322693), (evidence-10..."


In [29]:
# Save the DataFrame to a Pickle file
df_claims.to_pickle('df_claims_Sims.pkl')

In [30]:
# Restore the DataFrame from the Pickle file
df_claims = pd.read_pickle('df_claims_Sims.pkl')
df_claims

Unnamed: 0,text,tokens,length,tagged,inferred,sims
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"([not, only, is, there, no, scientific, eviden...","[-0.5255818, 0.3639852, 0.792798, -0.28505632,...","[(claim-1937, 0.6857763528823853), (evidence-2..."
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"([el, niño, drove, record, highs, in, global, ...","[0.37021023, -0.118777975, 0.11101906, -0.0199...","[(claim-126, 0.8286309242248535), (evidence-11..."
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"([in, pdo, switched, to, a, cool, phase], [cla...","[0.17473347, -0.37882805, 0.48910916, -0.17860...","[(claim-2510, 0.8240987062454224), (claim-2511..."
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"([weather, channel, john, coleman, provided, e...","[-0.31965876, 0.44649655, -0.21100004, -0.8012...","[(claim-2021, 0.7865524291992188), (evidence-1..."
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"([january, capped, a, month, period, of, globa...","[0.14436851, 1.6839336, 0.17605224, -0.0226998...","[(claim-2449, 0.9232470989227295), (evidence-1..."
...,...,...,...,...,...,...
claim-1504,climate scientists say that aspects of the cas...,"[climate, scientists, say, that, aspects, of, ...",20,"([climate, scientists, say, that, aspects, of,...","[-0.4632979, 0.24924023, 0.5984457, -0.5952369...","[(claim-1504, 0.8438446521759033), (claim-95, ..."
claim-243,"in its 5th assessment report in 2013, the ipcc...","[in, its, assessment, report, in, the, ipcc, e...",28,"([in, its, assessment, report, in, the, ipcc, ...","[0.90074253, -0.52594185, -0.10720387, -0.8110...","[(claim-243, 0.748138427734375), (evidence-359..."
claim-2302,"since the mid 1970s, global temperatures have ...","[since, the, mid, global, temperatures, have, ...",14,"([since, the, mid, global, temperatures, have,...","[0.22075339, -0.14185071, 0.20300464, -0.30081...","[(evidence-1138094, 0.7415162324905396), (evid..."
claim-502,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru...",36,"([but, abnormal, temperature, spikes, in, febr...","[0.57896537, 0.056216724, 0.09394025, 0.100006...","[(claim-502, 0.9076632857322693), (evidence-10..."


In [43]:
total = len(df_claims)
print(total)
correct = 0
score = 0
for index, row in df_claims.iterrows():
    if index == row['sims'][0][0]: 
        score += row['sims'][0][1]
        correct += 1
        for evidence in claims[index]['evidences']:
            if evidence in row['sims']:
                print(row['sims'])
                print(claims[index]['evidences'])

avg = score/correct
print(correct, avg)

1228
1002 0.8461731970310211
