## Libraries

pip install gensim

pip install nltk

pip install pandas

In [1]:
import json
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import collections


## Loading Data

In [2]:
# Read the training claims. The encoding is pinned to UTF-8 because the claim
# texts contain non-ASCII characters (e.g. "niño") and open() would otherwise
# fall back to the platform's locale encoding.
with open('../data/train-claims.json', 'r', encoding='utf-8') as f:
    claims = json.load(f)

In [3]:
# Normalise each claim's 'claim_text' to lower case (the claim dicts are updated in place)
for claim_id in claims:
    claim_info = claims[claim_id]
    claim_info['claim_text'] = claim_info['claim_text'].lower()

In [4]:
# Read the evidence passages. The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's locale encoding.
with open('../data/evidence.json', 'r', encoding='utf-8') as f:
    evidences = json.load(f)


In [5]:
# Lower-case every evidence passage, keeping the same ids as keys
evidences = {evidence_id: passage.lower() for evidence_id, passage in evidences.items()}

### Prepare the corpus

In [6]:
# Collect every text (claims first, then evidence passages) into one mapping of
# document id -> whitespace-stripped text. The loop variables are deliberately
# not called `id`, so the builtin id() is not shadowed for the rest of the notebook.
corpus = {}
for claim_id, claim in claims.items():
    corpus[claim_id] = claim['claim_text'].strip()  # Add claim text

for evidence_id, evidence in evidences.items():
    corpus[evidence_id] = evidence.strip()  # Add evidence text

In [7]:
def tokenize_text(df):
    """Add 'tokens' and 'length' columns to ``df`` and return it.

    Args:
        df: DataFrame with a 'text' column of strings. It is modified in place.

    Returns:
        The same DataFrame object, where 'tokens' holds the purely alphabetic
        tokens produced by ``word_tokenize`` for each text and 'length' holds
        the number of tokens kept.
    """
    def _alpha_tokens(text):
        # Keep only tokens made up entirely of alphabetic characters
        return list(filter(str.isalpha, word_tokenize(text)))

    df['tokens'] = df['text'].map(_alpha_tokens)
    df['length'] = df['tokens'].map(len)
    return df

In [8]:
# Build a one-column ('text') DataFrame whose index holds the document ids
df = pd.DataFrame(list(corpus.values()), index=list(corpus.keys()), columns=['text'])

In [9]:
# tokenize_text adds the 'tokens' and 'length' columns to df and returns it
df = tokenize_text(df)
df

Unnamed: 0,text,tokens,length
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17
...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...",9
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, volda]",5
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...",9
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...",46


In [10]:
# Row-wise helper used to turn each DataFrame row into a tagged document
def process_row(row, index):
    """Wrap one row's tokens in a TaggedDocument labelled with ``index``.

    Args:
        row: Row-like object exposing a 'tokens' entry (the word list).
        index: Label stored as the document's single tag.

    Returns:
        TaggedDocument built from ``row['tokens']`` with ``tags=[index]``.
    """
    words = row['tokens']
    return TaggedDocument(words, tags=[index])

# Build one TaggedDocument per row; row.name is the row's index label, so each
# document is tagged with its claim/evidence id.
df['tagged'] = df.apply(lambda row: process_row(row, row.name), axis=1)
df

Unnamed: 0,text,tokens,length,tagged
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"([not, only, is, there, no, scientific, eviden..."
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"([el, niño, drove, record, highs, in, global, ..."
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"([in, pdo, switched, to, a, cool, phase], [cla..."
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"([weather, channel, john, coleman, provided, e..."
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"([january, capped, a, month, period, of, globa..."
...,...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...",9,"([also, on, the, property, is, a, contributing..."
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, volda]",5,"([class, fn, org, fyrde, volda], [evidence-120..."
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...",9,"([dragon, storm, game, a, game, and, collectib..."
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...",46,"([it, states, that, the, zeriuani, which, is, ..."


In [11]:
# Training corpus: the tagged documents as a NumPy array, in DataFrame row order
train_corpus = df['tagged'].to_numpy()

## Train Model
The training steps below follow the gensim Doc2Vec tutorial: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [12]:
# Create the (still untrained) Doc2Vec model: 50-dimensional vectors, min_count=2, 40 epochs
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [13]:
# Build the model's vocabulary from the tagged training documents
model.build_vocab(train_corpus)

In [14]:
# Train on the same corpus, reusing the document count and epoch count stored on the model
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [15]:
# Persist the trained model under the relative path "Doc2Vec.model"
model.save("Doc2Vec.model")

## Assessing the model

In [30]:
# For every training document, infer a vector and keep its five most similar
# trained documents, keyed by the document's position in train_corpus.
inferred_vectors = {}

for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    # Only five neighbours are kept, so request five instead of ranking the
    # whole corpus (topn=len(model.dv)) and slicing afterwards.
    sims = model.dv.most_similar([inferred_vector], topn=5)
    inferred_vectors[doc_id] = sims

KeyboardInterrupt: 

In [None]:
def get_top10_rank(doc_id, topn=5):
    """Return the trained documents most similar to one training document.

    Despite the function's name, the five best matches are returned by
    default (the behaviour callers already rely on); pass ``topn`` to change it.

    Args:
        doc_id: Integer position of the document in ``train_corpus``.
        topn: Number of neighbours to return. Defaults to 5.

    Returns:
        List of ``(tag, similarity)`` pairs as produced by
        ``model.dv.most_similar``.
    """
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    # Request only the neighbours that are returned rather than ranking every
    # trained vector (topn=len(model.dv)) and slicing the result.
    return model.dv.most_similar([inferred_vector], topn=topn)

In [None]:
# Call info() so the column/dtype summary is printed; the bare attribute
# `df.info` only shows the bound method's repr.
df.info()

In [None]:
# Save DataFrame as a Pickle file
# NOTE(review): the file name says Word2Vec although this notebook trains a
# Doc2Vec model - consider renaming it together with the read_pickle call below.
df.to_pickle('dfWord2Vec.pkl')

In [None]:
# Restore the DataFrame from the Pickle file
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# this notebook (or another trusted source) wrote.
df_restored = pd.read_pickle('dfWord2Vec.pkl')

# Verify the restored DataFrame
print(df_restored)

In [16]:
# For every document, store its most similar trained documents in 'infered'.
# get_top10_rank expects the document's integer position in train_corpus (which
# has one entry per row of df, in row order), not the raw text, so positions are
# passed here. The two earlier assignments to 'infered' were immediately
# overwritten and have been dropped.
# NOTE: every call compares the document against all trained vectors, so this
# cell is expensive on the full corpus.
df['infered'] = pd.Series(
    [get_top10_rank(position) for position in range(len(df))],
    index=df.index,
)
df.info()

In [18]:
# Infer one vector per training document, keyed by the document's position in
# train_corpus. `sims` is only initialised here; the similarity lookup stays disabled.
inferred_vectors = {}
sims = {}

for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    inferred_vectors[doc_id] = inferred_vector
    #sim = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    #sims[doc_id] = sim
    

In [None]:
# Iterate over (position, vector) pairs; looping over the dict itself yields
# only its integer keys, which cannot be unpacked into two names.
for key, values in inferred_vectors.items():
    print(key, values)

In [None]:
# Self-similarity check: for each training document, find the rank at which its
# own tag appears among the trained vectors most similar to its inferred vector.
ranks = []
second_ranks = []

# NOTE: a full ranking (topn=len(model.dv)) is needed to locate each document's
# own rank, which makes this loop expensive on the full corpus.
for doc_id, tagged_doc in enumerate(train_corpus):
    sims = model.dv.most_similar([inferred_vectors[doc_id]], topn=len(model.dv))
    # Documents are tagged with their claim/evidence id, so the rank is looked
    # up by tag rather than by integer position.
    own_tag = tagged_doc.tags[0]
    rank = [tag for tag, _similarity in sims].index(own_tag)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [None]:
# Count how many documents landed at each rank value collected in `ranks`
counter = collections.Counter(ranks)
print(counter)

In [None]:
# Show the document at position `doc_id` (left over from the last loop above)
# next to its most, second-most, median and least similar trained documents.
# The full ranking is computed here so the cell does not depend on whatever
# `sims` currently holds.
sims = model.dv.most_similar([inferred_vectors[doc_id]], topn=len(model.dv))
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # sims holds (tag, similarity) pairs and the tags are the DataFrame's index
    # labels, so the words are looked up in df rather than by position in train_corpus.
    similar_tag = sims[index][0]
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(df.at[similar_tag, 'tokens'])))