In [31]:
import json
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the datasets in.
# The encoding is given explicitly: without it, open() falls back to the
# platform locale (e.g. cp1252 on Windows), which can fail on or garble
# non-ASCII characters in the JSON files.
with open('dataset/evidence.json', encoding='utf-8') as json_file:
    evidence = json.load(json_file)

with open('dataset/dev-claims.json', encoding='utf-8') as json_file:
    dev_claim = json.load(json_file)

with open('dataset/train-claims.json', encoding='utf-8') as json_file:
    train = json.load(json_file)

with open('dataset/test-claims-unlabelled.json', encoding='utf-8') as json_file:
    test = json.load(json_file)

In [19]:
# English stop words; tokens whose lowercase form is in this set are dropped.
stop_words = set(stopwords.words('english'))

# Evidence ids referenced by the training claims. Repeats are kept, so an
# evidence listed under several claims contributes once per listing.
claim_evi = [evi_id for entry in train.values() for evi_id in entry['evidences']]

# Alphabetic, non-stop-word tokens of every referenced evidence text.
# Tokens keep their original casing.
evi_text = [
    tok
    for evi_id in claim_evi
    for tok in word_tokenize(evidence[evi_id])
    if tok.isalpha() and tok.lower() not in stop_words
]

# Token -> number of occurrences (case-sensitive), in first-seen order.
fre_dict = {}
for tok in evi_text:
    fre_dict[tok] = fre_dict.get(tok, 0) + 1

In [27]:
# Pick the most frequent words; `top` is a hyper-parameter that needs to be tuned.
top = 15
# Sorting the keys by their count, descending; the sort is stable, so words
# with equal counts stay in fre_dict's insertion order.
ranked_words = sorted(fre_dict, key=fre_dict.get, reverse=True)
top_tier = ranked_words[:top]
print(top_tier)

['global', 'warming', 'climate', 'temperature', 'greenhouse', 'ice', 'change', 'sea', 'years', 'carbon', 'temperatures', 'emissions', 'surface', 'gases', 'Earth']


In [28]:
# Pre-filter the evidence: keep only passages containing at least one of the
# top-frequency words, and record how many distinct top words each contains.
related_key = {}

for key in evidence.keys():
    # A set gives constant-time membership tests; the original list was
    # scanned once per top word for every evidence passage.
    words = {
        w for w in word_tokenize(evidence[key])
        if w.isalpha() and w.lower() not in stop_words
    }
    # Case-sensitive match, consistent with how top_tier was built.
    counter = sum(1 for word in top_tier if word in words)
    if counter != 0:
        related_key[key] = counter
    

#print(len(related_key.keys()))

46681


In [41]:
# Ids and texts of the pre-filtered evidence, kept in matching order so that
# row i of the evidence matrix corresponds to full_id[i].
full_id = list(related_key)
evi = [evidence[evi_id] for evi_id in full_id]

# Corpus used to fit the vocabulary: the filtered evidence texts followed by
# the training claim texts.
text = evi + [entry['claim_text'] for entry in train.values()]

# Fit TF-IDF on the combined corpus, then vectorise only the evidence texts.
tfidf = TfidfVectorizer()
full_evi = tfidf.fit(text).transform(evi)
#print(full_evi[7])

  (0, 52828)	0.05761776610002657
  (0, 49697)	0.21091246956592405
  (0, 49153)	0.23155809095866786
  (0, 47304)	0.1611285093627065
  (0, 38028)	0.07165028743637147
  (0, 35323)	0.23186320904761423
  (0, 33217)	0.2147851040127824
  (0, 27889)	0.10793540406729296
  (0, 26759)	0.07756307574220407
  (0, 22194)	0.22139292400277844
  (0, 20035)	0.20997642342199915
  (0, 10858)	0.46715928292403086
  (0, 10855)	0.434730663924897
  (0, 7049)	0.49958790192316466


In [42]:
# Retrieve evidence for each test claim by cosine similarity against the
# filtered evidence matrix. `ave` is a hyper-parameter: the number of
# evidence ids kept per claim.

test_claim = {}
ave = 3

for key in test.keys():
    claim = test[key]['claim_text']
    v_claim = tfidf.transform([claim])
    similarity = cosine_similarity(v_claim, full_evi)[0]

    # Stable argsort on the negated scores ranks the highest similarity first
    # and keeps tied scores in full_id order. This gives the same ranking as
    # building an {id: score} dict and sorting it in Python for every claim,
    # without the per-claim dict and Python-level sort.
    top_idx = np.argsort(-similarity, kind='stable')[:ave]
    test_claim[key] = [full_id[i] for i in top_idx]

print(test_claim)

{'claim-2967': ['evidence-19067', 'evidence-664218', 'evidence-1123443'], 'claim-979': ['evidence-268048', 'evidence-421870', 'evidence-452632'], 'claim-1609': ['evidence-171951', 'evidence-790637', 'evidence-766313'], 'claim-1020': ['evidence-757578', 'evidence-1084855', 'evidence-334451'], 'claim-2599': ['evidence-349241', 'evidence-915109', 'evidence-332154'], 'claim-2110': ['evidence-72690', 'evidence-516386', 'evidence-506409'], 'claim-1135': ['evidence-1183711', 'evidence-49223', 'evidence-167981'], 'claim-712': ['evidence-756019', 'evidence-309811', 'evidence-758556'], 'claim-1307': ['evidence-1183711', 'evidence-127311', 'evidence-1086865'], 'claim-148': ['evidence-37274', 'evidence-1091739', 'evidence-669368'], 'claim-903': ['evidence-295594', 'evidence-75555', 'evidence-90562'], 'claim-2942': ['evidence-746701', 'evidence-672028', 'evidence-1146102'], 'claim-1001': ['evidence-496586', 'evidence-277435', 'evidence-368618'], 'claim-1034': ['evidence-493616', 'evidence-345021', 

In [None]:
# Assemble the output records: one entry per test claim, carrying its text,
# an empty label list, and the retrieved evidence ids.
out = {
    claim_id: {
        "claim_text": test[claim_id]["claim_text"],
        'claim_label': [],
        'evidences': evi_ids,
    }
    for claim_id, evi_ids in test_claim.items()
}

# Write the records out as JSON.
file_path = 'evi_f.json'
with open(file_path, 'w') as json_file:
    json.dump(out, json_file)