In [48]:
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from src.preprocessing.prepro import TextPrep
import json
import pandas as pd

In [4]:
# Load the preprocessed Wikipedia articles (a JSON list of dicts).
# The encoding is pinned to UTF-8 because the article text contains non-ASCII
# characters (e.g. Greek letters); without it, open() falls back to the
# platform's locale encoding, which can fail or garble text on non-UTF-8 systems.
with open("data/data_preprocessed.json", 'r', encoding="utf-8") as j:
    data = json.load(j)

# Peek at the first record to check its fields.
data[0]


{'title': 'Pandemic',
 'text': 'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 (S

In [13]:
# Collapse each article's token list into one space-delimited string,
# producing one string per article in the same order as `data`
data_text = list(
    map(lambda article: " ".join(article["tokenized_text"]), data)
)

In [12]:
# Learn the vocabulary and IDF weights from the corpus, then encode that same
# corpus as a TF-IDF document-term matrix (one row per entry of data_text).
# Note: despite its name, `model` holds the transformed matrix, not an estimator.
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit(data_text)
model = vectorizer.transform(data_text)

[[0.         0.         0.         ... 0.         0.11171086 0.11171086]
 [0.04587622 0.03728526 0.03728526 ... 0.         0.         0.        ]
 [0.04427782 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.0251485  0.         0.         ... 0.         0.         0.        ]]


In [66]:
# Export the feature names known to the fitted vectorizer as a JSON document
# of the form {"features": [...]}
features = vectorizer.get_feature_names_out()
features_json = dict(features=features.tolist())

with open('data/features.json', 'w') as features_file:
    features_file.write(json.dumps(features_json))

In [31]:
# Densify the sparse TF-IDF matrix so each row can be stored as a plain list
tfidf_tokens = model.todense()

# Convert all rows to nested Python lists once, then attach row i to article i
dense_rows = tfidf_tokens.tolist()
for wikipedia_text, dense_row in zip(data, dense_rows):
    wikipedia_text['tfidf_vec'] = dense_row

data

[{'title': 'Pandemic',
  'text': 'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 

In [34]:
# Sanity check: print the TF-IDF vector length of the first two articles
# (one value per line) so they can be compared by eye
print(*(len(data[doc_index]["tfidf_vec"]) for doc_index in (0, 1)), sep="\n")

1447
1447


In [53]:
# Run the free-text query through the project's TextPrep helper
# (lowercasing, then get_lemmas -- presumably lemmatisation; see
# src.preprocessing.prepro) and rebuild one string from its final tokens.
query = "Find a text about covid. that explain the mortality rate"
text_prep = TextPrep(query)
text_prep.lowercasing()
text_prep.get_lemmas()
final_text_query = str.join(" ", text_prep.final_tokens)

# Encode the query with the already-fitted vectorizer (transform only, no refit)
# and flatten the single-row result into a plain list of floats
result = vectorizer.transform([final_text_query])
tfidf_query = result.toarray()[0].tolist()

# Length of the query vector
len(tfidf_query)

1447

In [61]:
# Cosine similarity between every article's TF-IDF vector and the query vector.
# All articles are scored in a single batched cosine_similarity call, which
# returns an (n_articles, 1) array, instead of one sklearn call per article.
if data:
    doc_vectors = [wikipedia["tfidf_vec"] for wikipedia in data]
    scores = cosine_similarity(doc_vectors, [tfidf_query])[:, 0].tolist()
else:
    # cosine_similarity rejects empty input; with no articles there is nothing to score
    scores = []

# Store each score on its record as a plain Python float (not a NumPy scalar)
for wikipedia, score in zip(data, scores):
    wikipedia["cosine_similarity"] = float(score)

# Summary table; note that the "text" column holds the article titles
text = [wikipedia["title"] for wikipedia in data]
cosine = [wikipedia["cosine_similarity"] for wikipedia in data]

df = pd.DataFrame({"text": text, "cosine": cosine})
df


Unnamed: 0,text,cosine
0,Pandemic,0.034571
1,Epidemiology of HIV/AIDS,0.041698
2,Antonine Plague,0.0
3,Basic reproduction number,0.025662
4,Bills of mortality,0.06501
5,Cholera,0.054144
6,COVID-19 pandemic,0.101659
7,Crimson Contagion,0.036908
8,Disease X,0.036907
9,Event 201,0.0


In [62]:
# Rank the articles from most to least similar to the query
df.sort_values(by="cosine", ascending=False)

Unnamed: 0,text,cosine
13,Pandemic Severity Assessment Framework,0.216381
19,Spanish flu,0.167053
6,COVID-19 pandemic,0.101659
4,Bills of mortality,0.06501
5,Cholera,0.054144
18,Science diplomacy and pandemics,0.052347
1,Epidemiology of HIV/AIDS,0.041698
7,Crimson Contagion,0.036908
8,Disease X,0.036907
17,1929–1930 psittacosis pandemic,0.035021


In [63]:
# Persist the enriched article records to disk as JSON.
# NOTE(review): if the similarity cell ran first, each record in `data` also
# carries the query-specific "cosine_similarity" field -- confirm that storing
# it alongside the TF-IDF vectors is intended.
with open('data/data_tfidf.json', mode='w') as out_file:
    json.dump(data, fp=out_file)