In [3]:
import numpy as np
import pandas as pd
from collections import Counter
import re

from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px

Príprava dát

In [4]:
data_90 = pd.read_csv("Data/1970_90_keywords.csv", sep = '\t' ,on_bad_lines='skip') #vycistene, zlematizovane data z rokov 1970_90 (aj z klucovymi slovami)
data_22 = pd.read_csv("Data/2007_22_keywords.csv", sep = '\t' ,on_bad_lines='skip')#vycistene, zlematizovane data z rokov 2007_22 (aj z klucovymi slovami)
data_historical = pd.read_csv("Data/data_1930_1990.csv", sep = '\t' ,header = None, names = ["id","year","title"], on_bad_lines='skip') #nevycistene data z rokov 1930-, nezlematizovane (cistia sa v kode nizsie)

In [5]:
stop_words_df = pd.read_csv('data/combined_stopwords.txt', header=None, names=["sw"]) #subor stopwords, kde sa nachadzaju "nase" stopwords spolu s anglickymi
stop_words = stop_words_df["sw"].tolist()

In [6]:
data_22["lemma_without_stopwords"] = data_22["lemma_without_stopwords"].astype(str)
data_22["tfidf_keywords"] = data_22["tfidf_keywords"].astype(str)

In [7]:
data_90["lemma_without_stopwords"] = data_22["lemma_without_stopwords"].astype(str)
data_90["tfidf_keywords"] = data_22["tfidf_keywords"].astype(str)

In [8]:
def get_words_by_frequency(df, stop_words,column):
    '''
    Parameters:
        df: a dataframe containing column 'column'
        stopwords: a set of words to skip
    Returns:
        a dataframe with word and its frequency in the data
    '''
    titles = df[column]
    titles = np.array(pd.Series(titles).dropna())
    vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1,1))
    X = vectorizer.fit_transform(titles)

    vocabulary = vectorizer.get_feature_names_out()
    occs = np.array(X.sum(axis=0))[0,:]
    ind  = np.argsort(occs)
    top_n_words = pd.DataFrame([{'word': vocabulary[a], 'freq': occs[a]} for a in ind])
    return top_n_words

In [9]:
def get_word_freqs(data,column, word=None):
    '''
    Parameters:
        data: a dataframe of original loaded csv data
        word: word to filter out the data
    Returns:
        a dataframe with word and its frequency, normalized frequency
    '''
    filtered_data = data.copy()
    if word is not None:
        words = '|'.join(word)
        filtered_data = data[data[column].str.contains(words, regex=True)]
    data_freqs = filtered_data.groupby('year').apply(get_words_by_frequency, stop_words,column).reset_index().drop(columns=['level_1'])
    count_per_year = filtered_data.groupby('year').size().to_dict()
    data_freqs['year_count'] = data_freqs['year'].apply(lambda x: count_per_year[x])
    data_freqs['norm_freq'] = data_freqs['freq'] / data_freqs['year_count'] * 100
    return data_freqs


In [10]:
def word_trend(data: pd.DataFrame, words: list):
    '''
    Parameters:
        data: a dataframe of csv data (with normalized frequency)
        words: sequence of words to filter out the data
    Returns:
        a dataframe with word and its frequency, normalized frequency
    '''

    df = data.query(f"`word` in {words}")
    fig = px.line(df, x="year", y="norm_freq", color='word')
    fig.show()

In [11]:
Counter(" ".join(data_22["lemma_without_stopwords"]).split()).most_common(100) #100 najcastejsich slov vyskytujucich sa v dataframe data_22 v stlpci lemma_without_stopwords

[('cell', 122608),
 ('cancer', 83374),
 ('base', 66462),
 ('2', 52097),
 ('induce', 49802),
 ('1', 48865),
 ('s', 44672),
 ('protein', 42658),
 ('associate', 40224),
 ('gene', 37587),
 ('therapy', 35998),
 ('child', 34700),
 ('care', 34402),
 ('outcome', 33414),
 ('3', 26825),
 ('mouse', 26555),
 ('acid', 26340),
 ('adult', 26278),
 ('receptor', 26152),
 ('infection', 25783),
 ('drug', 24451),
 ('rat', 24446),
 ('relate', 23862),
 ('change', 23771),
 ('chronic', 23627),
 ('tumor', 23299),
 ('disorder', 22206),
 ('surgery', 22075),
 ('year', 21785),
 ('follow', 21772),
 ('target', 21685),
 ('level', 21651),
 ('carcinoma', 21280),
 ('signal', 21013),
 ('19', 20878),
 ('molecular', 20837),
 ('injury', 20486),
 ('brain', 20406),
 ('covid', 20304),
 ('stress', 20113),
 ('breast', 20025),
 ('improve', 19897),
 ('growth', 19495),
 ('lung', 19152),
 ('woman', 19005),
 ('virus', 18900),
 ('mediate', 18809),
 ('c', 18689),
 ('synthesis', 18651),
 ('enhance', 18131),
 ('blood', 17986),
 ('mechani

In [12]:
Counter(" ".join(data_22["tfidf_keywords"]).split()).most_common(100) #100 najcastejsich slov vyskytujucich sa v dataframe data_22 v stlpci tfidf_keywords

[('cell', 17980),
 ('cancer', 16270),
 ('care', 9029),
 ('child', 7719),
 ('protein', 7667),
 ('nan', 7520),
 ('therapy', 7478),
 ('base', 7294),
 ('gene', 7080),
 ('drug', 6193),
 ('breast', 6092),
 ('disorder', 5980),
 ('19', 5821),
 ('pain', 5594),
 ('infection', 5485),
 ('literature', 5484),
 ('surgery', 5471),
 ('associate', 5311),
 ('tumor', 5225),
 ('lung', 5187),
 ('brain', 5173),
 ('therapeutic', 5087),
 ('outcome', 5070),
 ('target', 5026),
 ('genetic', 4998),
 ('induce', 4988),
 ('liver', 4987),
 ('kidney', 4944),
 ('chronic', 4924),
 ('molecular', 4780),
 ('medicine', 4775),
 ('mechanism', 4758),
 ('cardiovascular', 4749),
 ('life', 4733),
 ('synthesis', 4699),
 ('pediatric', 4658),
 ('pulmonary', 4585),
 ('receptor', 4556),
 ('perspective', 4552),
 ('heart', 4505),
 ('hepatitis', 4495),
 ('regulation', 4447),
 ('challenge', 4413),
 ('diabete', 4386),
 ('failure', 4370),
 ('pregnancy', 4365),
 ('mutation', 4347),
 ('renal', 4311),
 ('practice', 4304),
 ('coronary', 4274),
 

**Choroby**

Porovnanie rôznych chorôb

In [13]:
#nazvy roznych chôrob
diseases = ["chickenpox","hiv","influenza","mononucleosis","aids", "asthma","malaria","measles","migraine",
            "tetanus","rabies","covid"]

In [14]:
diseases_occurence = data_22.lemma_without_stopwords.str.findall("|".join(diseases)).explode().value_counts() #ako casto sa jednotlive choroby vyskytuju v nazvoch clankov 2007_2022
diseases_occurence_dict_lemma = diseases_occurence.to_dict()
print("obdobie 2007-2020: ",diseases_occurence_dict_lemma)
#for word in diseases:
#    print(word, ": ", diseases_occurence_dict_lemma.get(word, "0"))

diseases_occurence = data_90.lemma_without_stopwords.str.findall("|".join(diseases)).explode().value_counts() #ako casto sa jednotlive choroby vyskytuju v nazvoch clankov 1970-90
diseases_occurence_dict_lemma = diseases_occurence.to_dict()
print("obdobie 1970-1990: ",diseases_occurence_dict_lemma)

obdobie 2007-2020:  {'covid': 20446, 'hiv': 14232, 'influenza': 5286, 'asthma': 4966, 'malaria': 3176, 'migraine': 1428, 'aids': 403, 'tetanus': 203, 'measles': 73, 'mononucleosis': 55, 'chickenpox': 29, 'rabies': 6}
obdobie 1970-1990:  {'hiv': 10778, 'influenza': 4060, 'asthma': 3550, 'malaria': 2371, 'migraine': 976, 'aids': 345, 'tetanus': 157, 'measles': 48, 'mononucleosis': 39, 'chickenpox': 23, 'covid': 4, 'rabies': 2}


In [15]:
data_freqs = get_word_freqs(data_90,"lemma_without_stopwords")  #frekvencia vyskytu jednotlivych chorob za roky 1970-90
word_trend(data_freqs, diseases)

In [16]:
data_freqs = get_word_freqs(data_22,"lemma_without_stopwords")   #frekvencia vyskytu jednotlivych chorob za roky 2007-2022
word_trend(data_freqs, diseases)

HIV

In [17]:
hiv_table_90 = data_90[data_90["lemma_without_stopwords"].str.contains('hiv')] #dataframe 1970-90 obsahujuci vsetky riadky kde sa v stlpci lemma_without_stopwords nachadza slovo "hiv"
hiv_table = data_22[data_22["lemma_without_stopwords"].str.contains('hiv')] #dataframe 2007_22 obsahujuci vsetky riadky kde sa v stlpci lemma_without_stopwords nachadza slovo "hiv"

In [18]:
hiv_table_k = data_22[data_22["tfidf_keywords"].str.contains('hiv')]
hiv_table_k_90 = data_90[data_90["tfidf_keywords"].str.contains('hiv')]

In [19]:
print(hiv_table_90.shape)
print(hiv_table_k_90.shape)
print(hiv_table.shape)
print(hiv_table_k.shape)

(9900, 7)
(3241, 7)
(13060, 8)
(4459, 8)


In [20]:
Counter(" ".join(hiv_table["lemma_without_stopwords"]).split()).most_common(100) #100 najčastejsich slov v clankoch kde sa vyskytuje slovo hiv 2007_2022

[('hiv', 13733),
 ('1', 2947),
 ('infect', 1761),
 ('infection', 1730),
 ('man', 1098),
 ('antiretroviral', 1045),
 ('therapy', 987),
 ('cell', 944),
 ('live', 861),
 ('woman', 793),
 ('associate', 778),
 ('care', 735),
 ('aid', 714),
 ('drug', 693),
 ('positive', 670),
 ('sex', 646),
 ('people', 644),
 ('prevention', 643),
 ('child', 560),
 ('base', 553),
 ('testing', 552),
 ('t', 494),
 ('africa', 474),
 ('virus', 473),
 ('prevalence', 462),
 ('adult', 434),
 ('south', 429),
 ('cohort', 410),
 ('c', 404),
 ('inhibitor', 401),
 ('viral', 398),
 ('transmission', 397),
 ('individual', 383),
 ('hepatitis', 372),
 ('resistance', 365),
 ('sexual', 363),
 ('cd4', 342),
 ('intervention', 338),
 ('2', 334),
 ('test', 334),
 ('s', 330),
 ('tuberculosis', 328),
 ('vaccine', 310),
 ('anti', 307),
 ('b', 305),
 ('protein', 285),
 ('relate', 284),
 ('service', 274),
 ('outcome', 269),
 ('immune', 256),
 ('african', 255),
 ('exposure', 253),
 ('hcv', 253),
 ('china', 251),
 ('prophylaxis', 243),
 (

In [21]:
Counter(" ".join(hiv_table_90["lemma_without_stopwords"]).split()).most_common(100) #100 najčastejsich slov v clankoch kde sa vyskytuje slovo hiv 1970-90

[('hiv', 10444),
 ('1', 2411),
 ('infect', 1509),
 ('infection', 1353),
 ('antiretroviral', 810),
 ('man', 785),
 ('therapy', 754),
 ('cell', 738),
 ('aid', 579),
 ('woman', 569),
 ('drug', 563),
 ('associate', 553),
 ('positive', 529),
 ('care', 492),
 ('prevention', 492),
 ('sex', 478),
 ('child', 456),
 ('base', 403),
 ('t', 388),
 ('virus', 384),
 ('live', 380),
 ('testing', 375),
 ('prevalence', 369),
 ('africa', 339),
 ('transmission', 331),
 ('inhibitor', 329),
 ('c', 326),
 ('south', 310),
 ('individual', 295),
 ('viral', 295),
 ('hepatitis', 291),
 ('resistance', 289),
 ('adult', 284),
 ('people', 280),
 ('cd4', 276),
 ('cohort', 275),
 ('tuberculosis', 260),
 ('2', 257),
 ('sexual', 256),
 ('test', 254),
 ('vaccine', 252),
 ('anti', 251),
 ('b', 251),
 ('s', 242),
 ('protein', 240),
 ('intervention', 233),
 ('hcv', 220),
 ('relate', 213),
 ('aids', 205),
 ('increase', 204),
 ('african', 200),
 ('service', 190),
 ('immune', 187),
 ('behavior', 187),
 ('antibody', 184),
 ('outc

In [22]:
#pre roky 2007-2022
print(hiv_table.groupby(["year"]).size()/data_22.groupby(["year"]).size()) #klesa pocet clankov o HIV -> HIV_pocet_clankov_za_jednotlive_roky /celkovy_pocet_clankov_za_jednotlive_roky
print()
print(hiv_table.groupby(["year"]).size()) #celkovy pocet hiv clankov -> je ich sice viac ale mame aj viac clankov z roku 2022 ako 2007 (narastaju tiez postupne)

year
2007    0.009919
2008    0.009699
2009    0.009544
2010    0.009820
2011    0.009228
2012    0.009788
2013    0.009122
2014    0.008263
2015    0.008438
2016    0.007886
2017    0.008138
2018    0.007597
2019    0.006902
2020    0.006822
2021    0.006031
2022    0.005723
dtype: float64

year
2007    655
2008    679
2009    699
2010    759
2011    763
2012    874
2013    863
2014    819
2015    866
2016    831
2017    871
2018    850
2019    814
2020    921
2021    871
2022    925
dtype: int64


Antibiotiká

In [23]:
#vycistenie dat 1930-90

data_historical["title"] = data_historical["title"].astype(str)
data_historical['year'] = data_historical['year'].astype('Int64')

#preprocessing text
def clean_text(text):
  text = re.sub(r'[^A-Za-z0-9 ]+',' ', text) #odstranime vsetko okrem pismen, cislic a medzier
  return text
data_historical["title_cleaned"] = data_historical["title"].str.lower() #zmenime text na male pismena
data_historical['title_cleaned'] = data_historical['title_cleaned'].apply(clean_text) #vycistime

In [24]:
data_freqs = get_word_freqs(data_historical,"title_cleaned") #frekvencia vsetkych slov 
word_trend(data_freqs, ['antibiotics']) #frekvencia pre slovo antibiotics 

In [25]:
#transform - short format - pre wordtrend 
antibiotics_data_freqs = get_word_freqs(data_historical,"title_cleaned", 'antibiotics')
short_format = antibiotics_data_freqs.pivot(index='word', columns='year', values='norm_freq')
short_format['count'] = short_format.count(axis='columns')
short_format.sort_values(['count', 1970], ascending=False, inplace=True)
short_format.head()

year,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,...,1982,1983,1984,1985,1986,1987,1988,1989,1990,count
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acid,0.750044,0.820257,1.238281,1.034543,1.231044,1.130042,1.65575,1.234568,1.293837,1.342519,...,1.911068,1.785531,1.796801,1.745771,1.703515,1.663755,1.621449,1.613188,1.671674,61
rat,0.261643,0.303138,0.283036,0.333158,0.338983,0.417246,0.363901,0.608071,0.476677,0.600601,...,3.251684,3.224663,3.283511,3.235654,3.279157,3.197743,3.114942,3.108994,3.158437,61
blood,1.971045,1.71184,2.299664,1.858671,1.855486,1.68637,2.201601,2.063755,1.940756,2.20809,...,1.973665,2.003936,1.958604,1.965104,1.811595,1.888466,1.850589,1.854473,1.79148,61
cells,0.226757,0.249643,0.265346,0.4559,0.392507,0.399861,0.400291,0.331675,0.476677,0.3003,...,3.332538,3.357975,3.426257,3.447413,3.387237,3.47067,3.492331,3.645093,3.664875,61
children,0.732601,0.85592,0.849107,0.578643,0.856378,0.643255,0.891557,1.3267,0.902281,0.565271,...,1.665151,1.6391,1.703254,1.656523,1.570301,1.647088,1.559444,1.60395,1.638588,61


In [26]:
#graf rôznych antibiotik v priebehu rokov
antib = ['salvarsan','sulfonamide','salicylate','sulfone','penicillin','bacitracin','polypeptide','amphenicol','tracycline','aminoglycoside''polymyxin','nitrofuran','mupirocin','monobactam','enniatin','amoxicillin','doxycycline','cephalexin','ciprofloxacin','clindamycin','metronidazole','azithromycin','sulfamethoxazole','trimethopri']
word_trend(data_freqs, antib)

Vakcíny

In [27]:
#transform - short format - pre wordtrend 
vaccine_data_freqs = get_word_freqs(data_historical,"title_cleaned", ['vaccine', 'vaccines'])
short_format = vaccine_data_freqs.pivot(index='word', columns='year', values='norm_freq')
short_format['count'] = short_format.count(axis='columns')
short_format.sort_values([1970, 'count'], ascending=False, inplace=True)

In [28]:
#graf rôznych vakcín v priebehu rokov
word_trend(vaccine_data_freqs.query("`year` > 1947"), ['rubella', 'measles', 'influenza', 'pertussis', 'smallpox', 'chollera', 'rabies', 'aids'])

In [29]:
#graf rôznych chorôb v priebehu rokov
word_trend(data_freqs.query("`year` > 1947"),['rubella', 'measles', 'influenza', 'pertussis', 'smallpox', 'chollera', 'rabies', 'polio', 'aids'])# influenza 1957-1958 -> Asian flu