# Zadanie 1: Reprezentacja tekstu przy użyciu metody Bag-of-Words (BoW)

In [5]:
!pip install scikit-learn nltk numpy gensim



In [6]:
from sklearn.datasets import fetch_20newsgroups
corpus = fetch_20newsgroups(subset='test')['data'][:1000]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk

corpus = [doc.replace('\n', '') for doc in corpus]

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)



In [8]:

word_counts = X.toarray().sum(axis=0)

vocab_counts = sorted(zip(vectorizer.get_feature_names_out(), word_counts), key=lambda item: item[1], reverse=True)

vocab_counts = list(map(lambda row: (row[0], int(row[1])), vocab_counts)) # numpy init to int

display(vocab_counts)


[('edu', 1502),
 ('subject', 989),
 ('com', 949),
 ('writes', 741),
 ('article', 612),
 ('don', 498),
 ('like', 480),
 ('just', 479),
 ('posting', 461),
 ('host', 438),
 ('know', 420),
 ('university', 389),
 ('people', 380),
 ('organization', 377),
 ('think', 371),
 ('god', 334),
 ('time', 328),
 ('does', 305),
 ('new', 290),
 ('good', 275),
 ('25', 269),
 ('ve', 258),
 ('use', 248),
 ('ca', 243),
 ('cs', 241),
 ('way', 235),
 ('did', 232),
 ('say', 225),
 ('right', 216),
 ('lines', 213),
 ('10', 201),
 ('make', 198),
 ('jehovah', 195),
 ('really', 194),
 ('want', 189),
 ('said', 185),
 ('used', 185),
 ('problem', 183),
 ('mail', 169),
 ('believe', 165),
 ('game', 165),
 ('need', 163),
 ('question', 161),
 ('lord', 159),
 ('david', 155),
 ('work', 155),
 ('computer', 149),
 ('world', 149),
 ('government', 148),
 ('going', 147),
 ('elohim', 146),
 ('ll', 136),
 ('sure', 136),
 ('state', 135),
 ('thanks', 135),
 ('year', 135),
 ('help', 132),
 ('uk', 130),
 ('let', 129),
 ('information',

# Zadanie 2: Obliczanie TF-IDF dla zbioru dokumentów

**Polecenie:**
- Wczytaj dokumenty z wbudowanego datasetu.
- Przetwórz tekst, wykonując tokenizację oraz usuwanie stop-słów.
- Oblicz macierz TF-IDF dla całego zbioru dokumentów.
- Dla wybranego dokumentu wypisz słowa o najwyższych wartościach TF-IDF.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

corpus = fetch_20newsgroups(subset='train')["data"][:200]
# remove eol
corpus = [c.replace('\n', ' ') for c in corpus]

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(corpus)

In [11]:
X.toarray()[:3]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
word_index_map = sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])


In [13]:
index_word_map = {v: k for k, v in word_index_map}
display(index_word_map)

{0: '00',
 1: '000',
 2: '0005895485',
 3: '002118',
 4: '002222',
 5: '004253agrgb',
 6: '007',
 7: '009',
 8: '01',
 9: '011112',
 10: '0112',
 11: '011805',
 12: '0164',
 13: '01752',
 14: '0192',
 15: '02',
 16: '020646',
 17: '02194',
 18: '024103',
 19: '0245',
 20: '02p',
 21: '02r4e',
 22: '02tm_',
 23: '03',
 24: '0303',
 25: '0320',
 26: '038',
 27: '04',
 28: '0400',
 29: '040493161915',
 30: '045221',
 31: '045u',
 32: '05',
 33: '051701',
 34: '053',
 35: '0545',
 36: '055341',
 37: '0593',
 38: '05l',
 39: '05lma',
 40: '06',
 41: '060493161931',
 42: '062328',
 43: '06520',
 44: '067',
 45: '069',
 46: '07',
 47: '07059',
 48: '0729',
 49: '0748',
 50: '08',
 51: '081052',
 52: '0826',
 53: '08267',
 54: '083057',
 55: '0831',
 56: '084o',
 57: '08502',
 58: '087',
 59: '0883',
 60: '08v',
 61: '09',
 62: '091',
 63: '092246dlmqc',
 64: '0b',
 65: '0d',
 66: '0d4',
 67: '0df',
 68: '0dfsx',
 69: '0dfvij',
 70: '0dfyl',
 71: '0dgq',
 72: '0dgw83',
 73: '0dum',
 74: '0dy',

In [14]:
import numpy as np

for idx, doc_vec in enumerate(X.toarray()):
    max_val= doc_vec.max()
    top_index = np.where(doc_vec == max_val, True, False)
    keyword = None
    for word, is_top in zip(index_word_map.values(), top_index):
        if is_top:
            keyword = word
            break

    print(f"For doc {idx} keyword is '{keyword}'")

For doc 0 keyword is 'car'
For doc 1 keyword is 'clock'
For doc 2 keyword is '180'
For doc 3 keyword is 'harris'
For doc 4 keyword is 'errors'
For doc 5 keyword is 'weapons'
For doc 6 keyword is 'bmdelane'
For doc 7 keyword is 'scsi'
For doc 8 keyword is 'icons'
For doc 9 keyword is 'board'
For doc 10 keyword is 'irwin'
For doc 11 keyword is 'parent'
For doc 12 keyword is 'cerkoney'
For doc 13 keyword is 'ssf'
For doc 14 keyword is 'purchased'
For doc 15 keyword is 'mathew'
For doc 16 keyword is 'tiff'
For doc 17 keyword is 'insurance'
For doc 18 keyword is 'amplifier'
For doc 19 keyword is 'ncd'
For doc 20 keyword is 'keith'
For doc 21 keyword is 'captain'
For doc 22 keyword is 'catalog'
For doc 23 keyword is 'font'
For doc 24 keyword is 'scsi'
For doc 25 keyword is 'virginia'
For doc 26 keyword is 'plants'
For doc 27 keyword is 'arizona'
For doc 28 keyword is 'god'
For doc 29 keyword is 'centerline'
For doc 30 keyword is 'stereo'
For doc 31 keyword is 'acne'
For doc 32 keyword is 'ex

# Zadanie 3: Implementacja Word Embeddings

**Polecenie:**
- Wczytaj i przetwórz dokumenty z wbudowanego datasetu (tokenizacja, usuwanie stop-słów, normalizacja).
- Wytrenuj model Word Embeddings (np. Word2Vec lub GloVe) na przetworzonym korpusie.
- Przetestuj model, wyszukując najbliższe wektory (sąsiadów) dla wybranego słowa.

In [15]:
from sklearn.datasets import fetch_20newsgroups
from gensim.utils import simple_preprocess

corpus = fetch_20newsgroups(subset='all', categories=['sci.med'])["data"][:200]
corpus = [simple_preprocess(doc) for doc in corpus]

In [16]:
# import Word2Vec model
from gensim.models import Word2Vec

model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4, epochs=100)

In [17]:
word = "cancer"

In [18]:
# vector for the word
model.wv[word]

array([ 1.9205768 , -0.069993  , -2.108913  ,  0.1396455 , -1.0654848 ,
       -0.07268405,  1.9035159 , -0.41705218,  0.38489303,  0.43820995,
        0.07569025,  1.529003  , -2.5023801 ,  2.2566319 ,  3.2160048 ,
        4.937019  ,  1.1917007 ,  3.3615854 ,  0.24990812,  0.49545202,
       -1.9321374 ,  1.4542739 , -1.3641123 , -1.3376168 ,  0.76125234,
       -0.2577113 ,  0.7491279 ,  2.0030568 , -1.5214034 , -0.4884492 ,
       -0.6792328 ,  1.5470508 ,  3.95377   ,  1.5927383 , -2.2451243 ,
       -1.0021521 , -0.5338598 ,  1.9192872 , -1.8333334 , -2.0930777 ,
        0.4598758 , -1.9598219 ,  1.0115949 , -0.55708444,  2.6592615 ,
        0.32076088, -2.238661  ,  0.14578862, -0.39372692,  4.0943255 ,
       -0.16015801, -2.8472273 , -2.915114  ,  1.5258559 , -1.5831922 ,
        0.07129248,  1.9631593 ,  0.2523419 ,  1.6303543 ,  1.0789127 ,
        0.87135094, -0.18055528, -1.9894831 ,  3.0182958 , -3.9345186 ,
        0.31577274, -0.1652365 , -2.0475578 ,  1.4761208 , -1.00

In [19]:
names = [r[0] for r in model.wv.most_similar(word)]
vectors = [model.wv[name] for name in names]

In [20]:
display(names)

['cells',
 'radiation',
 'nicotine',
 'lung',
 'penile',
 'leukemia',
 'ovarian',
 'risk',
 'photos',
 'breast']

# Zadanie 4: Generowanie i analiza Bigramów

**Polecenie:**
- Wczytaj dokumenty z wbudowanego datasetu.
- Wykonaj tokenizację tekstu oraz usuń stop-słowa.
- Wygeneruj bigramy (pary kolejnych słów) z przetworzonego tekstu.
- Wyświetl najczęściej występujące bigramy wraz z ich liczebnością.

In [21]:
import nltk
from nltk.corpus import reuters
from gensim.utils import simple_preprocess

nltk.download("reuters")

raw_text = reuters.raw()
tokens = simple_preprocess(raw_text)

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mstol\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [22]:
import nltk
from nltk.corpus import reuters
from gensim.utils import simple_preprocess

nltk.download("reuters")

raw_text = reuters.raw()
tokens = simple_preprocess(raw_text)

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mstol\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [23]:
bigrams = list(nltk.bigrams(tokens))

In [24]:
bigrams_count = {b: 0 for b in bigrams}

for b in bigrams:
    bigrams_count[b] += 1
    

In [25]:
display(sorted(bigrams_count.items(), key=lambda item: item[1], reverse=True))

[(('in', 'the'), 7093),
 (('of', 'the'), 6913),
 (('said', 'the'), 5355),
 (('mln', 'dlrs'), 4472),
 (('said', 'it'), 4367),
 (('vs', 'mln'), 3945),
 (('mln', 'vs'), 3921),
 (('cts', 'vs'), 3311),
 (('the', 'company'), 3131),
 (('for', 'the'), 2808),
 (('to', 'the'), 2521),
 (('he', 'said'), 2506),
 (('cts', 'net'), 2196),
 (('on', 'the'), 1975),
 (('vs', 'cts'), 1933),
 (('vs', 'loss'), 1783),
 (('dlrs', 'in'), 1761),
 (('billion', 'dlrs'), 1748),
 (('of', 'mln'), 1655),
 (('to', 'be'), 1637),
 (('will', 'be'), 1630),
 (('pct', 'of'), 1615),
 (('it', 'said'), 1599),
 (('and', 'the'), 1593),
 (('net', 'vs'), 1579),
 (('by', 'the'), 1559),
 (('revs', 'mln'), 1548),
 (('with', 'the'), 1469),
 (('shr', 'cts'), 1469),
 (('inc', 'lt'), 1384),
 (('that', 'the'), 1376),
 (('pct', 'in'), 1360),
 (('last', 'year'), 1357),
 (('from', 'the'), 1354),
 (('at', 'the'), 1329),
 (('company', 'said'), 1261),
 (('dlrs', 'vs'), 1237),
 (('to', 'mln'), 1144),
 (('of', 'its'), 1141),
 (('corp', 'lt'), 1116

# Zadanie 5: Analiza Trigramów w tekście

**Polecenie:**
- Wczytaj dokumenty z wbudowanego datasetu.
- Przetwórz tekst, wykonując tokenizację oraz usuwanie stop-słów.
- Wygeneruj trigramy (sekwencje trzech kolejnych słów) z dokumentów.
- Wypisz najczęściej występujące trigramy w analizowanym zbiorze.

In [26]:
import nltk
from nltk.corpus import reuters
from gensim.utils import simple_preprocess

nltk.download("reuters")

raw_text = reuters.raw()
tokens = simple_preprocess(raw_text)

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mstol\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [27]:
trigrams = list(nltk.trigrams(tokens))

In [28]:
trigrams_count = {t: 0 for t in trigrams}

for t in trigrams:
    trigrams_count[t] += 1

In [29]:
display(sorted(trigrams_count.items(), key=lambda item: item[1], reverse=True))

[(('mln', 'vs', 'mln'), 3402),
 (('cts', 'vs', 'cts'), 1779),
 (('revs', 'mln', 'vs'), 1515),
 (('shr', 'cts', 'vs'), 1446),
 (('the', 'company', 'said'), 1181),
 (('vs', 'cts', 'net'), 1169),
 (('cts', 'net', 'vs'), 1082),
 (('of', 'mln', 'dlrs'), 1062),
 (('net', 'vs', 'revs'), 887),
 (('mln', 'dlrs', 'in'), 827),
 (('vs', 'mln', 'note'), 742),
 (('vs', 'revs', 'mln'), 709),
 (('cts', 'vs', 'loss'), 665),
 (('net', 'shr', 'cts'), 657),
 (('net', 'loss', 'vs'), 645),
 (('said', 'it', 'has'), 643),
 (('dlrs', 'vs', 'dlrs'), 635),
 (('vs', 'mln', 'avg'), 632),
 (('mln', 'avg', 'shrs'), 621),
 (('pct', 'of', 'the'), 611),
 (('avg', 'shrs', 'vs'), 604),
 (('the', 'united', 'states'), 603),
 (('inc', 'said', 'it'), 587),
 (('net', 'mln', 'vs'), 580),
 (('qtr', 'net', 'shr'), 574),
 (('he', 'said', 'the'), 539),
 (('shr', 'dlrs', 'vs'), 533),
 (('sales', 'mln', 'vs'), 517),
 (('cts', 'net', 'loss'), 517),
 (('vs', 'loss', 'revs'), 505),
 (('the', 'end', 'of'), 502),
 (('billion', 'vs', 'bil