# 1. n-Gram language models

In [None]:
import matplotlib.pyplot as plt
plt.gca().clear()

In [13]:
def fix_unicode(text: str) -> str:
    return text.replace(u"\u00E2", "'")

import re
from bs4 import BeautifulSoup
import requests

url = "https://www.oreilly.com/ideas/what-is-data-science"
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

content = soup.find("div", "main-post-radar-content")   # find article-body div
regex = r"[\w']+|[\.]"                       # matches a word or a period

document = []

for paragraph in content("p"):
    words = re.findall(regex, fix_unicode(paragraph.text))
    document.extend(words)
document

["We'",
 've',
 'all',
 'heard',
 'it',
 'according',
 'to',
 'Hal',
 'Varian',
 'statistics',
 'is',
 'the',
 'next',
 'sexy',
 'job',
 '.',
 'Five',
 'years',
 'ago',
 'in',
 'What',
 'is',
 'Web',
 '2',
 '.',
 '0',
 'Tim',
 "O'",
 'Reilly',
 'said',
 'that',
 "'",
 'data',
 'is',
 'the',
 'next',
 'Intel',
 'Inside',
 '.',
 "'",
 'But',
 'what',
 'does',
 'that',
 'statement',
 'mean',
 'Why',
 'do',
 'we',
 'suddenly',
 'care',
 'about',
 'statistics',
 'and',
 'about',
 'data',
 'In',
 'this',
 'post',
 'I',
 'examine',
 'the',
 'many',
 'sides',
 'of',
 'data',
 'science',
 "'",
 'the',
 'technologies',
 'the',
 'companies',
 'and',
 'the',
 'unique',
 'skill',
 'sets',
 '.',
 'Join',
 'the',
 "O'Reilly",
 'online',
 'learning',
 'platform',
 '.',
 'Get',
 'a',
 'free',
 'trial',
 'today',
 'and',
 'find',
 'answers',
 'on',
 'the',
 'fly',
 'or',
 'master',
 'something',
 'new',
 'and',
 'useful',
 '.',
 'The',
 'web',
 'is',
 'full',
 'of',
 "'",
 'data',
 'driven',
 'apps',
 '

`\w` is any alphanumeric character (equivalent to [a-zA-Z0-9_])

`+` matches the previous token between one and unlimited times, as many times as possible, giving back as needed (greedy)

`|` is or, especially with the `[]` on both sides. It defines them as a subpattern.

`\.` is period (`.` is any character)

See https://regex101.com/, https://regexr.com/ and https://regexone.com/lesson/introduction_abcs

In [16]:
' '.join(document)

"We' ve all heard it according to Hal Varian statistics is the next sexy job . Five years ago in What is Web 2 . 0 Tim O' Reilly said that ' data is the next Intel Inside . ' But what does that statement mean Why do we suddenly care about statistics and about data In this post I examine the many sides of data science ' the technologies the companies and the unique skill sets . Join the O'Reilly online learning platform . Get a free trial today and find answers on the fly or master something new and useful . The web is full of ' data driven apps . ' Almost any e commerce application is a data driven application . There' s a database behind a web front end and middleware that talks to a number of other databases and data services credit card processing companies banks and so on . But merely using data isn' t really what we mean by ' data science . ' A data application acquires its value from the data itself and creates more data as a result . It' s not just an application with data it' s

In [20]:
from collections import defaultdict
import random

transitions = defaultdict(list)
for prev, current in zip(document, document[1:]):
    transitions[prev].append(current)

def generate_using_bigrams() -> str:
    current = "."   # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]    # bigrams (current, _)
        current = random.choice(next_word_candidates)  # choose one at random
        result.append(current)                         # append it to results
        if current == ".": return " ".join(result)     # if "." we're done

In [21]:
generate_using_bigrams()

"It is significant or the number of a programming task but different datasets using Google popularized the next week' s an application lets developers and tune the low 1 The web applications to say nothing of successful retail business ."

- that's it. Rerun generate to produce more nonsense.
- Note that transitions is actually a dict of all possible consecutive elements:

In [18]:
transitions

defaultdict(list,
            {"We'": ['ve', 're', 've', 're', 've'],
             've': ['all',
              'ever',
              'taken',
              'made',
              'seen',
              'ever',
              'parsed',
              'all',
              'heard',
              'analyzed',
              'seen',
              'collected',
              'all',
              'gotten',
              'just'],
             'all': ['heard',
              'the',
              'of',
              'of',
              'carefully',
              'the',
              'in',
              'equipment',
              "you'",
              'heard',
              'data',
              'of',
              'heard',
              'locked',
              'at',
              'trying',
              'aspects',
              'tapped',
              'The'],
             'heard': ['it', 'a', "'", 'the'],
             'it': ['according',
              'to',
              'does',
              'to',
    

In [32]:
len(transitions)

1518

In [34]:
# to make trigrams, the transitions need to be longer
# the starts also need to be tracked seperately but I'll leave that for later

trigram_transitions = defaultdict(list)
starts = []
for prev, current, next in zip(document, document[1:], document[2:]):
    if prev == '.':
        starts.append(current)
    trigram_transitions[(prev, current)].append(next)

In [35]:
len(trigram_transitions)

4145

The len() explodes.

In [48]:
list(trigram_transitions)[0:5] # some python searching led me to this approach. You cannot access items based on an index in a dict...

[("We'", 've'),
 ('ve', 'all'),
 ('all', 'heard'),
 ('heard', 'it'),
 ('it', 'according')]

## Applied on a webpage

- I messed around with both the scraping (using 'inspect' and selecting a 'div class' with a loose name) and then the creation of a list of words and `.`
- then the transitions is a defaultdict. 

In [54]:
content = soup.find('div', 'body__inner-container')
regex = r"[\w']+|[\.]"                       # matches a word or a period

document = []

for paragraph in content("p"):
    words = re.findall(regex, fix_unicode(paragraph.text))
    document.extend(words)
' '.join(document)



In [200]:
url = "https://pitchfork.com/reviews/albums/clark-sus-dog/"
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

paragraphs = soup.find_all('p')
content = [p.get_text() for p in paragraphs]
content

['7.7',
 'By Harry Tafoya',
 'Genre:',
 'Electronic',
 'Label:',
 'Throttle',
 'Reviewed:',
 'May 31, 2023',
 'The architecture of Clark’s production has never sounded airier or more fluid than it does on his latest record,\xa0Sus Dog, where he foregrounds one instrument he has largely left in the margins: his voice. Executive produced by\xa0Thom Yorke,\xa0Sus Dog is warm and immediately gratifying, offering the musician’s fragile falsetto as a graceful counterpoint to his intricate and sometimes breakneck production.',
 'Historically, Clark’s experiments with voice have yielded mixed results. By turns angelic and menacing, the vocal accents on 2017’s\xa0Death Peak are crucial to that record’s apocalyptic appeal, while the garbled, ultra-processed growls and chanted raps on 2009’s\xa0Totems Flare have aged poorly. Here, rather than slapping his voice on top of the mix, Clark has learned to accommodate it. Working with a more limited palette of alternately boxy and lightspeed synths int

In [70]:
paragraphs

[<p class="BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-iATjmx iUEiRd hJnYqh crvVFm">7.7</p>,
 <p class="BylineWrapper-jWHrLH hOvThu byline bylines__byline" data-testid="BylineWrapper" itemprop="author" itemtype="http://schema.org/Person"><span class="BylineNamesWrapper-jbHncj fuDQVo" itemprop="name"><span class="BylineName-kwmrLn cYaBaU byline__name" data-testid="BylineName"><span class="BaseWrap-sc-gjQpdd BaseText-ewhhUZ BylinePreamble-iJolpQ iUEiRd jSeRBj gnILss byline__preamble">By </span><a class="BaseWrap-sc-gjQpdd BaseText-ewhhUZ BaseLink-eNWuiM BylineLink-gEnFiw iUEiRd caJArb GVLVC eErqIx byline__name-link button" href="/staff/harry-tafoya/">Harry Tafoya</a></span></span></p>,
 <p class="BaseWrap-sc-gjQpdd BaseText-ewhhUZ InfoSliceKey-gHIvng iUEiRd YAAgl bWJknB">Genre:</p>,
 <p class="BaseWrap-sc-gjQpdd BaseText-ewhhUZ InfoSliceValue-tfmqg iUEiRd iTpcbq fkSlPp">Electronic</p>,
 <p class="BaseWrap-sc-gjQpdd BaseText-ewhhUZ InfoSliceKey-gHIvng iUEiRd YAAgl bWJknB">Label:</p>,
 <p cl

p.get_text() is a beautifulsoup method!!

In [201]:
content_divs = soup.select('div[class*="body__inner-container"]')  # to be more selective 


content = [p.get_text() for paragraphs in content_divs for p in paragraphs('p')] # to avoid glueing paragraphs together
# [div.get_text() for div in content_divs]

content

 'The architecture of Clark’s production has never sounded airier or more fluid than it does on his latest record,\xa0Sus Dog, where he foregrounds one instrument he has largely left in the margins: his voice. Executive produced by\xa0Thom Yorke,\xa0Sus Dog is warm and immediately gratifying, offering the musician’s fragile falsetto as a graceful counterpoint to his intricate and sometimes breakneck production.',
 'Historically, Clark’s experiments with voice have yielded mixed results. By turns angelic and menacing, the vocal accents on 2017’s\xa0Death Peak are crucial to that record’s apocalyptic appeal, while the garbled, ultra-processed growls and chanted raps on 2009’s\xa0Totems Flare have aged poorly. Here, rather than slapping his voice on top of the mix, Clark has learned to accommodate it. Working with a more limited palette of alternately boxy and lightspeed synths interwoven with acoustic instruments,\xa0Sus Dog is an ornate but fleet-footed synth-pop album brimming with som

In [87]:
document = []

for paragraph in content:
    words = re.findall(regex, paragraph)
    document.extend(words)     # note that append creates a list per paragraph. Extend stays in the same list.
    
document

['Like',
 'the',
 'shifting',
 'atmosphere',
 'of',
 'a',
 'distant',
 'planet',
 'Chris',
 'Clark',
 's',
 'music',
 'is',
 'subject',
 'to',
 'violent',
 'extremes',
 '.',
 'With',
 'little',
 'a',
 'reassuring',
 'beat',
 'might',
 'furiously',
 'morph',
 'into',
 'an',
 'instrumental',
 'storm',
 'breaking',
 'just',
 'as',
 'suddenly',
 'into',
 'a',
 'diamond',
 'rain',
 'of',
 'twinkling',
 'synths',
 '.',
 'The',
 'British',
 'musician',
 's',
 'delight',
 'in',
 'wrong',
 'footing',
 'expectations',
 'has',
 'been',
 'one',
 'of',
 'the',
 'few',
 'constants',
 'in',
 'a',
 'career',
 'that',
 'has',
 'swerved',
 'wildly',
 'from',
 'tricky',
 'IDM',
 'to',
 'off',
 'kilter',
 'hip',
 'hop',
 'beats',
 'and',
 'from',
 'blistering',
 'techno',
 'to',
 'hushed',
 'minimalism',
 '.',
 'At',
 'his',
 'propulsive',
 'best',
 'Clark',
 'dazzles',
 'with',
 'both',
 'the',
 'density',
 'and',
 'dynamism',
 'of',
 'his',
 'music',
 '.',
 'But',
 'in',
 'contrast',
 'to',
 'the',
 'ex

In [88]:
transitions = defaultdict(list)

for prev, current in zip(document, document[1:]):
    transitions[prev].append(current)

transitions

defaultdict(list,
            {'Like': ['the'],
             'the': ['shifting',
              'few',
              'density',
              'explosive',
              'drama',
              'margins',
              'musician',
              'vocal',
              'garbled',
              'mix',
              'loveliest',
              'chaos',
              'track',
              'red',
              'exception',
              'tutelage',
              'similarity',
              'two',
              'lower',
              'sheer',
              'bridge',
              'shapes',
              'content',
              'songwriting',
              'mist',
              'record',
              'title',
              'song',
              'leap',
              'unknowability',
              'virtuous',
              'producer',
              'shortcoming',
              'fire',
              'strain',
              'closing',
              'zigzagging',
              'first',
            

In [90]:
generate_using_bigrams()

'In fact please carry on 2017 s all fuel to violent extremes it is tearing into an octave from The architecture of the virtuous but also of expressing the two floors which itself .'

# bigram in gensim

The book actually does something else than gensim's phrases. It's a Markov chain. **There are no good packages for this.**

A bigram model finds set expressions and puts underscores between them:

    The `Phrases` model in Gensim is a simple and efficient way to handle the transformation of individual words into bigrams (two consecutive words) or even larger n-grams, based on the frequency of word co-occurrence.

    The idea behind `Phrases` is that if two words are often found together in the text, they might be a meaningful phrase and it might be useful to treat them as a single entity. For example, "New York" is a bigram that represents a single entity, and "machine learning" might be a meaningful bigram in a text about data science.

    When you train a `Phrases` model on your text, it counts the frequency of each individual word and each pair of two consecutive words. If the frequency of the pair is significantly higher than would be expected based on the individual frequencies, the pair is considered to be a phrase.

    After training, you can use the `Phrases` model to transform any text into a text where these phrases are joined together with an underscore, effectively treating them as single words.

    For example, if you have the sentence "I live in New York and I work in machine learning", and "New York" and "machine learning" are recognized as phrases, the transformed sentence would be "I live in New_York and I work in machine_learning".

In [149]:
import gensim.downloader as api
from gensim import corpora, models

dataset = api.load("text8")
dataset = [wd for wd in dataset]

dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]

# Build the bigram models
bigram = models.phrases.Phrases(dataset, min_count=3, threshold=10)

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working_class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans_culottes', 'of', 'the', 'french_revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative_way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken_up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived_from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political_philosophy', 'is', 'the', 'belief_that', 'rulers', 'are', 'unnecessary', 'and', 'should_be', 'abolished', 'although', 'there_are', 'differing_interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers_to', 'related', 'social_movements', 'that', 'advocate',

In [150]:
api.info('text8')

{'num_records': 1701,
 'record_format': 'list of str (tokens)',
 'file_size': 33182058,
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py',
 'license': 'not found',
 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.',
 'checksum': '68799af40b6bda07dfa47a32612e5364',
 'file_name': 'text8.gz',
 'read_more': ['http://mattmahoney.net/dc/textdata.html'],
 'parts': 1}

In [170]:
bigram.corpus_word_count

17005207

# Topic Modelling

first: gibbs sampling. 
- this underlies the logic behind splitting up text in topics

In [97]:
from typing import Tuple, Dict, List
import random

def roll_a_die() -> int:
    return random.choice([1, 2, 3, 4, 5, 6])

def direct_sample() -> Tuple[int, int]:
    d1 = roll_a_die()
    d2 = roll_a_die()
    return d1, d1 + d2

def random_y_given_x(x: int) -> int:
    """equally likely to be x + 1, x + 2, ... , x + 6"""
    return x + roll_a_die()

def random_x_given_y(y: int) -> int:
    if y <= 7:
        # if the total is 7 or less, the first die is equally likely to be
        # 1, 2, ..., (total - 1)
        return random.randrange(1, y)
    else:
        # if the total is 7 or more, the first die is equally likely to be
        # (total - 6), (total - 5), ..., 6
        return random.randrange(y - 6, 7)

def gibbs_sample(num_iters: int = 100) -> Tuple[int, int]:
    x, y = 1, 2 # doesn't really matter
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y

def compare_distributions(num_samples: int = 1000) -> Dict[int, List[int]]:
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1
        counts[direct_sample()][1] += 1
    return counts

In [101]:
gibbs_sample()

(2, 3)

In [102]:
compare_distributions()

defaultdict(<function __main__.compare_distributions.<locals>.<lambda>()>,
            {(5, 8): [18, 39],
             (4, 8): [28, 23],
             (6, 9): [31, 29],
             (3, 8): [29, 23],
             (3, 9): [18, 30],
             (4, 5): [39, 28],
             (2, 8): [30, 33],
             (1, 7): [23, 27],
             (2, 4): [35, 34],
             (1, 6): [24, 27],
             (1, 4): [23, 37],
             (6, 11): [28, 30],
             (5, 9): [18, 37],
             (5, 7): [26, 28],
             (4, 9): [22, 27],
             (2, 6): [26, 19],
             (4, 10): [31, 30],
             (2, 5): [18, 31],
             (2, 3): [35, 28],
             (6, 10): [33, 26],
             (1, 5): [32, 21],
             (4, 6): [38, 23],
             (3, 5): [31, 26],
             (6, 7): [21, 34],
             (5, 11): [25, 22],
             (5, 6): [31, 35],
             (6, 8): [26, 26],
             (3, 6): [39, 27],
             (4, 7): [24, 25],
             (6, 12): 

In [108]:
def sample_from(weights: List[float]) -> int:
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total * random.random()      # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                       # return the smallest i such that
        if rnd <= 0: return i          # weights[0] + ... + weights[i] >= rnd

from collections import Counter

# Draw 1000 times and count
draws = Counter(sample_from([0.1, 0.1, 0.8]) for _ in range(1000))
assert 10 < draws[0] < 190   # should be ~10%, this is a really loose test
assert 10 < draws[1] < 190   # should be ~10%, this is a really loose test
assert 650 < draws[2] < 950  # should be ~80%, this is a really loose test
assert draws[0] + draws[1] + draws[2] == 1000

documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [119]:
# choose number of topics and initialise counters
K = 4

# a list of Counters, one for each document
document_topic_counts = [Counter() for _ in documents]

# a list of Counters, one for each topic
topic_word_counts = [Counter() for _ in range(K)]

# a list of numbers, one for each topic
topic_counts = [0 for _ in range(K)]

# a list of numbers, one for each document
document_lengths = [len(document) for document in documents]

distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

D = len(documents)

In [118]:
f"so we have {len(distinct_words)} distinct words and {len(documents)} documents"

'so we have 36 distinct words and 15 documents'

In [None]:
def p_topic_given_document(topic: int, d: int, alpha: float = 0.1) -> float:
    """
    The fraction of words in document _d_
    that are assigned to _topic_ (plus some smoothing)
    """
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word: str, topic: int, beta: float = 0.1) -> float:
    """
    The fraction of words assigned to _topic_
    that equal _word_ (plus some smoothing)
    """
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d: int, word: str, k: int) -> float:
    """
    Given a document and a word in that document,
    return the weight for the kth topic
    """
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d: int, word: str) -> int:
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])

random.seed(0)
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

import tqdm

for iter in tqdm.trange(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [104]:
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0:
            print(k, word, count)

0 Java 3
0 Big Data 3
0 Hadoop 2
0 HBase 1
0 C++ 1
0 Spark 1
0 Storm 1
0 programming languages 1
0 MapReduce 1
0 Cassandra 1
0 deep learning 1
1 HBase 2
1 neural networks 2
1 Postgres 2
1 MongoDB 2
1 machine learning 2
1 Cassandra 1
1 numpy 1
1 decision trees 1
1 deep learning 1
1 databases 1
1 MySQL 1
1 NoSQL 1
1 artificial intelligence 1
1 scipy 1
2 regression 3
2 Python 2
2 R 2
2 libsvm 2
2 scikit-learn 2
2 mathematics 1
2 support vector machines 1
2 Haskell 1
2 Mahout 1
3 statistics 3
3 probability 3
3 Python 2
3 R 2
3 pandas 2
3 statsmodels 2
3 C++ 1
3 artificial intelligence 1
3 theory 1


In [106]:
topic_names = ["Big Data and programming languages",
               "Python and statistics",
               "databases",
               "machine learning"]

for document, topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)
    print()

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
Big Data and programming languages 7

['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
Python and statistics 5

['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Python and statistics 2
databases 2
machine learning 2

['R', 'Python', 'statistics', 'regression', 'probability']
machine learning 3
databases 2

['machine learning', 'regression', 'decision trees', 'libsvm']
databases 2
Python and statistics 2

['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
databases 3
Big Data and programming languages 3

['statistics', 'probability', 'mathematics', 'theory']
machine learning 3
databases 1

['machine learning', 'scikit-learn', 'Mahout', 'neural networks']
databases 2
Python and statistics 2

['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']
Python and statistics 3
Big Data and programming languages 1

['Hadoop', 'Java', 'MapReduce', 'Big Data']
Big D

## Alternatively

In [127]:
# some sample documents
documents = [
    "The sky is blue",
    "Blue and bright day",
    "Beautiful blue birds in the sky",
    "The dog is in the garden",
    "Cats and dogs are great pets",
    "Flowers in the garden are beautiful",
    "I love gardening"
]
documents = [document.split() for document in documents]

In [128]:
documents

[['The', 'sky', 'is', 'blue'],
 ['Blue', 'and', 'bright', 'day'],
 ['Beautiful', 'blue', 'birds', 'in', 'the', 'sky'],
 ['The', 'dog', 'is', 'in', 'the', 'garden'],
 ['Cats', 'and', 'dogs', 'are', 'great', 'pets'],
 ['Flowers', 'in', 'the', 'garden', 'are', 'beautiful'],
 ['I', 'love', 'gardening']]

In [129]:
K = 3

# a list of Counters, one for each topic
topic_word_counts = [Counter() for _ in range(K)]

# a list of numbers, one for each topic
topic_counts = [0 for _ in range(K)]

# a list of numbers, one for each document
document_lengths = [len(document) for document in documents]

distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

D = len(documents)

f"so we have {len(distinct_words)} distinct words and {len(documents)} documents"

'so we have 24 distinct words and 7 documents'

In [130]:
random.seed(0)
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

import tqdm

for iter in tqdm.trange(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1


100%|██████████| 1000/1000 [00:00<00:00, 4641.76it/s]


In [131]:
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0:
            print(k, word, count)

0 and 2
0 dogs 1
0 pets 1
0 I 1
0 gardening 1
0 day 1
1 sky 2
1 the 2
1 blue 1
1 Beautiful 1
1 garden 1
1 birds 1
1 Flowers 1
1 Blue 1
2 in 3
2 are 2
2 is 2
2 The 2
2 the 1
2 garden 1
2 Cats 1
2 bright 1
2 dog 1
2 beautiful 1
2 love 1
2 great 1
2 blue 1


In [133]:
# topic_names = ["Big Data and programming languages",
#                "Python and statistics",
#                "databases",
#                "machine learning"]

for document, topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in topic_counts.most_common():
        if count > 0:
            print(topic, count)
    print()

['The', 'sky', 'is', 'blue']
2 14
1 5

['Blue', 'and', 'bright', 'day']
2 12
1 7
0 4

['Beautiful', 'blue', 'birds', 'in', 'the', 'sky']
2 22
1 15

['The', 'dog', 'is', 'in', 'the', 'garden']
2 25
0 4
1 1

['Cats', 'and', 'dogs', 'are', 'great', 'pets']
2 23
0 11

['Flowers', 'in', 'the', 'garden', 'are', 'beautiful']
2 25
1 15
0 1

['I', 'love', 'gardening']
2 11
0 8



Using Gensim, the package for topic modelling

In [135]:
import gensim
from gensim import corpora

# Let's start with some sample documents
documents = [
    "The sky is blue",
    "Blue and bright day",
    "Beautiful blue birds in the sky",
    "The dog is in the garden",
    "Cats and dogs are great pets",
    "Flowers in the garden are beautiful",
    "I love gardening"
]

# Preprocessing: Tokenize the documents, remove common words as well as words that only appear once
texts = [[word for word in document.lower().split()] for document in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Convert the dictionary into a bag-of-words
corpus = [dictionary.doc2bow(text) for text in texts] # this is a bag-of-words corpus

# Train the LDA model
lda = gensim.models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)

# Print the topics
print(lda.print_topics(num_topics=2, num_words=4))


[(0, '0.084*"and" + 0.078*"are" + 0.076*"cats" + 0.076*"great"'), (1, '0.152*"the" + 0.097*"in" + 0.096*"blue" + 0.069*"beautiful"')]


Each word is accompanied by a number, which is the weight of the word in that topic. 

The weight indicates how important that word is in defining the topic.

In [138]:
print(dictionary)

Dictionary<21 unique tokens: ['blue', 'is', 'sky', 'the', 'and']...>


In [139]:
print(dictionary.token2id)

{'blue': 0, 'is': 1, 'sky': 2, 'the': 3, 'and': 4, 'bright': 5, 'day': 6, 'beautiful': 7, 'birds': 8, 'in': 9, 'dog': 10, 'garden': 11, 'are': 12, 'cats': 13, 'dogs': 14, 'great': 15, 'pets': 16, 'flowers': 17, 'gardening': 18, 'i': 19, 'love': 20}


## on the content from the pitchfork review above

In [37]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [141]:
from gensim.utils import simple_preprocess

# using the content from the pitchfork review above
preprocessed = [simple_preprocess(line, deacc=True) for line in content] 

# Create gensim dictionary form a single tet file
dictionary = corpora.Dictionary(preprocessed)

# Token to Id map
simple_preprocess

{'above': 0,
 'accents': 1,
 'accentuated': 2,
 'accommodate': 3,
 'acoustic': 4,
 'aged': 5,
 'airier': 6,
 'album': 7,
 'also': 8,
 'alternately': 9,
 'amassed': 10,
 'an': 11,
 'and': 12,
 'angelic': 13,
 'another': 14,
 'antagonistic': 15,
 'any': 16,
 'apart': 17,
 'apocalyptic': 18,
 'appeal': 19,
 'arca': 20,
 'architecture': 21,
 'are': 22,
 'as': 23,
 'at': 24,
 'atmosphere': 25,
 'beat': 26,
 'beatific': 27,
 'beats': 28,
 'bed': 29,
 'been': 30,
 'before': 31,
 'best': 32,
 'between': 33,
 'bjork': 34,
 'blaring': 35,
 'blistering': 36,
 'body': 37,
 'both': 38,
 'box': 39,
 'boxy': 40,
 'breaking': 41,
 'breakneck': 42,
 'bridge': 43,
 'brimming': 44,
 'brings': 45,
 'british': 46,
 'bully': 47,
 'but': 48,
 'by': 49,
 'career': 50,
 'changes': 51,
 'chanted': 52,
 'chaos': 53,
 'chris': 54,
 'clark': 55,
 'clutch': 56,
 'coach': 57,
 'colonized': 58,
 'constants': 59,
 'contrast': 60,
 'corroded': 61,
 'counterpoint': 62,
 'crank': 63,
 'crucial': 64,
 'dazzles': 65,
 'dea

In [144]:
mycorpus = [dictionary.doc2bow(doc) for doc in preprocessed]
mycorpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 4),
  (12, 10),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 4),
  (24, 3),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 2),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 3),
  (49, 3),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 9),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 2),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 3),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1

In [145]:
# to make it slightly more readable
[[(dictionary[id], count) for id, count in line] for line in mycorpus]

[[('above', 1),
  ('accents', 1),
  ('accentuated', 1),
  ('accommodate', 1),
  ('acoustic', 1),
  ('aged', 1),
  ('airier', 1),
  ('album', 1),
  ('also', 1),
  ('alternately', 1),
  ('amassed', 1),
  ('an', 4),
  ('and', 10),
  ('angelic', 1),
  ('another', 1),
  ('antagonistic', 1),
  ('any', 1),
  ('apart', 1),
  ('apocalyptic', 1),
  ('appeal', 1),
  ('arca', 1),
  ('architecture', 1),
  ('are', 1),
  ('as', 4),
  ('at', 3),
  ('atmosphere', 1),
  ('beat', 1),
  ('beatific', 1),
  ('beats', 2),
  ('bed', 1),
  ('been', 1),
  ('before', 1),
  ('best', 1),
  ('between', 1),
  ('bjork', 1),
  ('blaring', 1),
  ('blistering', 1),
  ('body', 1),
  ('both', 2),
  ('box', 1),
  ('boxy', 1),
  ('breaking', 1),
  ('breakneck', 1),
  ('bridge', 1),
  ('brimming', 1),
  ('brings', 1),
  ('british', 1),
  ('bully', 1),
  ('but', 3),
  ('by', 3),
  ('career', 1),
  ('changes', 1),
  ('chanted', 1),
  ('chaos', 1),
  ('chris', 1),
  ('clark', 9),
  ('clutch', 1),
  ('coach', 1),
  ('colonized',

To get rid of stop words use `[word for word if not in stop_words]` if it's a list

In [146]:
import nltk
nltk.download('stopwords')  # run once
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\johan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [148]:
len(stop_words)

179

to lemmatize words:

In [216]:
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize Single Word
print(lemmatizer.lemmatize("bats"))
#> bat

print(lemmatizer.lemmatize("are"))
#> are

print(lemmatizer.lemmatize("feet"))
#> foot

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\johan\AppData\Roaming\nltk_data...


bat
are
foot


In [217]:
# Define the sentence to be lemmatized
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']


In [220]:
# Lemmatize list of words, remove stop_words and join
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list if w not in stop_words])
print(lemmatized_output)

The striped bat hanging foot best


In [229]:
preprocessed[0][0]

'like'

In [243]:
len(preprocessed[0])

499

In [265]:
from nltk import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Step 2: Prepare Data (Remove stopwords and lemmatize)
data_processed = []

for i, doc in enumerate(preprocessed[0:2][:20]):
    doc_out = []
    for wd in doc:
        if wd not in stop_words:  # remove stopwords
            lemmatized_word = lemmatizer.lemmatize(wd)  # lemmatize
            if lemmatized_word:
                doc_out = doc_out + [lemmatized_word]
        else:
            continue
    data_processed.append(doc_out)
    
print(data_processed[0][:5]) 

['like', 'shifting', 'atmosphere', 'distant', 'planet']


In [268]:
# Step 3: Create the Inputs of LDA model: Dictionary and Corpus
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]

In [283]:
# Step 4: Train the LDA model
from gensim.models import LdaModel, LdaMulticore

lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=7,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
lda_model.save('lda_model.model')

# See the topics
lda_model.print_topics(-1) # -1 is to rank it from 1 up

[(0,
  '0.018*"clark" + 0.012*"voice" + 0.011*"music" + 0.009*"yorke" + 0.008*"record" + 0.007*"one" + 0.007*"production" + 0.007*"beat" + 0.007*"dog" + 0.007*"sus"'),
 (1,
  '0.003*"alternately" + 0.003*"constant" + 0.003*"may" + 0.003*"ultra" + 0.003*"reference" + 0.003*"pearler" + 0.003*"planet" + 0.003*"bright" + 0.003*"hip" + 0.003*"earn"'),
 (2,
  '0.003*"developed" + 0.003*"vocal" + 0.003*"morph" + 0.003*"acoustic" + 0.003*"situated" + 0.003*"range" + 0.003*"constant" + 0.003*"eyed" + 0.003*"appear" + 0.003*"might"'),
 (3,
  '0.014*"clark" + 0.011*"like" + 0.008*"voice" + 0.007*"sus" + 0.007*"dog" + 0.007*"one" + 0.005*"refrain" + 0.005*"production" + 0.005*"wild" + 0.005*"ladder"'),
 (4,
  '0.003*"virtuous" + 0.003*"emotional" + 0.003*"scorching" + 0.003*"clark" + 0.003*"morph" + 0.003*"two" + 0.003*"striking" + 0.003*"like" + 0.003*"crucial" + 0.003*"loveliest"'),
 (5,
  '0.003*"dog" + 0.003*"beautiful" + 0.003*"recognizing" + 0.003*"pop" + 0.003*"clark" + 0.003*"one" + 0.003*

In [282]:
lda_model[corpus[0]] # to show the phi values

([(0, 0.9974514)],
 [(0, [0]),
  (1, [0]),
  (2, [0]),
  (3, [0]),
  (4, [0]),
  (5, [0]),
  (6, [0]),
  (7, [0]),
  (8, [0]),
  (9, [0]),
  (10, [0]),
  (11, [0]),
  (12, [0]),
  (13, [0]),
  (14, [0]),
  (15, [0]),
  (16, [0]),
  (17, [0]),
  (18, [0]),
  (19, [0]),
  (20, [0]),
  (21, [0]),
  (22, [0]),
  (23, [0]),
  (24, [0]),
  (25, [0]),
  (26, [0]),
  (27, [0]),
  (28, [0]),
  (29, [0]),
  (30, [0]),
  (31, [0]),
  (32, [0]),
  (33, [0]),
  (34, [0]),
  (35, [0]),
  (36, [0]),
  (37, [0]),
  (38, [0]),
  (39, [0]),
  (40, [0]),
  (41, [0]),
  (42, [0]),
  (43, [0]),
  (44, [0]),
  (45, [0]),
  (46, [0]),
  (47, [0]),
  (48, [0]),
  (49, [0]),
  (50, [0]),
  (51, [0]),
  (52, [0]),
  (53, [0]),
  (54, [0]),
  (55, [0]),
  (56, [0]),
  (57, [0]),
  (58, [0]),
  (59, [0]),
  (60, [0]),
  (61, [0]),
  (62, [0]),
  (63, [0]),
  (64, [0]),
  (65, [0]),
  (66, [0]),
  (67, [0]),
  (68, [0]),
  (69, [0]),
  (70, [0]),
  (71, [0]),
  (72, [0]),
  (73, [0]),
  (74, [0]),
  (75, [0]),
  (

not super interesting: Phi value is the probability of the word belonging to that particular topic. And the sum of phi values for a given word adds up to the number of times that word occurred in that document. https://www.machinelearningplus.com/nlp/gensim-tutorial/

## Alternative model:

In [273]:
from gensim.models import LdaModel

lda2 = LdaModel(corpus=corpus,
                id2word=dct,
                random_state=100,
                num_topics=7)

lda2.print_topics(-1) # -1 is to rank it from 1 up

[(0,
  '0.018*"clark" + 0.012*"music" + 0.011*"voice" + 0.009*"one" + 0.008*"yorke" + 0.008*"production" + 0.008*"dog" + 0.007*"sus" + 0.007*"record" + 0.007*"beat"'),
 (1,
  '0.018*"clark" + 0.010*"voice" + 0.009*"music" + 0.007*"sus" + 0.007*"dog" + 0.007*"one" + 0.007*"production" + 0.006*"like" + 0.006*"yorke" + 0.005*"record"'),
 (2,
  '0.013*"clark" + 0.009*"voice" + 0.007*"sus" + 0.007*"like" + 0.006*"record" + 0.006*"one" + 0.006*"music" + 0.006*"might" + 0.006*"production" + 0.005*"dog"'),
 (3,
  '0.021*"clark" + 0.016*"voice" + 0.009*"record" + 0.009*"sus" + 0.008*"music" + 0.008*"yorke" + 0.007*"production" + 0.007*"dog" + 0.006*"one" + 0.006*"like"'),
 (4,
  '0.015*"clark" + 0.009*"like" + 0.007*"voice" + 0.007*"one" + 0.006*"dog" + 0.006*"sus" + 0.006*"music" + 0.005*"production" + 0.005*"two" + 0.005*"might"'),
 (5,
  '0.014*"clark" + 0.010*"like" + 0.010*"dog" + 0.008*"one" + 0.007*"sus" + 0.007*"voice" + 0.006*"production" + 0.005*"music" + 0.005*"title" + 0.005*"might"

Latent Semantic Indexing (LSI)

In [271]:
from gensim.models import LsiModel
import pprint

# Build the LSI Model
lsi_model = LsiModel(corpus=corpus, id2word=dct, num_topics=7, decay=0.5)

# View Topics
pprint.pprint(lsi_model.print_topics(-1))

[(0,
  '0.446*"clark" + 0.276*"voice" + 0.198*"music" + 0.170*"dog" + 0.170*"sus" + '
  '0.170*"one" + 0.162*"record" + 0.149*"production" + 0.143*"like" + '
  '0.141*"yorke"'),
 (1,
  '-0.237*"like" + 0.132*"yorke" + 0.111*"music" + -0.108*"approach" + '
  '-0.108*"dismissive" + -0.108*"refrain" + -0.108*"mournful" + -0.108*"leap" '
  '+ -0.108*"ladder" + -0.108*"songwriting"')]


# Word2vec

Gensim’s Word2Vec implementation lets you train your own word embedding model for a given corpus.

A word embedding model is a model that can provide numerical vectors for a given word. Using the Gensim’s downloader API, you can download pre-built word embedding models like word2vec, fasttext, GloVe and ConceptNet. These are built on large corpuses of commonly occurring text data such as wikipedia, google news etc.


Start [here - the official tutorials](https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py)

In [17]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]

# Split the data into 2 parts. Part 2 will be used later to update the model
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model. Defaults result vector size = 100
model = Word2Vec(data_part1, min_count = 0, workers=cpu_count())

In [20]:
# Get the word vector for given word
model.wv['topic']

array([ 0.9205213 ,  0.27776983, -0.05177611,  0.46860605, -0.8798394 ,
       -1.3324406 , -1.435962  ,  0.32900223,  0.5565165 ,  0.26718137,
        0.11481123,  0.89353824, -1.1467557 , -1.0474638 , -0.99056137,
        1.1820714 , -1.2716994 , -0.51987416,  1.2844187 ,  0.4771293 ,
        0.25608808,  0.9423585 , -0.2810102 ,  0.41437182, -0.43062216,
        0.25769383, -0.09235616, -0.21599683, -0.09595884,  0.10432298,
        1.2329164 , -0.14473377, -0.90032524, -1.2211845 ,  0.0586639 ,
       -0.20559865, -1.729614  , -0.9255956 ,  0.12454402, -0.5193946 ,
       -0.05347501, -0.6232367 , -0.16193298, -0.6031882 ,  0.22658087,
        0.4826208 , -0.48977122,  0.57894325, -0.75426376,  0.9157322 ,
        0.25455776, -0.44881257, -0.17192224, -0.45753956, -0.6101924 ,
       -0.35173488, -0.06816396, -0.3276552 , -1.3871515 ,  1.0855129 ,
        0.78473115,  0.7800539 ,  0.13769488,  0.0279958 , -0.68402493,
       -0.6093991 ,  0.83312124, -0.2895124 , -1.0896566 , -0.04

In [22]:
model.wv.most_similar('topic')

[('interpretation', 0.7315137982368469),
 ('discussion', 0.6999590992927551),
 ('discourse', 0.6952456831932068),
 ('characterization', 0.6894254684448242),
 ('consensus', 0.685240626335144),
 ('debate', 0.6841682195663452),
 ('explanation', 0.6839592456817627),
 ('premise', 0.6786676049232483),
 ('focus', 0.670373797416687),
 ('speculation', 0.6699302196502686)]

In [None]:
# Save and Load Model
model.save('newmodel')
model = Word2Vec.load('newmodel')

In [23]:
# so many tutorials don't work - good to check: version!
import gensim
print(gensim.__version__)

4.3.1


In [286]:
from gensim.summarization import summarize, keywords
from pprint import pprint

summarize(content)

ModuleNotFoundError: No module named 'gensim.summarization'

In [None]:
text = " ".join((line for line in content))

# Summarize the paragraph
pprint(summarize(text, word_count=20))
#> ('the PLA Rocket Force national defense science and technology experts panel, '
#>  'according to a report published by the')

# Important keywords from the paragraph
print(keywords(text))
#> force zhang technology experts pla rocket

# sentiment

a quick example from chatgpt for sentiment analysis

In [290]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

sentence = r"this is shit"
# "NLTK is a great library for natural language processing!"

polarity = sia.polarity_scores(sentence)

print(polarity)

{'neg': 0.643, 'neu': 0.357, 'pos': 0.0, 'compound': -0.5574}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\johan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



The `polarity_scores` method returns a dictionary with the negative, neutral, positive, and compound sentiment scores. The compound score is a single metric that calculates the sum of all the lexicon ratings and normalizes it between -1 (most extreme negative) and +1 (most extreme positive).

Please note that VADER is best used for language used in social media, like short sentences with some slang and abbreviations. It's not as good for longer texts with more formal language.


## back to [the official gensim tutorials](https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#core-concepts-corpus)

In [24]:
# a corpus is like this, a list of "documents":

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

`simple_preprocess` converts a `document` into a list of lowercase tokens, ignoring tokens that are too short or too long.

In [25]:
from gensim.utils import simple_preprocess
docs = []
for doc in text_corpus:
    text = simple_preprocess(doc)
    docs.append(text)
    
docs

[['human',
  'machine',
  'interface',
  'for',
  'lab',
  'abc',
  'computer',
  'applications'],
 ['survey',
  'of',
  'user',
  'opinion',
  'of',
  'computer',
  'system',
  'response',
  'time'],
 ['the', 'eps', 'user', 'interface', 'management', 'system'],
 ['system', 'and', 'human', 'system', 'engineering', 'testing', 'of', 'eps'],
 ['relation',
  'of',
  'user',
  'perceived',
  'response',
  'time',
  'to',
  'error',
  'measurement'],
 ['the', 'generation', 'of', 'random', 'binary', 'unordered', 'trees'],
 ['the', 'intersection', 'graph', 'of', 'paths', 'in', 'trees'],
 ['graph',
  'minors',
  'iv',
  'widths',
  'of',
  'trees',
  'and',
  'well',
  'quasi',
  'ordering'],
 ['graph', 'minors', 'survey']]

**More preprocessing:**
- omit stopwords
- frequency count
- omit words that appear only once

In [26]:
# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
print(processed_corpus)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


We want to associate each word in the corpus with a unique integer ID. We can do this using the gensim.corpora.Dictionary class. This dictionary defines the vocabulary of all words that our processing knows about.

In [27]:
from gensim import corpora

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


the dictionary associates each word in the corpus with a unique integer ID.

In [30]:
import pprint
pprint.pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


## Aside on streaming

To avoid download in RAM: **Corpus Streaming** – One Document at a Time

Gensim only requires that a corpus must be able to return one document vector at a time.

The full power of Gensim comes from the fact that a corpus doesn’t have to be a list, or a NumPy array, or a Pandas dataframe, or whatever. Gensim accepts any object that, when iterated over, successively yields documents.

you can mold the __iter__ function to fit your input format, whatever it is. Walking directories, parsing XML, accessing the network… Just parse your input to retrieve a clean list of tokens in each document, then convert the tokens via a dictionary to their ids and yield the resulting sparse vector inside __iter__.

In [28]:
from smart_open import open  # for transparently opening remote files

class MyCorpus:
    def __iter__(self):
        for line in open('https://radimrehurek.com/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())
            
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!

for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


Similarly, to construct the dictionary without loading all texts into memory:

In [38]:
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/mycorpus.txt'))

# remove stop words and words that appear only once
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
    ]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)

2023-06-02 12:33:36,634 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-06-02 12:33:36,635 : INFO : built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)
2023-06-02 12:33:36,636 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)", 'datetime': '2023-06-02T12:33:36.636066', 'gensim': '4.3.1', 'python': '3.10.9 | packaged by Anaconda, Inc. | (main, Mar  8 2023, 10:42:25) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


## Back to tutorials:

`doc2bow` takes the IDs from the dictionary and transforms any document it is fed based on them. 

1. We can apply a new document to an existing dictionary:

In [32]:
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


Note that the first entry in the tuple corresponds to the ID of the token in the dictionary, the second corresponds to the count of this token. “interaction” did not occur in the original corpus and so it was not included in the vectorization. 

2. We can also apply it to the entire corpus. This converts it to a list of vectors:

In [31]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


## Models

These transform one document representation to another.

A simple example is `tf-idf`:

(short for term frequency–inverse document frequency)

In [40]:
from gensim import models

# initialise and train the model
tfidf = models.TfidfModel(bow_corpus)

2023-06-02 12:35:34,728 : INFO : collecting document frequencies
2023-06-02 12:35:34,731 : INFO : PROGRESS: processing document #0
2023-06-02 12:35:34,732 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9 documents and 12 features (28 matrix non-zeros)', 'datetime': '2023-06-02T12:35:34.732126', 'gensim': '4.3.1', 'python': '3.10.9 | packaged by Anaconda, Inc. | (main, Mar  8 2023, 10:42:25) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'initialize'}


In [44]:
# use the model to transform a new string 
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


The tfidf model again returns a list of tuples, where the first entry is the token ID and the second entry is the tf-idf weighting. Note that the ID corresponding to “system” (which occurred 4 times in the original corpus) has been weighted lower than the ID corresponding to “minors” (which only occurred twice).

In [45]:
# us the model to transform any other bow
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])  

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [47]:
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


note that a model, like `tfidf[bow_corpus]`, can only be iterated over.

In [49]:
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialize an LSI transformation

corpus_lsi = lsi_model[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

2023-06-02 12:42:00,441 : INFO : using serial LSI version on this node
2023-06-02 12:42:00,442 : INFO : updating model with new documents
2023-06-02 12:42:00,443 : INFO : preparing a new chunk of documents
2023-06-02 12:42:00,444 : INFO : using 100 extra samples and 2 power iterations
2023-06-02 12:42:00,444 : INFO : 1st phase: constructing (12, 102) action matrix
2023-06-02 12:42:00,446 : INFO : orthonormalizing (12, 102) action matrix
2023-06-02 12:42:00,451 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2023-06-02 12:42:00,454 : INFO : computing the final decomposition
2023-06-02 12:42:00,455 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)
2023-06-02 12:42:00,457 : INFO : processed documents up to #9
2023-06-02 12:42:00,458 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
2023-06-02 12:42:00,459 : INFO : topic #

So `corpus_lsi` applies the LSI transformation to the TF-IDF representation. The original corpus is "double-wrapped" by these two transformations (TF-IDF and LSI), one applied after the other. 

This is a common practice in text processing and Natural Language Processing (NLP) to extract more meaningful and condensed information from the original text data.

In [50]:
lsi_model.print_topics(2)

2023-06-02 12:46:04,099 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
2023-06-02 12:46:04,101 : INFO : topic #1(1.476): 0.460*"system" + 0.373*"user" + 0.332*"eps" + 0.328*"interface" + 0.320*"time" + 0.320*"response" + 0.293*"computer" + 0.280*"human" + 0.171*"survey" + -0.161*"trees"


[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '0.460*"system" + 0.373*"user" + 0.332*"eps" + 0.328*"interface" + 0.320*"time" + 0.320*"response" + 0.293*"computer" + 0.280*"human" + 0.171*"survey" + -0.161*"trees"')]

In [52]:
for doc, as_text in zip(corpus_lsi, text_corpus):
    print(doc, as_text)

[(0, 0.06600783396090422), (1, 0.5200703306361851)] Human machine interface for lab abc computer applications
[(0, 0.19667592859142566), (1, 0.7609563167700046)] A survey of user opinion of computer system response time
[(0, 0.08992639972446491), (1, 0.7241860626752508)] The EPS user interface management system
[(0, 0.07585847652178185), (1, 0.6320551586003428)] System and human system engineering testing of EPS
[(0, 0.10150299184980142), (1, 0.5737308483002954)] Relation of user perceived response time to error measurement
[(0, 0.7032108939378311), (1, -0.16115180214025854)] The generation of random binary unordered trees
[(0, 0.8774787673119835), (1, -0.16758906864659506)] The intersection graph of paths in trees
[(0, 0.9098624686818582), (1, -0.14086553628719128)] Graph minors IV Widths of trees and well quasi ordering
[(0, 0.6165825350569288), (1, 0.05392907566389275)] Graph minors A survey


So while `tfidf` keeps the number of dimensions intact but only weights towards rarity, `lsi` reduces dimensions to whatever number of topics you may want to extract.

## Similarities

In [42]:
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)

# another string to check similarity
query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]

print(list(enumerate(sims)))

2023-06-02 12:37:47,376 : INFO : creating sparse index
2023-06-02 12:37:47,377 : INFO : creating sparse matrix from corpus
2023-06-02 12:37:47,377 : INFO : PROGRESS: at document #0
2023-06-02 12:37:47,379 : INFO : created <9x12 sparse matrix of type '<class 'numpy.float32'>'
	with 28 stored elements in Compressed Sparse Row format>


[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [43]:
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


Note that the class similarities.MatrixSimilarity is only appropriate when the whole set of vectors fits into memory. 

In [55]:
from gensim import similarities

index = similarities.MatrixSimilarity(lsi_model[bow_corpus])  # transform corpus to LSI space and index it

2023-06-02 12:50:00,947 : INFO : creating matrix with 9 documents and 2 features


In [56]:
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi_model[vec_bow]  # convert the query to LSI space
print(vec_lsi)

[(0, 0.07910475117444915), (1, 0.5732835243079404)]


In [57]:
sims = index[vec_lsi]  # perform a similarity query against the corpus

print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

[(0, 0.9999408), (1, 0.9946708), (2, 0.9999428), (3, 0.999879), (4, 0.99935204), (5, -0.08804217), (6, -0.0515742), (7, -0.023664713), (8, 0.1938726)]


In [62]:
sims

[(2, 0.9999428),
 (0, 0.9999408),
 (3, 0.999879),
 (4, 0.99935204),
 (1, 0.9946708),
 (8, 0.1938726),
 (7, -0.023664713),
 (6, -0.0515742),
 (5, -0.08804217)]

In [63]:
sims = sorted(sims, key=lambda item: -item[1])
for doc_position, doc_score in sims:
    print(doc_score, text_corpus[doc_position])

0.9999428 The EPS user interface management system
0.9999408 Human machine interface for lab abc computer applications
0.999879 System and human system engineering testing of EPS
0.99935204 Relation of user perceived response time to error measurement
0.9946708 A survey of user opinion of computer system response time
0.1938726 Graph minors A survey
-0.023664713 Graph minors IV Widths of trees and well quasi ordering
-0.0515742 The intersection graph of paths in trees
-0.08804217 The generation of random binary unordered trees


In [75]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices()))

Num GPUs Available:  1


Next steps:

- https://spacy.io/usage/embeddings-transformers
- http://karpathy.github.io/2015/05/21/rnn-effectiveness/ 