# Co-Occurrence

In [1]:
import numpy as np
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Fetch the 'punkt_tab' NLTK data package into the local nltk_data directory.
# NOTE(review): presumably this is the resource sent_tokenize / word_tokenize
# need at runtime — confirm against the NLTK documentation.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [16]:
# Small demo corpus: four sentences, with punctuation already separated from
# the words by spaces. Used as the input text for the tokenization cells below.
sample_txt = "Alice is a very pretty girl . She  is shy , smart , brilliant , positive , optimistic woman . Aakaasa is her twin sister . She is bold , fearless , pessimistic , beautiful , gorgeous . "

# Tokenize into words and sentences

In [17]:
# Split the raw text into a list of sentence strings.
sentences = sent_tokenize(sample_txt)

In [18]:
# Inspect the sentence split (bare expression -> rendered as the cell output).
sentences

['Alice is a very pretty girl .',
 'She  is shy , smart , brilliant , positive , optimistic woman .',
 'Aakaasa is her twin sister .',
 'She is bold , fearless , pessimistic , beautiful , gorgeous .']

In [19]:
# Lower-case every sentence and tokenize it, giving one token list per sentence
# (punctuation marks are kept as their own tokens).
words_in_sentence = [word_tokenize(text.lower()) for text in sentences]

In [20]:
# Inspect the per-sentence token lists (bare expression -> rendered as the cell output).
words_in_sentence

[['alice', 'is', 'a', 'very', 'pretty', 'girl', '.'],
 ['she',
  'is',
  'shy',
  ',',
  'smart',
  ',',
  'brilliant',
  ',',
  'positive',
  ',',
  'optimistic',
  'woman',
  '.'],
 ['aakaasa', 'is', 'her', 'twin', 'sister', '.'],
 ['she',
  'is',
  'bold',
  ',',
  'fearless',
  ',',
  'pessimistic',
  ',',
  'beautiful',
  ',',
  'gorgeous',
  '.']]

# Build Co-occurrence matrix

In [21]:
import pandas as pd

print("Build co-occurrence matrix")

# Number of tokens on each side of the target that count as its context.
window_size = 2

# Nested counter: co_occurrence_matrix[target][context] -> number of times
# `context` appeared within `window_size` positions of `target`.
co_occurrence_matrix = defaultdict(lambda: defaultdict(int))

for sentence in words_in_sentence:
    for i, target_word in enumerate(sentence):
        # Clamp the window to the sentence so contexts never cross a
        # sentence boundary.
        window_start = max(0, i - window_size)
        window_end = min(len(sentence), i + window_size + 1)
        for j in range(window_start, window_end):
            if i != j:  # a position is not its own context
                context_word = sentence[j]
                co_occurrence_matrix[target_word][context_word] += 1

# Sorted vocabulary gives a stable row/column order.
unique_words = sorted({w for sent in words_in_sentence for w in sent})

# Create the frame as integer zeros up front. Building an empty (all-NaN,
# object-dtype) frame and calling .fillna(0) relies on pandas' deprecated
# silent downcasting and emits a FutureWarning.
matrix_df = pd.DataFrame(0, index=unique_words, columns=unique_words, dtype=int)

# Copy the sparse nested counts into the dense vocabulary x vocabulary table;
# pairs that never co-occurred keep their initial 0.
for target_word, contexts in co_occurrence_matrix.items():
    for context_word, count in contexts.items():
        matrix_df.at[target_word, context_word] = count

print(matrix_df)


Build co-occurrence matrix
              ,  .  a  aakaasa  alice  beautiful  bold  brilliant  fearless  \
,            12  1  0        0      0          2     1          2         2   
.             1  0  0        0      0          0     0          0         0   
a             0  0  0        0      1          0     0          0         0   
aakaasa       0  0  0        0      0          0     0          0         0   
alice         0  0  1        0      0          0     0          0         0   
beautiful     2  0  0        0      0          0     0          0         0   
bold          1  0  0        0      0          0     0          0         1   
brilliant     2  0  0        0      0          0     0          0         0   
fearless      2  0  0        0      0          0     1          0         0   
girl          0  1  0        0      0          0     0          0         0   
gorgeous      1  1  0        0      0          1     0          0         0   
her           0  0  0    

  matrix_df = pd.DataFrame(index=unique_words, columns=unique_words).fillna(0)


## Another example

In [23]:
# Second, self-contained example: a tiny hand-tokenized corpus.
# Note: this rebinds `words_in_sentence`, `co_occurrence_matrix`,
# `unique_words` and `matrix_df` from the previous example.
words_in_sentence = [
    ["i", "love", "machine", "learning"],
    ["machine", "learning", "is", "fun"],
    ["i", "love", "python"]
]

# Number of tokens on each side of the target that count as its context.
window_size = 2

# Nested counter: co_occurrence_matrix[target][context] -> number of times
# `context` appeared within `window_size` positions of `target`.
co_occurrence_matrix = defaultdict(lambda: defaultdict(int))

for sentence in words_in_sentence:
    for i, target_word in enumerate(sentence):
        # Clamp the window to the sentence so contexts never cross a
        # sentence boundary.
        window_start = max(0, i - window_size)
        window_end = min(len(sentence), i + window_size + 1)
        for j in range(window_start, window_end):
            if i != j:  # a position is not its own context
                context_word = sentence[j]
                co_occurrence_matrix[target_word][context_word] += 1

# Sorted vocabulary gives a stable row/column order.
unique_words = sorted({w for sent in words_in_sentence for w in sent})

# Create the frame as integer zeros up front. Building an empty (all-NaN,
# object-dtype) frame and calling .fillna(0) relies on pandas' deprecated
# silent downcasting and emits a FutureWarning.
matrix_df = pd.DataFrame(0, index=unique_words, columns=unique_words, dtype=int)

# Copy the sparse nested counts into the dense vocabulary x vocabulary table;
# pairs that never co-occurred keep their initial 0.
for target_word, contexts in co_occurrence_matrix.items():
    for context_word, count in contexts.items():
        matrix_df.at[target_word, context_word] = count

print(matrix_df)


          fun  i  is  learning  love  machine  python
fun         0  0   1         1     0        0       0
i           0  0   0         0     2        1       1
is          1  0   0         1     0        1       0
learning    1  0   1         0     1        2       0
love        0  2   0         1     0        1       1
machine     0  1   1         2     1        0       0
python      0  1   0         0     1        0       0


  matrix_df = pd.DataFrame(index=unique_words, columns=unique_words).fillna(0)
