In [2]:
# Given a corpus, generate term context matrix for a window specified window size (w=1,..,10) using 1) count matrix 2) Pointwise mutual information matrix 3) Calculate confine similarity between any two words


import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import dok_matrix
from sklearn.metrics.pairwise import cosine_similarity

def generate_term_context_matrix(corpus_file, window_size):
    # Step 1: Read the corpus from text file
    with open(corpus_file, 'r', encoding='utf-8', errors='ignore') as file:
        corpus = file.read().split()
        
    # Step 2: Generate count matrix
    vectorizer = CountVectorizer()
    count_matrix = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names_out()

    # Step 3: Generate term-context matrix
    term_context_matrix = dok_matrix((len(terms), len(terms)), dtype=np.float32)
    term_index = {term: idx for idx, term in enumerate(terms)}
    for i, token in enumerate(corpus):
        for j in range(max(0, i - window_size), min(i + window_size + 1, len(corpus))):
            if i != j:
                context = corpus[j]
                if token in term_index and context in term_index:
                    term_context_matrix[term_index[token], term_index[context]] += 1

    # Step 4: Calculate PMI matrix
    pmi_matrix = dok_matrix((len(terms), len(terms)), dtype=np.float32)
    total_count = count_matrix.sum()
    term_freq = count_matrix.sum(axis=0) + 1  # Add smoothing factor
    context_freq = count_matrix.sum(axis=1) + 1  # Add smoothing factor
    for term, term_idx in term_index.items():
        for context, context_idx in term_index.items():
            pmi = np.log2((term_context_matrix[term_idx, context_idx] + 1) * total_count /
                          (term_freq[0, term_idx] * context_freq[context_idx, 0]))
            pmi_matrix[term_idx, context_idx] = max(0, pmi)

    # Step 5: Calculate cosine similarity between any two words
    similarity_matrix = cosine_similarity(pmi_matrix)

    return count_matrix, term_context_matrix, pmi_matrix, similarity_matrix


def main():
    # Example usage
    corpus_file = 'transcript13.txt'
    window_size = 2

    count_matrix, term_context_matrix, pmi_matrix, similarity_matrix = generate_term_context_matrix(corpus_file, window_size)

    # Print count matrix
    print("Count Matrix:")
    print(count_matrix.toarray())

    # Print term-context matrix
    print("\nTerm-Context Matrix:")
    print(term_context_matrix.toarray())

    # Print PMI matrix
    print("\nPMI Matrix:")
    print(pmi_matrix.toarray())

    # Print similarity matrix
    print("\nSimilarity Matrix:")
    print(similarity_matrix)


if __name__ == '__main__':
    main()


Count Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Term-Context Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

PMI Matrix:
[[9.100662 8.100662 8.100662 ... 8.100662 8.100662 8.100662]
 [9.100662 8.100662 8.100662 ... 8.100662 8.100662 8.100662]
 [9.100662 8.100662 8.100662 ... 8.100662 8.100662 8.100662]
 ...
 [9.100662 8.100662 8.100662 ... 8.100662 8.100662 8.100662]
 [8.100662 7.100662 7.100662 ... 7.100662 7.100662 7.100662]
 [9.100662 8.100662 8.100662 ... 8.100662 8.100662 8.100662]]

Similarity Matrix:
[[0.99999785 0.99995184 0.99999785 ... 0.9999782  0.99998534 0.99999785]
 [0.99995184 0.99999666 0.99995184 ... 0.9999224  0.9999298  0.99995184]
 [0.99999785 0.99995184 0.99999785 ... 0.9999782  0.99998534 0.99999785]
 ...
 [0.9999782  0.9999224  0.9999782  ... 0.99999464 0.9999529