# Manish Kanuri
# NUID: 002315456

# Importing libraries


In [293]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

In [294]:
# Newsgroup categories to load, one entry per line and grouped by top-level hierarchy.
topics = [
    # comp.*
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    # rec.*
    'rec.autos',
    'rec.sport.baseball',
    'rec.sport.hockey',
    # sci.*
    'sci.space',
    'sci.med',
    'sci.electronics',
    # talk.*
    'talk.politics.guns',
    'talk.politics.mideast',
]

In [295]:
# Fetch the selected newsgroup categories.
# `remove` asks scikit-learn to strip headers, footers and quoted replies from each post,
# so only the body text is used.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=topics,
    remove=('headers', 'footers', 'quotes'),
)

In [296]:

# Stop here if the loaded dataset contains no documents.
if not newsgroups.data:
    raise ValueError("Dataset is empty. Check selected categories.")


# Task 1: Applying SVD on Term-Document Matrices

In [318]:

# Sanity check: show the first 500 characters of the document at index 5.
print("Sample document:", newsgroups.data[5][:500], "\n")

Sample document:  
I agree. Adding layers of managers and bureaucrats simply eat up
money that could be spent on those who actually are doing the work
such as doctors and nurse, and supplies.  The most efficient system
is probably one that has limited management and a fixed budget such
as England's or even Canada's.  I'm afraid we are on the wrong
track.  The problem may be that the insurance lobby is too powerful.




-- 
----------------------------------------------------------------------------
Gordon Banks  



In [320]:
# Guard: abort if the term-frequency matrix holds no stored entries (nnz == 0).
# NOTE(review): X_tf is not assigned in any cell visible before this one, and the
# execution counts are out of order — confirm the vectorizer cell runs first on a
# fresh kernel (Restart & Run All).
if X_tf.nnz == 0:
    raise ValueError("Term Frequency matrix is empty. Adjust preprocessing settings.")

# Prepare Term-Document Matrices:

In [333]:
# Preview the term-frequency counts for the first five documents.
print("Term Frequency (TF) Matrix:")
feature_names = vectorizer_tf.get_feature_names_out()
tf_df = pd.DataFrame(
    data=X_tf[:5].toarray(),
    columns=["Term: " + term for term in feature_names],
)
# Keep only the vocabulary columns with a non-zero count in at least one of the previewed rows.
print(tf_df.loc[:, tf_df.ne(0).any(axis=0)])

Term Frequency (TF) Matrix:
   Term: 150  Term: 20  Term: ability  Term: actually  Term: also  \
0          0         0              0               1           1   
1          0         0              0               0           0   
2          0         0              0               0           0   
3          0         0              1               0           0   
4          1         1              0               0           0   

   Term: ancient  Term: announced  Term: another  Term: anyone  Term: anyway  \
0              0                0              0             0             1   
1              0                0              0             1             0   
2              0                2              0             0             0   
3              0                0              0             0             0   
4              1                0              1             2             0   

   ...  Term: vlb  Term: wanting  Term: wants  Term: watch  Term: weapons  \

In [324]:
# Positive Pointwise Mutual Information (PPMI) weighting
def compute_ppmi(matrix):
    """Re-weight a document-term count matrix with Positive PMI.

    For every stored positive count ``c_ij`` (row ``i``, column ``j``):

        PMI_ij  = log( c_ij * N / (r_i * k_j) )
        PPMI_ij = max(PMI_ij, 0)

    where ``N`` is the sum of all counts, ``r_i`` the sum of row ``i`` and
    ``k_j`` the sum of column ``j``. Cells with a zero count have PMI of
    ``-inf`` and therefore PPMI of 0, so only stored entries are computed and
    the matrix is never converted to a dense array.

    Parameters
    ----------
    matrix : array-like or scipy sparse matrix
        Non-negative counts; rows are summed as documents, columns as terms.

    Returns
    -------
    scipy.sparse.csr_matrix
        float64 matrix of the same shape holding the PPMI weights.

    Raises
    ------
    ValueError
        If ``matrix`` contains negative entries (PMI is undefined for them).
    """
    # copy=True: the in-place clean-up below must never touch the caller's matrix.
    counts = csr_matrix(matrix, dtype=np.float64, copy=True)
    counts.sum_duplicates()
    if counts.nnz and counts.data.min() < 0:
        raise ValueError("PPMI requires a non-negative count matrix.")

    total_count = counts.sum()
    if total_count == 0:
        # No co-occurrence mass at all: every PPMI value is 0.
        return csr_matrix(counts.shape, dtype=np.float64)

    doc_totals = np.asarray(counts.sum(axis=1)).ravel()   # r_i
    term_totals = np.asarray(counts.sum(axis=0)).ravel()  # k_j

    coo = counts.tocoo()
    stored_positive = coo.data > 0  # skip explicitly stored zeros (log(0) = -inf)
    rows = coo.row[stored_positive]
    cols = coo.col[stored_positive]
    vals = coo.data[stored_positive]

    # Exactly one factor of N: P(i,j) / (P(i) * P(j)) = (c/N) / ((r/N) * (k/N)).
    pmi = np.log(vals * total_count / (doc_totals[rows] * term_totals[cols]))

    positive = pmi > 0
    return csr_matrix(
        (pmi[positive], (rows[positive], cols[positive])),
        shape=counts.shape,
        dtype=np.float64,
    )

# Build the PPMI-weighted matrix from the term-frequency matrix X_tf.
X_ppmi = compute_ppmi(X_tf)

In [325]:
# Apply SVD for dimensionality reduction.
# svd_dim is passed as n_components to apply_svd for each of the matrices reduced below.
svd_dim = 100  # Number of dimensions to retain

def apply_svd(matrix, n_components=100, random_state=42):
    """Project ``matrix`` onto its leading singular directions with TruncatedSVD.

    Parameters
    ----------
    matrix : array-like or scipy sparse matrix
        Input passed unchanged to ``TruncatedSVD.fit_transform``.
    n_components : int, default 100
        Number of dimensions to keep.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD``. Its default solver is randomized,
        so without a fixed seed the reduced matrix differs from run to run.
        Pass ``None`` for unseeded behaviour.

    Returns
    -------
    The reduced matrix returned by ``fit_transform`` (one row per input row,
    ``n_components`` columns).
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    return svd.fit_transform(matrix)

# Reduce each weighting scheme (TF, TF-IDF, PPMI) to svd_dim components, in that order.
X_tf_svd, X_tfidf_svd, X_ppmi_svd = [
    apply_svd(weighted, svd_dim) for weighted in (X_tf, X_tfidf, X_ppmi)
]


In [326]:
# Report the dimensions of each SVD-reduced matrix.
for label, reduced in (
    ("TF SVD Shape:", X_tf_svd),
    ("TF-IDF SVD Shape:", X_tfidf_svd),
    ("PPMI SVD Shape:", X_ppmi_svd),
):
    print(label, reduced.shape)

TF SVD Shape: (18846, 100)
TF-IDF SVD Shape: (18846, 100)
PPMI SVD Shape: (18846, 100)
