# Part 1 - Text Analysis

## a) Headline Processing

### i) Text extraction

In [1]:
import json
import nltk

# relative file path to dataset (read as JSON Lines: one JSON object per line)
data = 'data/NewsCategoryDataset_2017_2022.json'
parsedList = []          # "<headline> <short_description>" text, one entry per record
categories = {}          # category name -> indices of its records in parsedList
documentDetailList = []  # per-record metadata, index-aligned with parsedList

# parse each line of the file as one record and append its text to parsedList;
# the encoding is pinned so decoding does not depend on the platform default
with open(data, 'r', encoding='utf-8') as file:
    for line in file:
        # tolerate blank lines (e.g. a trailing newline) instead of crashing json.loads
        if not line.strip():
            continue

        record = json.loads(line)
        headline = record.get("headline", "")
        shortDescription = record.get("short_description", "")
        category = record.get("category", "")

        # position this record is about to occupy in parsedList/documentDetailList
        documentIndex = len(parsedList)
        parsedList.append(headline + " " + shortDescription)

        documentDetails = {
            'link': record.get("link"),
            'headline': headline,
            'category': category,
            'description': shortDescription,
            'authors': record.get("authors"),
            'date': record.get("date")
        }
        documentDetailList.append(documentDetails)

        # update categories dictionary with the index of this record
        categories.setdefault(category, []).append(documentIndex)

### ii) Lexical analysis

In [2]:
# split every document string into its list of word tokens
tokenList = list(map(nltk.tokenize.word_tokenize, parsedList))

In [3]:
# keep only tokens made up entirely of alphabetic characters
alphaList = [list(filter(str.isalpha, d)) for d in tokenList]

In [4]:
# normalise every token to lower case
lowerList = [list(map(str.lower, d)) for d in alphaList]

### iii) Filtering stopwords

In [5]:
stopwords = nltk.corpus.stopwords.words("english")
# membership tests against a set are O(1); testing against the list would
# rescan every stopword for every token in the corpus
stopwordSet = set(stopwords)
filtList = [[t for t in d if t not in stopwordSet] for d in lowerList]

### iv) Porter stemmer

In [6]:
# reduce every remaining token to its Porter stem
stemmer = nltk.stem.PorterStemmer()
finalList = [list(map(stemmer.stem, d)) for d in filtList]

## b) TF.IDF

In [7]:
import math
from collections import defaultdict
import numpy as np
from scipy.sparse import dok_matrix

def calculateTF(t, d):
    """Return the term frequency of term ``t`` in document ``d``.

    TF = number of occurrences of ``t`` in ``d`` / number of terms in ``d``.

    Args:
        t: the term to count.
        d: the document, as a list of terms.

    Returns:
        The relative frequency as a float; 0.0 for an empty document.
    """
    # a document can end up empty once non-alphabetic tokens and stopwords
    # have been removed, which would otherwise divide by zero
    if not d:
        return 0.0

    return d.count(t) / len(d)

def calculateIDF(t, inverted_index, total_documents):
    """Return the smoothed inverse document frequency of term ``t``.

    IDF = log(1 + N / (1 + df)), where N is ``total_documents`` and df is the
    number of documents containing ``t``. The ``1 + df`` in the denominator
    avoids a divide-by-zero for terms that appear in no document.

    Args:
        t: the term to score.
        inverted_index: mapping of term -> list of ids of documents containing it.
        total_documents: number of documents in the corpus (N).

    Returns:
        The IDF weight as a float.
    """
    # .get() rather than [] so that looking up an unseen term neither raises
    # KeyError on a plain dict nor inserts an empty entry into a defaultdict
    documentFrequency = len(inverted_index.get(t, ()))

    return math.log(1 + total_documents / (1 + documentFrequency))

def generateInvertedIndex(corpus):
    """Build an inverted index for ``corpus``.

    Args:
        corpus: iterable of documents, each a list of terms.

    Returns:
        A ``defaultdict(list)`` mapping each term to the ascending list of
        indices of the documents that contain it. Terms appear in the order
        in which they are first seen in the corpus.
    """
    invertedIndex = defaultdict(list)

    for i, document in enumerate(corpus):
        # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
        # the term order is reproducible between runs instead of depending on
        # string hash randomisation
        for term in dict.fromkeys(document):
            invertedIndex[term].append(i)

    return invertedIndex

def generateTDM(corpus, invertedIndex):
    """Build the sparse TF.IDF document-term matrix for ``corpus``.

    Args:
        corpus: list of documents, each a list of terms.
        invertedIndex: mapping of term -> ids of documents containing it; its
            keys define the matrix columns.

    Returns:
        A tuple ``(tdm_tfidf, unique)``: ``tdm_tfidf`` is a ``dok_matrix`` with
        one row per document and one column per term, and ``unique`` is the
        list of terms in column order.
    """
    # Extract unique terms
    unique = list(invertedIndex.keys())
    totalDocuments = len(corpus)

    # Initialize a sparse Term-Document matrix
    tdm_tfidf = dok_matrix((totalDocuments, len(unique)), dtype=np.float64)

    # column lookup, and IDF computed once per term rather than once per cell
    columnOf = {term: term_id for term_id, term in enumerate(unique)}
    idfOf = {term: calculateIDF(term, invertedIndex, totalDocuments) for term in unique}

    # Populate the sparse matrix. Only terms present in a document can have a
    # non-zero TF, so visiting those alone yields the same matrix as scanning
    # the whole vocabulary for every document (and skips empty documents).
    for doc_id, document in enumerate(corpus):
        for term in set(document):
            term_id = columnOf.get(term)
            if term_id is None:
                # term is not part of the supplied index: it has no column
                continue
            tdm_tfidf[doc_id, term_id] = calculateTF(term, document) * idfOf[term]

    return tdm_tfidf, unique

In [8]:
# index the stemmed corpus by term, then build the TF.IDF matrix from it
invertedIndex = generateInvertedIndex(finalList)
# matrix: one row per document, one column per term; terms: the column labels
matrix, terms = generateTDM(finalList, invertedIndex)

## c) Extract highest n% for each category

### i) Average term weights for each category

In [9]:
import numpy as np

def calculateAverageCategoryTermWeight(tdm_tfidf, categories, unique):
    """Average the TF.IDF weight of every term over the documents of each category.

    Args:
        tdm_tfidf: sparse document-term matrix (rows are documents).
        categories: mapping of category -> list of document row indices.
        unique: list of terms labelling the matrix columns.

    Returns:
        Mapping of category -> list of average weights, one per term in
        ``unique``. A category without documents gets all-zero weights.
    """
    # CSR supports fast row selection; convert once, outside the loop
    rows = tdm_tfidf.tocsr()
    termCount = len(unique)
    avgTermWeightDict = {}

    for category, indexes in categories.items():
        if len(indexes) == 0:
            # nothing to average over; avoid a 0/0 mean
            avgTermWeightDict[category] = list(np.zeros(termCount))
            continue

        # column-wise mean over the category's rows, computed on the sparse
        # matrix so the sub-matrix is never expanded to a dense array
        columnMeans = np.asarray(rows[indexes].mean(axis=0)).ravel()

        # store the average term weights for the current category
        avgTermWeightDict[category] = list(columnMeans[:termCount])

    return avgTermWeightDict

# average TF.IDF weight of every term within each news category
avgCategoryTermWeights = calculateAverageCategoryTermWeight(matrix, categories, terms)

### ii) Get highest weighted n% terms per category

In [10]:
def getTopNTerms(avgTermWeights, unique, n):
    """Select, per group, the terms whose weight lies in the top ``n`` percent.

    Args:
        avgTermWeights: mapping of group -> sequence of term weights, aligned
            with ``unique``.
        unique: list of terms labelling the weight positions.
        n: percentage (0-100) of highest-weighted terms to keep.

    Returns:
        Mapping of group -> list of ``(term, weight)`` tuples, in term order,
        for every term whose weight is strictly above the ``100 - n``
        percentile of that group's weights.
    """
    topTermsDict = {}

    for category, termWeightVector in avgTermWeights.items():
        if len(termWeightVector) == 0:
            # a percentile of nothing is undefined; there are no terms to keep
            topTermsDict[category] = []
            continue

        # calculate the threshold weight for the top n% terms
        threshold = np.percentile(termWeightVector, 100 - n)

        # Strictly greater than the threshold: when the threshold is 0.0 (most
        # weights are zero) this keeps exactly the non-zero terms, so no
        # separate branch is needed for that case.
        topTermsDict[category] = [
            (unique[i], weight)
            for i, weight in enumerate(termWeightVector)
            if weight > threshold
        ]

    return topTermsDict

# keep the terms whose average weight is above the 99th percentile (n=1) in each category
topCategoryTerms = getTopNTerms(avgCategoryTermWeights, terms, n=1)

### iii) Export category details to JSON

In [11]:
import json

def exportCategoryJSON(categories, topTerms, path):
    """Write one JSON object per category to ``path``.

    Each object holds the category name, the indices of its documents and its
    top terms (looked up in ``topTerms`` by category name).
    """
    # one record per category, in the iteration order of ``categories``
    categoryDetails = [
        {
            'category_name': name,
            'documents': categories[name],
            'top_terms': topTerms[name]
        }
        for name in categories
    ]

    # export category details as JSON
    with open(path, 'w') as outFile:
        json.dump(categoryDetails, outFile, indent=2)

# write the per-category document lists and top terms to the data/ directory
outputPath = 'data/category_details.json'
exportCategoryJSON(categories, topCategoryTerms, outputPath)

## d) k-means Clustering

In [14]:
import math
import random

def _toDenseVector(v):
    """Return ``v`` (a sparse row or an array-like) as a flat float64 ndarray."""
    if hasattr(v, "toarray"):  # scipy sparse objects expose toarray()
        v = v.toarray()
    return np.asarray(v, dtype=np.float64).ravel()


def dotProduct(v1, v2):
    """Return the dot product of two vectors; either may be sparse or dense."""
    if hasattr(v1, "multiply"):
        return v1.multiply(v2).sum()
    if hasattr(v2, "multiply"):
        return v2.multiply(v1).sum()
    return float(np.dot(np.ravel(v1), np.ravel(v2)))


def sumOfSquares(v):
    """Return the squared Euclidean norm of ``v`` (sparse or dense)."""
    if hasattr(v, "power"):
        return np.sum(v.power(2))
    return float(np.sum(np.square(v)))


def cosine_similarity(query, document, querySquared, docSquared):
    """Return the cosine similarity of ``query`` and ``document``.

    ``querySquared`` and ``docSquared`` are the precomputed squared norms of
    the two vectors. Returns 0 when either norm is zero.
    """
    if querySquared == 0 or docSquared == 0:
        return 0  # Avoid division by zero

    return dotProduct(query, document) / math.sqrt(querySquared * docSquared)


def precomputeSumOfSquares(data):
    """Return the squared norm of every row of ``data`` as a list."""
    if hasattr(data, "multiply"):
        # one sparse element-wise product instead of slicing out each row
        return np.asarray(data.multiply(data).sum(axis=1)).ravel().tolist()
    return [sumOfSquares(vector) for vector in data]


def assignClusters(data, centroids, sumOfSqauresVector):
    """Assign every row of ``data`` to its most cosine-similar centroid.

    Args:
        data: document vectors, one per row.
        centroids: sequence of centroid vectors (sparse rows or dense arrays).
        sumOfSqauresVector: squared norm of each row of ``data``.

    Returns:
        Mapping of centroid index -> list of row indices assigned to it. Ties
        (including all-zero similarity) go to the lowest centroid index.

    Raises:
        ValueError: if ``centroids`` is empty.
    """
    if len(centroids) == 0:
        raise ValueError("at least one centroid is required")

    clusters = {i: [] for i in range(len(centroids))}

    if hasattr(data, "tocsr"):
        data = data.tocsr()

    centroidMatrix = np.vstack([_toDenseVector(c) for c in centroids])
    # The norm of each centroid must come from the centroid itself; indexing
    # the document norms by centroid position would pair centroid j with the
    # norm of document j and distort the similarity ranking.
    centroidSquares = np.square(centroidMatrix).sum(axis=1)
    documentSquares = np.asarray(sumOfSqauresVector, dtype=np.float64)

    # dots[i, j] = <document i, centroid j>, for all pairs in one product
    dots = np.asarray(data @ centroidMatrix.T, dtype=np.float64)
    denominators = np.sqrt(np.outer(documentSquares, centroidSquares))

    # similarity stays 0 wherever a document or centroid has zero norm
    similarities = np.zeros_like(denominators)
    np.divide(dots, denominators, out=similarities, where=denominators > 0)

    # argmax returns the first maximum, matching a strict ">" scan
    for docIndex, clusterIndex in enumerate(similarities.argmax(axis=1)):
        clusters[int(clusterIndex)].append(docIndex)

    return clusters


def updateCentroids(data, clusters, previousCentroids=None):
    """Recompute each centroid as the mean of the rows assigned to it.

    Args:
        data: document vectors, one per row.
        clusters: mapping of cluster -> list of row indices.
        previousCentroids: optional centroids from the previous iteration, in
            the same order as ``clusters``; reused for clusters left empty.

    Returns:
        List of dense 1-D centroid arrays, in the order of ``clusters``. An
        empty cluster keeps its previous centroid when one is supplied and is
        a zero vector otherwise.
    """
    centroids = []

    for position, indices in enumerate(clusters.values()):
        if indices:
            centroid = np.asarray(data[indices].mean(axis=0)).ravel()
        elif previousCentroids is not None:
            centroid = _toDenseVector(previousCentroids[position])
        else:
            centroid = np.zeros(data.shape[1])
        centroids.append(centroid)

    return centroids


def k_means_clustering(data, k, maxIterations=30, seed=None):
    """Cluster the rows of ``data`` into ``k`` groups by cosine similarity.

    Args:
        data: document-term matrix, one document per row.
        k: number of clusters.
        maxIterations: upper bound on assign/update rounds.
        seed: optional seed for choosing the initial centroids; when omitted
            the module-level ``random`` state is used.

    Returns:
        Mapping of cluster index (0..k-1) -> list of row indices.

    Raises:
        ValueError: if ``k`` is not between 1 and the number of rows.
    """
    documentCount = data.shape[0]
    if not 1 <= k <= documentCount:
        raise ValueError(f"k must be between 1 and {documentCount}, got {k}")

    # CSR makes row selection and matrix products cheap; convert only once
    if hasattr(data, "tocsr"):
        data = data.tocsr()

    # initialize centroids randomly from the documents themselves
    rng = random.Random(seed) if seed is not None else random
    centroids = [_toDenseVector(data[i]) for i in rng.sample(range(documentCount), k)]

    # precompute squared magnitudes for data vectors
    sumOfSqauresVector = precomputeSumOfSquares(data)

    clusters = {i: [] for i in range(k)}

    for _ in range(maxIterations):
        # assign vectors to clusters
        clusters = assignClusters(data, centroids, sumOfSqauresVector)

        newCentroids = updateCentroids(data, clusters, centroids)

        # check for convergence: centroids did not move in this round
        if all(np.array_equal(new, old) for new, old in zip(newCentroids, centroids)):
            break

        centroids = newCentroids

    return clusters

# cluster the TF.IDF document vectors into k groups
k = 5
clusters = k_means_clustering(matrix, k)

1/30
2/30
3/30
4/30
5/30
6/30
7/30
8/30
9/30
10/30
11/30
12/30
13/30
14/30
15/30
16/30
17/30
18/30
19/30
20/30
21/30
22/30
23/30
24/30
25/30
26/30
27/30
28/30
29/30
30/30


## e) Extract highest n% for each cluster

### i) Average term weights for each cluster

In [16]:
import numpy as np

def calculateAverageClusterTermWeight(tdm_tfidf, clusters, unique):
    """Average the TF.IDF weight of every term over the documents of each cluster.

    Args:
        tdm_tfidf: sparse document-term matrix (rows are documents).
        clusters: mapping of cluster -> list of document row indices.
        unique: list of terms labelling the matrix columns.

    Returns:
        Mapping of cluster -> list of average weights, one per term in
        ``unique``. An empty cluster gets all-zero weights.
    """
    # CSR supports fast row selection; convert once, outside the loop
    rows = tdm_tfidf.tocsr()
    termCount = len(unique)
    avgTermWeightDict = {}

    for cluster, indexes in clusters.items():
        if len(indexes) == 0:
            # k-means can leave a cluster empty; avoid a 0/0 mean
            avgTermWeightDict[cluster] = list(np.zeros(termCount))
            continue

        # column-wise mean over the cluster's rows, taken directly on the
        # sparse matrix instead of copying rows and reading one term at a time
        columnMeans = np.asarray(rows[indexes].mean(axis=0)).ravel()

        # store the average term weights for the current cluster
        avgTermWeightDict[cluster] = list(columnMeans[:termCount])

    return avgTermWeightDict

# average TF.IDF weight of every term within each k-means cluster
avgClusterTermWeights = calculateAverageClusterTermWeight(matrix, clusters, terms)

### ii) Get highest n% terms per cluster

In [17]:
# keep the terms whose average weight is above the 90th percentile (n=10) in each cluster
topClusterTerms = getTopNTerms(avgClusterTermWeights, terms, n=10)

### iii) Export cluster details to JSON

In [18]:
import json

def exportClusterJSON(clusters, topTerms, path):
    """Write one JSON object per cluster to ``path``.

    Each object holds the cluster index, the indices of its documents and its
    top terms (looked up in ``topTerms`` by cluster index).
    """
    # one record per cluster, in the iteration order of ``clusters``
    clusterDetails = [
        {
            'cluster_index': index,
            'documents': clusters[index],
            'top_terms': topTerms[index]
        }
        for index in clusters
    ]

    # export cluster details as JSON
    with open(path, 'w') as outFile:
        json.dump(clusterDetails, outFile, indent=2)

# write the per-cluster document lists and top terms to the data/ directory
outputPath = 'data/cluster_details.json'
exportClusterJSON(clusters, topClusterTerms, outputPath)

# \_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_

# Part 2 - Web application

## Extract and export top weighted n% headline terms to JSON

In [20]:
import json

def exportTopNPercentTermsJSON(tdm_tfidf, documentList, unique, path, n):
    """Attach each document's top ``n`` percent terms and export everything as JSON.

    Args:
        tdm_tfidf: sparse document-term matrix; row ``i`` belongs to
            ``documentList[i]``.
        documentList: list of document detail dicts. Each dict is modified in
            place: a ``'top_n_percent_terms'`` entry is added.
        unique: list of terms labelling the matrix columns.
        path: destination of the JSON file.
        n: percentage (0-100) of highest-weighted terms to keep per document.
    """
    # CSR gives cheap row access; convert once instead of per document
    rows = tdm_tfidf.tocsr()

    for docIndex, documentDetails in enumerate(documentList):
        documentVector = rows[docIndex].toarray().ravel()

        if documentVector.size == 0:
            # no term columns at all: a percentile would be undefined
            documentDetails['top_n_percent_terms'] = []
            continue

        threshold = np.percentile(documentVector, 100 - n)

        # Strictly greater than the threshold: when the threshold is 0.0 this
        # keeps exactly the terms that occur in the document, so no separate
        # branch is needed. Weights are converted to built-in floats so the
        # JSON encoder never sees a numpy scalar.
        documentDetails['top_n_percent_terms'] = [
            (unique[termIndex], float(weight))
            for termIndex, weight in enumerate(documentVector)
            if weight > threshold
        ]

    # export document details as JSON
    with open(path, 'w') as file:
        json.dump(documentList, file, indent=2)

# attach each document's top-weighted terms (n=2) and write all document details to data/
outputPath = 'data/headlines.json'
exportTopNPercentTermsJSON(matrix, documentDetailList, terms, outputPath, n=2)

## Instructions

To start the Flask application simply run 'python /path/to/folder/app.py' in your terminal. The data/ directory contains pre-computed JSON files necessary for the web application. If you wish to compute them again with different parameters, run the appropriate cells with the updated parameters and they will write to the data/ directory. 

For the wordcloud charts, d3 was used together with the d3-cloud plugin. Because term weights vary between groupings (categories/clusters), no single fixed constant could be used to scale a weight to an appropriate font size; instead, a simple function chooses the scaling constant by checking the weight of a single term. Reference used: https://d3-graph-gallery.com/wordcloud.html.

For the bubble charts, https://webtips.dev/how-to-make-interactive-bubble-charts-in-d3-js was used as a reference during development. Nodes under a certain size tended to obscure the text, so a mouseover function was used to show the name of the node on hover.