In [1]:
import numpy as np
import yaml

### Problem 1: Similarity metrics

In [2]:
# Load the whole newsgroup YAML file into the `newsgroup` dict.
with open("newsgroup_data.yaml", "r") as stream:
    try:
        newsgroup = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parse error, then re-raise: every later cell needs
        # `newsgroup`, so carrying on would only fail further down with a
        # less helpful NameError.
        print(exc)
        raise

In [3]:
def JaccardSimilarity(x, y):
    """Weighted (min/max) Jaccard similarity between two bag-of-words documents.

    Parameters
    ----------
    x, y : dict
        Map each word to its number of occurrences in the document.
        A word missing from a dict counts as 0 occurrences.

    Returns
    -------
    float
        sum_i min(x_i, y_i) / sum_i max(x_i, y_i), taken over the union of
        the words of both documents. Returns 0.0 when the denominator is
        zero (e.g. both documents are empty) instead of raising
        ZeroDivisionError.
    """
    vocabulary = set(x) | set(y)
    num = sum(min(x.get(word, 0), y.get(word, 0)) for word in vocabulary)
    denom = sum(max(x.get(word, 0), y.get(word, 0)) for word in vocabulary)
    if denom == 0:
        # No word occurs in either document, so the ratio is undefined.
        # By convention such a pair is given no measurable similarity.
        return 0.0
    return num / denom


In [4]:


def L2Similarity(x, y):
    """Negated Euclidean distance between two word-count dicts.

    A word absent from a document is counted as 0 occurrences. The distance
    is negated so that a larger return value means a closer pair.
    """
    vocabulary = set(x) | set(y)
    squared_gaps = ((x.get(w, 0) - y.get(w, 0)) ** 2 for w in vocabulary)
    return -np.sqrt(sum(squared_gaps))


In [5]:
def cosineSimilarity(x, y):
    """Cosine similarity between two bag-of-words documents.

    Parameters
    ----------
    x, y : dict
        Map each word to its number of occurrences in the document.
        A word missing from a dict counts as 0 occurrences.

    Returns
    -------
    float
        dot(x, y) / (||x|| * ||y||). Returns 0.0 when either document has
        zero norm (e.g. it is empty); the plain formula would evaluate 0/0
        there and yield nan, which breaks any later max()/argmax over scores.
    """
    # Words missing from either document contribute 0 to the dot product,
    # so walking the words of x alone is enough.
    num = sum(xi * y.get(word, 0) for word, xi in x.items())
    sumXSqr = sum(xi ** 2 for xi in x.values())
    sumYSqr = sum(yi ** 2 for yi in y.values())
    denom = np.sqrt(sumXSqr) * np.sqrt(sumYSqr)
    if denom == 0:
        return 0.0
    return num / denom



In [44]:
# testing (uncomment to run; the expected result is shown after each call)
# t1 = {1:2, 2:14, 5:2, 9:7}
# t2 = {1:2, 4:4, 5:2, 10:7}
# JaccardSimilarity(t1,t2)  # 0.11111
# L2Similarity(t1,t2)  # -17.606
# cosineSimilarity(t1,t2)  # 0.05887

253


0.05886651103648987

In [6]:
# A function to calculate the average similarity between two groups using the given metric

def avg_similarity(groupA, groupB, metric):
    """Average similarity over every (article in A, article in B) pair.

    Parameters
    ----------
    groupA, groupB : dict
        Map article id -> {word: number of occurrences}. For example
        ``{0: {3: 1, 10: 1, 12: 8, 17: 1}}`` says word 3 appears once in
        article 0.
    metric : callable
        ``metric(article_a, article_b)`` returning a similarity score.

    Returns
    -------
    float
        Mean of ``metric`` over all ``len(groupA) * len(groupB)`` pairs.

    Raises
    ------
    ValueError
        If either group contains no articles.
    """
    if not groupA or not groupB:
        raise ValueError("both groups must contain at least one article")

    # Walk the articles themselves rather than a fixed range(50), so groups
    # of any size are averaged in full.
    all_scores = [
        metric(article_a, article_b)
        for article_a in groupA.values()
        for article_b in groupB.values()
    ]
    return np.mean(all_scores)


In [7]:
def heapmap_matrix(avg_similarity, metric, data=None):
    """Build the group-by-group matrix of average similarities.

    Parameters
    ----------
    avg_similarity : callable
        ``avg_similarity(group_a, group_b, metric)`` returning one score.
    metric : callable
        Similarity function handed through to ``avg_similarity``.
    data : dict, optional
        Map group name -> group. Defaults to the notebook-level
        ``newsgroup`` dict.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row and one column per group, both in the
        dict's iteration order. Entry (i, j) is the score between the i-th
        and the j-th group.
    """
    if data is None:
        data = newsgroup
    group_list = list(data.values())
    # Size the matrix from the data instead of assuming exactly 20 groups.
    n_groups = len(group_list)
    matrix = np.zeros((n_groups, n_groups))
    for i, group_i in enumerate(group_list):
        for j, group_j in enumerate(group_list):
            matrix[i, j] = avg_similarity(group_i, group_j, metric)

    return matrix
        

In [8]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

def makeHeatMap(data, names, color, outputFileName):
    """Render a matrix as a heat map and save it as a PNG file.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array to draw. Rows run along the y axis, columns along the x axis.
    names : sequence
        Labels for the y-axis ticks, one per row of ``data``.
    color : colormap
        Passed to ``pcolor`` as ``cmap``.
    outputFileName : str
        Path of the PNG file to write.
    """
    n_rows, n_cols = data.shape

    fig, ax = plt.subplots()
    # create the map w/ color bar legend
    heatmap = ax.pcolor(data, cmap=color)
    fig.colorbar(heatmap, ax=ax)

    # put the major ticks at the middle of each cell:
    # columns are laid out along x, rows along y
    ax.set_xticks(np.arange(n_cols) + 0.5, minor=False)
    ax.set_yticks(np.arange(n_rows) + 0.5, minor=False)

    # want a more natural, table-like display
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    # columns are numbered 1..n_cols; rows carry the supplied names
    ax.set_xticklabels(range(1, n_cols + 1))
    ax.set_yticklabels(names)

    fig.tight_layout()

    fig.savefig(outputFileName, format='png')
    # close this specific figure so repeated calls do not pile up open figures
    plt.close(fig)

In [9]:
# Newsgroup names in the dict's iteration order (iterating a dict yields its keys).
groups = list(newsgroup)

In [58]:
# Heat map of the average Jaccard similarity between every pair of groups.
makeHeatMap(
    data=heapmap_matrix(avg_similarity, metric=JaccardSimilarity),
    names=groups,
    color=cm.Blues,
    outputFileName="Jaccard.png",
)

In [56]:
# Heat map of the average L2 similarity between every pair of groups.
makeHeatMap(
    data=heapmap_matrix(avg_similarity, metric=L2Similarity),
    names=groups,
    color=cm.Blues,
    outputFileName="L2.png",
)

In [57]:
# Heat map of the average cosine similarity between every pair of groups.
makeHeatMap(
    data=heapmap_matrix(avg_similarity, metric=cosineSimilarity),
    names=groups,
    color=cm.Blues,
    outputFileName="Cosine.png",
)

### Problem 2: Dimension reduction


(a) [3 points] Baseline classification

Newsgroup dataset: 20 different groups, each containing 50 different articles,  
for a total of 1000 articles.

In [10]:
# Classify using Cosine Similarity for all 1000 articles

# classify all 1000 articles
# concat 1000 articles into one dict

### pre-process the data
# Labels of the articles: every article of the g-th group gets label g.
# Group sizes are read from the data instead of assuming 20 groups of 50.
y = np.repeat(
    np.arange(len(groups)),
    np.asarray([len(newsgroup[gp]) for gp in groups], dtype=int),
)

# Merge every group's {article id: word counts} dict into one corpus.
# NOTE(review): the classifiers below index `y` by article id, which assumes
# the ids run 0..N-1 in group order -- confirm against the data file.
documents = {}
for gp in groups:
    documents.update(newsgroup[gp])




In [20]:

def baseline_cosine_NN(doc):
    """Label a document with the newsgroup of its most cosine-similar article.

    `doc` is a {word: count} dict; it is passed straight to cosineSimilarity.
    Every article in the notebook-level `documents` dict is scored against
    it, and the entry of `y` at the best-scoring article id is returned.
    """
    def score(art_i):
        # cosine score between stored article art_i and the query document
        return cosineSimilarity(documents[art_i], doc)

    # max() scans in dict order and keeps the first article among ties
    article_max = max(documents, key=score)

    # the article id doubles as the index into the label vector
    return y[article_max]
    




    

In [57]:
def baseline_cosine_NN2(doc):
    """Leave-one-out nearest-neighbour label for a stored article.

    `doc` is an article id, i.e. a key of the notebook-level `documents`
    dict. The article is compared against every *other* article by cosine
    similarity, and the entry of `y` at the best-scoring article id is
    returned. An unknown id raises KeyError.
    """
    query = documents[doc]

    # every article except the query itself, in dict order
    candidates = (art_i for art_i in documents if art_i != doc)

    # max() keeps the first candidate among ties
    article_max = max(
        candidates,
        key=lambda art_i: cosineSimilarity(documents[art_i], query),
    )

    # the article id doubles as the index into the label vector
    return y[article_max]

In [44]:

# Spot check: cosine similarity between article 1 and article 900.
# The original looked article 1 up in `filtered_doc`, a name that is not
# defined anywhere in the visible notebook (only the local `filteredDoc`
# inside baseline_cosine_NN2 exists), so the cell failed on a fresh kernel.
# NOTE(review): presumably `filtered_doc` was a copy of `documents` -- confirm.
cosineSimilarity(documents[1], newsgroup['talk.politics.misc'][900])

0.21725017863742738

In [59]:
def predict(group, model = baseline_cosine_NN2):
    """Predict a label for every article of one newsgroup.

    group: iterable of article ids (iterating a group dict yields its keys).
    model: callable invoked as ``model(doc=<article id>)``, returning a label.
    Returns the list of predicted labels, in the group's iteration order.
    """
    return [model(doc = art_num) for art_num in group]


In [62]:
# Build a 20×20 matrix whose (A, B) entry is the fraction of articles
# in group A that have their nearest neighbor in group B.

# In other words, entry (A, B) is the fraction of group-A articles that the
# cosine nearest-neighbor classifier labels as group B.
def heapmap_classification():
    """Build the group-by-group matrix of nearest-neighbour predictions.

    Takes no arguments: it reads the notebook-level `newsgroup` dict and
    labels articles with predict() and its default model.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row and one column per newsgroup, both in the
        dict's iteration order. Entry (i, j) is the fraction of articles of
        group i whose predicted label equals j.
    """
    group_names = list(newsgroup.keys())
    # Size the matrix from the data instead of assuming exactly 20 groups.
    n_groups = len(group_names)
    matrix = np.zeros((n_groups, n_groups))
    for i, name in enumerate(group_names):
        pred_group_i = np.array(predict(newsgroup[name]))
        for j in range(n_groups):
            # share of group i's predicted labels that are equal to j
            matrix[i, j] = np.mean(pred_group_i == j)

    return matrix

In [63]:
# Classification matrix of the nearest-neighbour baseline: entry (A, B) is the
# fraction of group A's articles whose predicted label is B.
matrix_error = heapmap_classification()

In [65]:
# Save the nearest-neighbour classification matrix as a heat map.
makeHeatMap(
    matrix_error,
    names=groups,
    color=cm.Blues,
    outputFileName="Cosine__Classification_NN.png",
)