# In [225]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

def load_data(filepath):
    '''
    Reads a CSV file and returns its rows as a list of dictionaries.

    filepath: string with the path to a CSV file whose first line is the
              header row.
    returns:  a list with one dictionary per data row, keyed by the header
              names; every value is the raw string read from the file.
    '''
    # newline="" hands line-ending handling over to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten by the text layer before csv sees them.
    with open(filepath, mode="r", newline="") as file:
        return list(csv.DictReader(file))

def calc_features(row):
    '''
    Builds the feature vector for a single country.

    row:     dictionary for one country; the nine columns listed below are
             looked up by name and converted with float().
    returns: a NumPy array of shape (9,) and dtype float64 holding the
             values in the order the columns are listed.
    '''
    # The order here defines the order of the entries in the output vector.
    columns = (
        'child_mort',
        'exports',
        'health',
        'imports',
        'income',
        'inflation',
        'life_expec',
        'total_fer',
        'gdpp',
    )
    values = [float(row[column]) for column in columns]
    return np.array(values, dtype=np.float64)

def hac(features):
    '''
    Performs complete linkage hierarchical agglomerative clustering on the
    given feature vectors using Euclidean distance.

    features: sequence of n equal-length numeric vectors (NumPy arrays or
              plain sequences); each is converted to a float64 array.
    returns:  a float64 NumPy array of shape (n - 1, 4), or (0, 4) when
              fewer than two vectors are given. Row i describes the i-th
              merge: columns 0 and 1 are the indices of the two merged
              clusters (smaller index first), column 2 is their complete
              linkage distance and column 3 is the number of points in
              the merged cluster. The original points are clusters
              0 .. n - 1; the cluster created by row i has index n + i.
    raises:   ValueError if no pair of clusters has a finite distance
              (for example when the input contains NaN or inf).
    '''
    points = [np.asarray(point, dtype=np.float64) for point in features]
    num = len(points)
    # max(..., 0) so an empty input gives an empty result instead of a
    # "negative dimensions" error from NumPy.
    array = np.zeros((max(num - 1, 0), 4), dtype=np.float64)

    # Point-to-point distances are computed once and stored under both key
    # orders; later steps only combine these cached values.
    linkage_dist = {}
    for j in range(num):
        for k in range(j + 1, num):
            distance = np.linalg.norm(points[j] - points[k])
            linkage_dist[j, k] = linkage_dist[k, j] = distance

    # Initially, each data point is its own cluster
    cluster_sizes = {i: 1 for i in range(num)}
    # Indices of the clusters that still exist. New clusters always get a
    # larger index and are appended, so this list stays in ascending order.
    active = list(range(num))

    for row in range(num - 1):
        mindist = float('inf')
        c1 = c2 = None

        # Find the two closest clusters. Pairs are visited in ascending
        # (first index, second index) order and only a strictly smaller
        # distance replaces the current best, so ties go to the pair with
        # the smallest first index, then the smallest second index.
        for pos, j in enumerate(active):
            for k in active[pos + 1:]:
                distance = linkage_dist[j, k]
                if distance < mindist:
                    mindist = distance
                    c1, c2 = j, k

        if c1 is None:
            raise ValueError(
                "hac: no pair of clusters has a finite distance; "
                "check the features for NaN or inf values"
            )

        # New cluster index and size
        new_cluster = num + row
        total_size = cluster_sizes[c1] + cluster_sizes[c2]
        array[row] = [c1, c2, mindist, total_size]

        # Remove old clusters
        active.remove(c1)
        active.remove(c2)

        # Complete linkage: the distance from the merged cluster to any
        # other cluster is the larger of the two distances from its parts.
        for k in active:
            merged = max(linkage_dist[c1, k], linkage_dist[c2, k])
            linkage_dist[k, new_cluster] = linkage_dist[new_cluster, k] = merged

        active.append(new_cluster)
        cluster_sizes[new_cluster] = total_size

    return array

def fig_hac(Z, names):
    '''
    Draws a dendrogram for a hierarchical agglomerative clustering result.

    Z:       clustering array passed straight to scipy's dendrogram().
    names:   labels passed to dendrogram() as its labels argument.
    returns: the matplotlib figure created for the plot.
    '''
    figure = plt.figure()
    # Leaf labels are rotated by 90 degrees.
    dendrogram(Z, leaf_rotation=90, labels=names)
    plt.tight_layout()
    return figure

def normalize_features(features):
    '''
    Standardizes each feature column to zero mean and unit standard
    deviation (population standard deviation, as computed by np.std).

    features: list of equal-length feature vectors.
    returns:  a list of float64 NumPy arrays, one per input vector, in the
              same order as the input. A column whose values are all
              identical is mapped to 0.0 rather than NaN. An empty input
              returns an empty list.
    '''
    features_array = np.asarray(features, dtype=np.float64)
    if len(features_array) == 0:
        return []

    mean = np.mean(features_array, axis=0)
    std = np.std(features_array, axis=0)
    # A constant column has std == 0; dividing by it would give NaN. Using
    # 1 as the divisor leaves the (already zero) centered values at 0.
    safe_std = np.where(std == 0, 1.0, std)

    normalized_features = (features_array - mean) / safe_std
    return [row.copy() for row in normalized_features]