In [None]:
from pathlib import Path

import networkx as nx
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Custom config class to globally change some parameters
class Config:
    """Notebook-wide switches, read as class attributes (e.g. ``Config.run_tests``)."""

    # Run the embedded unit-test cells when True.
    run_tests: bool = True
    # Flag for optional printing; set here so it can be changed in one place.
    do_print: bool = True

# Please Run All cells now before reading on

## For peer-reviewing the report, scroll down to the H1 header "Start of Report"

You can also collapse the headers **Ingest data** and **Function implementations** to get there quickly

# Function implementations

The functions below are for reference only. The WebLab functions are to be graded. These may have changed to accommodate our pipelines.

## Spectral embeddings
### Provided design specification
Implement the computation of [spectral embeddings](https://en.wikipedia.org/wiki/Spectral_clustering#Algorithms) for **undirected graphs**.

- The function to implement is `compute_spectral_embeddings`.
- The input is an undirected [networkx.Graph](https://networkx.org/documentation/stable/reference/classes/graph.html).
- The node IDs in this graph are integers from `0` to `n`.
- The output should be a numpy array of shape `(num_nodes, dim)`. Each row corresponds to a node representation. Rows should be ordered by node ID (ascending).
- The Laplacian should not be normalized.
- You have to use [numpy.linalg.eigh](https://numpy.org/doc/stable/reference/generated/numpy.linalg.eigh.html) to compute eigenvalues and eigenvectors in order for the tests to work.
- This task focuses on computing the node representations. You do not need to run an actual clustering algorithm (e.g., k-means).


In [None]:
# Spectral embeddings implementation and tests

def compute_spectral_embeddings(graph: nx.Graph, dim: int) -> np.ndarray:
    """Compute spectral node embeddings from the unnormalized graph Laplacian.

    The Laplacian ``L = D - A`` is built from the adjacency matrix ``A`` and the
    diagonal degree matrix ``D``. The embedding of a node consists of its
    entries in the ``dim`` eigenvectors of ``L`` that belong to the smallest
    eigenvalues. No clustering step (e.g. k-means) is run.

    Args:
        graph (nx.Graph): The undirected graph.
        dim (int): The dimension of representations. This corresponds to the number of eigenvectors used.

    Returns:
        np.ndarray: Node representations (sorted by node ID, ascending), shape (num_nodes, dim).

    Raises:
        AssertionError: If the adjacency matrix is not symmetric.
    """
    # Sorting the node list makes row i of every matrix belong to the i-th smallest node ID.
    adjacency_matrix = nx.to_numpy_array(graph, nodelist=sorted(graph.nodes))

    # make sure the matrix is symmetric
    assert (adjacency_matrix == adjacency_matrix.T).all()

    # START ANSWER
    # Unnormalized Laplacian: node degrees on the diagonal minus the adjacency matrix.
    degree_matrix = np.diag(adjacency_matrix.sum(axis=1))
    laplacian = degree_matrix - adjacency_matrix

    # eigh is meant for symmetric matrices and returns the eigenvalues in
    # ascending order, with the matching eigenvectors as columns.
    _, eigenvectors = np.linalg.eigh(laplacian)
    result = eigenvectors[:, :dim]
    # END ANSWER

    return result

# Run the embedded unit test only when tests are enabled in the notebook-wide Config.
if Config.run_tests:
    import unittest
    
    # Test case for compute_spectral_embeddings.
    class TestComponents(unittest.TestCase):
        def setUp(self):
            # Small undirected graph with the 7 nodes 0..6, rebuilt before every test.
            self.simple_graph = nx.Graph(
                [(0, 3), (1, 3), (2, 4), (3, 5), (3, 6), (4, 6), (5, 6), (5, 4)]
            )

        def test_spectral_embeddings(self):
            emb = compute_spectral_embeddings(self.simple_graph, 3)
            # Reference embedding for dim=3, one row per node ID. The first
            # column is constant (0.377964473 = 1/sqrt(7)). The comparison uses
            # assert_almost_equal's default precision, so values around 1e-15
            # count as zero.
            np.testing.assert_almost_equal(
                np.array(
                    [
                        [3.77964473e-01, -4.49723806e-01, 7.07106781e-01],
                        [3.77964473e-01, -4.49723806e-01, -7.07106781e-01],
                        [3.77964473e-01, 6.59857436e-01, -1.66533454e-15],
                        [3.77964473e-01, -2.18583490e-01, 1.11022302e-16],
                        [3.77964473e-01, 3.20716714e-01, 6.59900940e-16],
                        [3.77964473e-01, 6.87284763e-02, 1.14927529e-15],
                        [3.77964473e-01, 6.87284763e-02, 1.26029760e-15],
                    ]
                ),
                emb,
            )

    # argv=[''] keeps unittest from parsing the kernel's own command-line
    # arguments; exit=False keeps it from calling sys.exit() after the run.
    unittest.main(argv=[''], verbosity=2, exit=False)

## Random walks implementation
### Provided design specification
Implement [random walks](https://en.wikipedia.org/wiki/Random_walk). You can assume that the graph does not have weighted edges.

The function to implement is `random_walks`.
The input is an undirected [networkx.Graph](https://networkx.org/documentation/stable/reference/classes/graph.html).
The output should be a numpy array, shape `(num_nodes * num_walks, walk_length)`.

In [None]:
# Random walks implementation and tests
def random_walks(graph: nx.Graph, num_walks: int, walk_length: int) -> np.ndarray:
    """Perform uniform random walks on an unweighted graph.

    Every node is the start of ``num_walks`` walks. At each step the walk moves
    to a neighbour of the current node chosen uniformly at random. A node
    without neighbours has nowhere to go, so its walks stay on that node.

    Args:
        graph (nx.Graph): The graph. Node IDs must be integers, because the
            walks are stored in an integer array.
        num_walks (int): The number of random walks for each node.
        walk_length (int): The number of nodes in a random walk (start node included).

    Returns:
        np.ndarray: The random walks, shape (n_nodes * num_walks, walk_length).
            Walks are grouped by start node in ascending node-ID order: the
            walks of the k-th smallest node ID occupy the rows
            ``k * num_walks`` to ``(k + 1) * num_walks - 1``.
    """
    # START ANSWER
    # Row block of every node, given by the rank of its ID. For IDs 0..n-1 the
    # rank equals the ID; using the rank keeps the row index in range for any
    # other set of integer IDs as well.
    row_block = {node: rank for rank, node in enumerate(sorted(graph.nodes))}
    result = np.empty((len(row_block) * num_walks, walk_length), dtype=int)

    def random_walk(starting_node: int) -> np.ndarray:
        """Return one walk of ``walk_length`` nodes that begins at ``starting_node``."""
        walk = np.empty(walk_length, dtype=int)
        walk[0] = starting_node
        current_node = starting_node
        for step in range(1, walk_length):
            neighbors = list(graph.adj[current_node])
            # np.random.choice raises on an empty list, so an isolated node
            # simply stays where it is.
            if neighbors:
                current_node = np.random.choice(neighbors)
            walk[step] = current_node
        return walk

    for node in graph.nodes:
        first_row = row_block[node] * num_walks
        for i in range(num_walks):
            result[first_row + i, :] = random_walk(node)
    # END ANSWER

    return result

# Run the embedded unit test only when tests are enabled in the notebook-wide Config.
if Config.run_tests:
    import unittest

    # Test case for random_walks. NOTE(review): this rebinds the name
    # TestComponents that the spectral-embedding test cell also uses, so after
    # this cell only the random-walk test is presumably collected from __main__.
    class TestComponents(unittest.TestCase):
        def setUp(self):
            # Small undirected graph with the 7 nodes 0..6, rebuilt before every test.
            self.simple_graph = nx.Graph(
                [(0, 3), (1, 3), (2, 4), (3, 5), (3, 6), (4, 6), (5, 6), (5, 4)]
            )

        def test_random_walks(self):
            result = random_walks(self.simple_graph, 2, 5)
            # 7 nodes with 2 walks each, 5 nodes per walk.
            self.assertEqual((7 * 2, 5), result.shape)
            # Walks are random, so only check that every visited entry is a node of the graph.
            for n in result.flatten():
                self.assertIn(n, self.simple_graph.nodes)

    # argv=[''] keeps unittest from parsing the kernel's own command-line
    # arguments; exit=False keeps it from calling sys.exit() after the run.
    unittest.main(argv=[''], verbosity=2, exit=False)

# Start of Pipeline

In [None]:
def read_data(dir_name: str, file_name: str) -> pd.DataFrame:
    """Read one CSV file of the medium article dataset.

    Args:
        dir_name (str): Directory that contains the CSV files. A
            ``pathlib.Path`` is accepted as well.
        file_name (str): Name of the file without the ``.csv`` extension,
            e.g. ``"train"``.

    Returns:
        pd.DataFrame: The contents of ``<dir_name>/<file_name>.csv``.
    """
    # Build the path with pathlib rather than string concatenation so that
    # both str and Path directories work.
    return pd.read_csv(Path(dir_name) / f"{file_name}.csv")

In [None]:
final_data = pd.read_csv("data/pipeline_assignment_data/full_data_without_labels.csv")
final_data

In [None]:
train = read_data("data/pipeline_assignment_data","train")
train

In [None]:
test = read_data("data/pipeline_assignment_data","test")
test

In [None]:
train["labels"].value_counts()

In [None]:
test["labels"].value_counts()

In [None]:
train.subtitle

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode the class labels of the training split as integer ids.
# NOTE: despite its name, `multilabel_binarizer` is a plain LabelEncoder.
multilabel_binarizer = LabelEncoder()
Y = multilabel_binarizer.fit_transform(train["labels"])

# One text per training article: title and subtitle joined by a single space.
texts = [title + " " + subtitle for title, subtitle in zip(train.title, train.subtitle)]

In [None]:
len(list(set(final_data.index)))

In [None]:
from gensim.models import Word2Vec
# Title and subtitle of every article in the full (unlabelled) dataset, joined by a space.
all_texts = [title + " " + subtitle for title, subtitle in zip(final_data.title, final_data.subtitle)]

# Tokenise by splitting on single spaces, then train the word vectors on all texts.
tokenized_texts = [text.split(" ") for text in all_texts]
word2vec_model = Word2Vec(
    tokenized_texts,
    vector_size=128,
    window=10,
    epochs=30,
    sg=1,         # skip-gram training
    workers=4,
    min_count=1,  # keep every token, including those seen only once
)

In [None]:
Y

In [None]:
from sklearn.svm import SVC
# Represent every training text by the mean of the vectors of its space-separated tokens.
embeddings = [
    np.mean([word2vec_model.wv[token] for token in text.split(" ")], axis=0)
    for text in texts
]

# Stack the per-text vectors into one matrix with a row per text.
X_word2vec = np.vstack(embeddings)
print(X_word2vec.shape)


In [None]:
svc = SVC()
svc.fit(X_word2vec,Y)

In [None]:
# Title and subtitle of every test article, joined by a single space.
text_texts = [title + " " + subtitle for title, subtitle in zip(test.title, test.subtitle)]

# Represent every test text by the mean of the vectors of its space-separated tokens.
test_embeddings = [
    np.mean([word2vec_model.wv[token] for token in text.split(" ")], axis=0)
    for text in text_texts
]

# Stack the per-text vectors into one matrix with a row per text.
X_word2vec_test = np.vstack(test_embeddings)
print(X_word2vec_test.shape)


In [None]:
predictions = svc.predict(X_word2vec_test)

In [None]:
Y_test = multilabel_binarizer.transform(test["labels"])

Y_test

In [None]:
from sklearn import metrics
print(metrics.f1_score(Y_test, predictions,average="macro"))


In [None]:
print(metrics.classification_report(Y_test, predictions))


# Form graph

In [None]:


from typing import List,Dict


def get_edges(data: pd.DataFrame, nodes: Dict) -> List:
    """Given the dataframe with articles and lists, return the edges of the article graph.

    NOTE(review): the answer region between the START/END markers is empty, so
    this function currently always returns an empty list and ``form_graph``
    builds a graph without any edges.

    Args:
        data (pd.DataFrame): The medium dataset.
        nodes (dict): Mapping of node id to article title, as returned by ``get_nodes``.

    Returns:
        List: The edges; ``form_graph`` passes this list to ``add_edges_from``.
    """
    edges = []    
    ## START
    
    ##END
    return edges




In [None]:
def get_nodes(data: pd.DataFrame) -> Dict:
    """Collect the graph nodes of the article dataset.

    Every distinct index label of ``data`` becomes one node. When a label
    occurs more than once, the title of its first row is kept. The number of
    nodes is printed as a side effect.

    Args:
        data (pd.DataFrame): The medium dataset; needs a ``title`` column.

    Returns:
        Dict: Mapping of node id (index label) to article title.
    """
    titles_by_id = {}
    for node_id, title in zip(data.index, data["title"]):
        # setdefault leaves an already registered id untouched (first row wins).
        titles_by_id.setdefault(node_id, title)
    print(len(titles_by_id))
    return titles_by_id

In [None]:
import networkx as nx
def form_graph(data: pd.DataFrame) -> nx.Graph:
    """Form an undirected graph from the medium article dataset.

    The nodes are the keys returned by ``get_nodes(data)`` and the edges are
    whatever ``get_edges(data, nodes)`` returns. The nodes are added explicitly
    before the edges, so nodes that take part in no edge stay in the graph.

    Args:
        data (pd.DataFrame): The medium dataset

    Returns:
        G (nx.Graph): The graph.
    """
    nodes = get_nodes(data)
    edges = get_edges(data, nodes)

    graph = nx.Graph()
    # Iterating a dict yields its keys, i.e. the node ids.
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph


In [None]:
graph = form_graph(final_data)


In [None]:
len(list(graph.edges()))

# Node2Vec

In [None]:
# Collect the nodes without any neighbour up front; a random walk cannot leave them.
isolated = list(nx.isolates(graph))

In [None]:
def random_walks(G: nx.Graph, num_walks: int, walk_length: int, isolated: List) -> np.ndarray:
    """Perform uniform random walks on the graph, skipping isolated nodes.

    Every node that is not listed in ``isolated`` is the start of ``num_walks``
    walks. At each step the walk moves to a neighbour of the current node
    chosen uniformly at random. Nodes in ``isolated`` start no walks and,
    having no edges, are never visited either.

    NOTE: this definition replaces the three-argument ``random_walks`` defined
    earlier in the notebook.

    Args:
        G (nx.Graph): The graph.
        num_walks (int): The number of random walks for each node.
        walk_length (int): The number of nodes in a random walk.
        isolated (List): Nodes without neighbours; no walks are started from them.

    Returns:
        np.ndarray: The random walks, shape
            (n_walked_nodes * num_walks, walk_length), where ``n_walked_nodes``
            is the number of nodes of ``G`` that are not in ``isolated``.

    Raises:
        ValueError: If ``walk_length`` is smaller than 1.
    """
    if walk_length < 1:
        raise ValueError("walk_length must be at least 1")

    result = []
    ### START
    # A set makes the per-node membership test constant time.
    skipped = set(isolated)

    for start in G.nodes:
        if start in skipped:
            continue
        for _ in range(num_walks):
            walk = [start]
            current = start
            while len(walk) < walk_length:
                neighbours = list(G.adj[current])
                # Safety net for a neighbourless node that was not listed in
                # `isolated`: stay in place instead of failing.
                if neighbours:
                    current = neighbours[np.random.randint(len(neighbours))]
                walk.append(current)
            result.append(walk)
    ## END
    return np.asarray(result)

In [None]:
walks = random_walks(graph, 8, 15,isolated)

In [None]:
from gensim.models import Word2Vec
def fit_node2vec(walks: np.ndarray, vector_size: int, window: int, epochs: int) -> Word2Vec:
    """Train a Node2Vec model on random walks. Uses the GenSim Word2Vec implementation.

    Each walk is treated as a sentence and each node id as a word. Node ids are
    converted to strings, so the vector of a node is looked up with
    ``model.wv[str(node_id)]``.

    Args:
        walks (np.ndarray): The random walks.
        vector_size (int): Node representation size.
        window (int): Window width.
        epochs (int): Number of epochs.

    Returns:
        Word2Vec: The trained model.

    Raises:
        ValueError: If ``walks`` contains no walk.
    """
    ### START
    sentences = [[str(node) for node in walk] for walk in walks]
    if not sentences:
        # Fail with a clear message instead of an error from inside Word2Vec.
        raise ValueError("walks is empty; cannot train a Node2Vec model")

    # Same training settings as the text Word2Vec model in this notebook:
    # skip-gram (sg=1) and min_count=1 so that every visited node gets a vector.
    return Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        epochs=epochs,
        sg=1,
        workers=4,
        min_count=1,
    )

In [None]:
model = fit_node2vec(walks, 128, 5, 10)

In [None]:
#embeddings = {doc: model.dv[doc] for doc in model.dv.index_to_key}


In [None]:
def get_nodeids(data: pd.DataFrame):
    """Map the node ids stored in the ``index`` column to article titles.

    When an id occurs in more than one row, the title of its first row is kept.

    Args:
        data (pd.DataFrame): Dataset with an ``index`` and a ``title`` column.

    Returns:
        dict: Mapping of node id to article title, in order of first appearance.
    """
    titles_by_id = {}
    for node_id, title in zip(data["index"], data["title"]):
        # setdefault leaves an already registered id untouched (first row wins).
        titles_by_id.setdefault(node_id, title)
    return titles_by_id

In [None]:
# Node ids of the train and test articles, in order of first appearance.
# Iterating the dict returned by get_nodeids yields its keys.
train_nodes = list(get_nodeids(train))
test_nodes = list(get_nodeids(test))

In [None]:
len(test_nodes)

In [None]:
embeddings = {word: model.wv[word] for word in model.wv.index_to_key}


In [None]:
import scipy
# Membership is tested once per node, so use a set instead of scanning the
# `isolated` list every time.
isolated_nodes = set(isolated)

# Graph part of the features: the Node2Vec vector of the node. Isolated nodes
# use the text embedding of the same row as a stand-in.
train_graph_features = np.array(
    [
        X_word2vec[row] if node in isolated_nodes else embeddings[str(node)]
        for row, node in enumerate(train_nodes)
    ],
    dtype=np.float32,
)
test_graph_features = np.array(
    [
        X_word2vec_test[row] if node in isolated_nodes else embeddings[str(node)]
        for row, node in enumerate(test_nodes)
    ],
    dtype=np.float32,
)

# Final features: text embedding and graph embedding side by side.
X_train_n2v = np.hstack((X_word2vec, train_graph_features))
X_test_n2v = np.hstack((X_word2vec_test, test_graph_features))

In [None]:
X_test_n2v.shape,X_train_n2v.shape

In [None]:
X_train_n2v.shape

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC

svc = SVC()
svc.fit(X_train_n2v,Y)

In [None]:
Y.shape

In [None]:
X_test_n2v.shape

In [None]:
predictions = svc.predict(X_test_n2v)

In [None]:
predictions

In [None]:
from sklearn import metrics
print(metrics.f1_score(Y_test, predictions,average="macro"))


In [None]:
print(metrics.classification_report(Y_test, predictions))


# Only node2vec features

In [None]:
import scipy
# Membership is tested once per node, so use a set instead of scanning the
# `isolated` list every time.
isolated_nodes = set(isolated)

# Isolated nodes are represented by a zero vector whose length follows the
# trained model instead of a hard-coded 128.
zero_vector = np.zeros(model.wv.vector_size)

# Node2Vec vectors only, one row per train / test node.
X_train_n2v = np.array(
    [zero_vector if node in isolated_nodes else embeddings[str(node)] for node in train_nodes],
    dtype=np.float32,
)

X_test_n2v = np.array(
    [zero_vector if node in isolated_nodes else embeddings[str(node)] for node in test_nodes],
    dtype=np.float32,
)
 

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC

svc = SVC()
svc.fit(X_train_n2v,Y)

In [None]:
predictions = svc.predict(X_test_n2v)

In [None]:
from sklearn import metrics
print(metrics.f1_score(Y_test, predictions,average="macro"))


In [None]:
print(metrics.classification_report(Y_test, predictions))
