# NB12 - Explore NetworkX for course information

## 1. Download Dataset

In [None]:
pip install python-louvain

In [None]:
import pandas as pd
import os
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import ast


I used the SQL query below to get the relevant course/programme regulation data from our database. The data was then saved as a CSV file.

```sql
SELECT *
FROM lse_doc
WHERE url ILIKE '%courseGuides%'
   OR url ILIKE '%programmeRegulations%';
```

This filters for URLs that contain 'courseGuides' or 'programmeRegulations'. I chose these manually by browsing the LSE calendar webpages.


In [None]:
# Folder holding the exported CSV. Defaults to the original author's local checkout;
# set the CHATLSE_DATA_DIR environment variable to point somewhere else.
# (`__file__` is not defined inside a notebook, so the path cannot be derived from it.)
DATA_FOLDER = os.environ.get("CHATLSE_DATA_DIR", "/Users/jamie/Desktop/chatlse2024/chat-lse/data")
DATA_FILE = os.path.join(DATA_FOLDER, "data-1726582420261.csv")

# This should take about 1 minute
df = pd.read_csv(DATA_FILE)

In [None]:
# Quick look at the loaded frame: first rows, column names, then (rows, columns).
print(df.head(), df.columns, df.shape, sep="\n")


## 2. Construct the Graph using NetworkX

### 2.1 Prepare Embeddings
We must first convert the embeddings, which are currently stored as strings, into NumPy arrays.

In [None]:
# Type of the first stored embedding. Positional access (.iloc) is used so the
# check does not depend on a row labelled 0 being present in the index.
print(type(df['embedding'].iloc[0]))
# Number of embeddings (one per row)
print(len(df['embedding']))

For the purpose of testing, we use a small subset of the data.

In [None]:
# Keep only the rows from position 54000 onwards as a smaller test subset.
# reset_index gives the subset a fresh 0..n-1 index, so row labels match row
# positions (and therefore the row/column numbers of any matrix built from this
# frame). It also returns a new frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
df = df.iloc[54000:].reset_index(drop=True)

In [None]:
def convert_embedding(embedding_str):
    """Parse the string form of an embedding into a NumPy array.

    The string is evaluated with ``ast.literal_eval`` (Python literals only)
    and the resulting object is wrapped in ``np.array``.
    """
    parsed_values = ast.literal_eval(embedding_str)
    return np.array(parsed_values)

# Only parse when the column still holds raw strings (judged from the first row);
# otherwise the existing values are reused unchanged.
embeddings_are_strings = isinstance(df['embedding'].iloc[0], str)
df['embedding_array'] = (
    df['embedding'].apply(convert_embedding) if embeddings_are_strings else df['embedding']
)


### 2.2 Construct the Graph

In [None]:
# Empty undirected graph; nodes and edges are added in the following cells.
G = nx.Graph()

In [None]:
# One node per row, keyed by the row's 0-based position (not its index label).
# The edges added later use row/column positions of the similarity matrix as node
# ids, so keying nodes by position keeps the title/url attributes on the same
# nodes that receive edges.
for position, (title, url) in enumerate(zip(df['title'], df['url'])):
    G.add_node(position, title=title, url=url)


In [None]:
# cosine_similarity is already imported in the imports cell at the top of the notebook.

# Stack the per-row embedding arrays into a single matrix (one row per DataFrame row)
embedding_matrix = np.vstack(df['embedding_array'].to_numpy())

# Pairwise cosine similarity: entry [i, j] compares rows i and j of embedding_matrix
similarity_matrix = cosine_similarity(embedding_matrix)


In [None]:
similarity_threshold = 0.8  # pairs whose cosine similarity exceeds this value get an edge

In [None]:
# numpy is already imported as np in the imports cell at the top of the notebook.

# Mark pairs above the threshold, keeping only the strict upper triangle (k=1):
#  - the diagonal (each row compared with itself) is dropped by position rather
#    than by an "almost 1.0" cut-off, so distinct rows that happen to have
#    identical embeddings are still connected;
#  - each unordered pair (i, j) appears once instead of twice, which is all an
#    undirected graph needs.
above_threshold = np.triu(similarity_matrix > similarity_threshold, k=1)
row_positions, col_positions = np.nonzero(above_threshold)

# Add edges to the graph. Node ids are row positions in similarity_matrix;
# ids and weights are converted to plain Python int/float.
G.add_weighted_edges_from(
    (i, j, float(similarity_matrix[i, j]))
    for i, j in zip(row_positions.tolist(), col_positions.tolist())
)


In [None]:
import community as community_louvain

# The Louvain result depends on the library's random state, so the seed is fixed
# to get the same partition (and the same community ids) on every run.
LOUVAIN_SEED = 42

# Compute the best partition: a dict mapping each node to its community id
partition = community_louvain.best_partition(G, weight='weight', random_state=LOUVAIN_SEED)

# Add community information to nodes
nx.set_node_attributes(G, partition, 'community')


In [None]:
from collections import defaultdict

# Invert the node -> community mapping into community id -> list of member nodes
communities = defaultdict(list)
for member in partition:
    communities[partition[member]].append(member)


## 3. Observe the Community Structure

In [None]:
# One line per community: its id and how many nodes it contains
for cid in communities:
    member_count = len(communities[cid])
    print(f"Community {cid}: {member_count} nodes")


In [None]:
# Replace with the community ID you want to inspect
community_id_to_inspect = 841

# Get the nodes in the community. `.get` is used instead of indexing because
# `communities` is a defaultdict: indexing an id that does not exist would
# silently add an empty community to it.
nodes_in_community = communities.get(community_id_to_inspect, [])
if not nodes_in_community:
    print(f"Community {community_id_to_inspect} has no nodes - choose an id from the community list printed earlier.")

# Relabel rows 0..n-1 so that row labels equal row positions before the lookup below
df = df.reset_index(drop=True)

# Extract corresponding data (node ids are used as row labels)
community_df = df.loc[nodes_in_community]

# Display sample content
print(community_df[['title', 'url']])
