# NB12 - Explere NetworkX for course information

## 1. Download Dataset

In [2]:
import pandas as pd
import os
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import ast


I used the SQL query below to get the relevent course/programme regulation data from our database. The data was then saved as a CSV file.

```sql
SELECT *
FROM lse_doc
WHERE url ILIKE '%courseGuides%'
   OR url ILIKE '%programmeRegulations%';
```

This filters for urls that contain 'courseGuides' or 'programmeRegulations'. I chose these manually by browsing the lse calendar webpages.


In [103]:
#DATA_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'data')
#This should take about 1 minute
df = pd.read_csv("/Users/jamie/Desktop/chatlse2024/chat-lse/data/data-1726582420261.csv")  

In [25]:
print(df.head())
print(df.columns)
print(df.shape)


                                      id                            doc_id  \
0  da348735232470b30072b605c6f03280_3339  da348735232470b30072b605c6f03280   
1  da348735232470b30072b605c6f03280_3340  da348735232470b30072b605c6f03280   
2  da348735232470b30072b605c6f03280_3341  da348735232470b30072b605c6f03280   
3  da348735232470b30072b605c6f03280_3344  da348735232470b30072b605c6f03280   
4  da348735232470b30072b605c6f03280_3345  da348735232470b30072b605c6f03280   

   chunk_id type                                                url  \
0      3339  pdf  https://info.lse.ac.uk/staff/divisions/academi...   
1      3340  pdf  https://info.lse.ac.uk/staff/divisions/academi...   
2      3341  pdf  https://info.lse.ac.uk/staff/divisions/academi...   
3      3344  pdf  https://info.lse.ac.uk/staff/divisions/academi...   
4      3345  pdf  https://info.lse.ac.uk/staff/divisions/academi...   

                                title  \
0  CourseGuidesProgrammeRegs22-23.pdf   
1  CourseGuidesProgram

## 2. Construct the Graph using NetworkX

### 2.1 Prepare Embeddings
We must first convert the embeddings, currently type string, to numpy arrays.

In [66]:
print(type(df['embedding'][0]))
print(len(df['embedding']))

<class 'str'>
54842


For the purpose of testing, we use a small subset of the data.

In [104]:
# take only the 2000 rows from the middle of the dataframe
df = df[54000:]

In [105]:
def convert_embedding(embedding_str):
    return np.array(ast.literal_eval(embedding_str))

if isinstance(df['embedding'].iloc[0], str):
    df['embedding_array'] = df['embedding'].apply(convert_embedding)
else:
    df['embedding_array'] = df['embedding']


### 2.2 Construct the Graph

In [106]:
G = nx.Graph()

In [107]:
for idx, row in df.iterrows():
    G.add_node(idx, title=row['title'], url=row['url'])


In [108]:
from sklearn.metrics.pairwise import cosine_similarity

# Stack embeddings into a matrix
embedding_matrix = np.vstack(df['embedding_array'].values)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(embedding_matrix)


In [109]:
similarity_threshold = 0.8  

In [110]:
import numpy as np

# Get indices where similarity is above the threshold (excluding self-similarity)
indices = np.where((similarity_matrix > similarity_threshold) & (similarity_matrix < 0.99999))

# Add edges to the graph
for i, j in zip(indices[0], indices[1]):
    G.add_edge(i, j, weight=similarity_matrix[i, j])


In [34]:
pip install python-louvain


Note: you may need to restart the kernel to use updated packages.


In [111]:
import community as community_louvain

# Compute the best partition
partition = community_louvain.best_partition(G, weight='weight')

# Add community information to nodes
nx.set_node_attributes(G, partition, 'community')


In [112]:
from collections import defaultdict

# Create a dictionary to hold communities
communities = defaultdict(list)
for node, community_id in partition.items():
    communities[community_id].append(node)


In [113]:
for community_id, nodes in communities.items():
    print(f"Community {community_id}: {len(nodes)} nodes")


Community 0: 1 nodes
Community 1: 1 nodes
Community 2: 1 nodes
Community 3: 1 nodes
Community 4: 1 nodes
Community 5: 1 nodes
Community 6: 1 nodes
Community 7: 1 nodes
Community 8: 1 nodes
Community 9: 1 nodes
Community 10: 1 nodes
Community 11: 1 nodes
Community 12: 1 nodes
Community 13: 1 nodes
Community 14: 1 nodes
Community 15: 1 nodes
Community 16: 1 nodes
Community 17: 1 nodes
Community 18: 1 nodes
Community 19: 1 nodes
Community 20: 1 nodes
Community 21: 1 nodes
Community 22: 1 nodes
Community 23: 1 nodes
Community 24: 1 nodes
Community 25: 1 nodes
Community 26: 1 nodes
Community 27: 1 nodes
Community 28: 1 nodes
Community 29: 1 nodes
Community 30: 1 nodes
Community 31: 1 nodes
Community 32: 1 nodes
Community 33: 1 nodes
Community 34: 1 nodes
Community 35: 1 nodes
Community 36: 1 nodes
Community 37: 1 nodes
Community 38: 1 nodes
Community 39: 1 nodes
Community 40: 1 nodes
Community 41: 1 nodes
Community 42: 1 nodes
Community 43: 1 nodes
Community 44: 1 nodes
Community 45: 1 node

In [121]:
# Replace with the community ID you want to inspect
community_id_to_inspect = 841

# Get the nodes in the community
nodes_in_community = communities[community_id_to_inspect]


df = df.reset_index(drop=True)



# Extract corresponding data
community_df = df.loc[nodes_in_community]

# Display sample content
print(community_df[['title', 'url']])


KeyError: "None of [Index([54841], dtype='int64')] are in the [index]"