# Get Started

In [15]:
import pandas as pd
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load a pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"  # You can use other models like 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Ensure model is in evaluation mode
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
df = pd.read_csv('data.csv', encoding = "cp1252")

In [5]:
df

Unnamed: 0,Entity,Contact,Contact Email,Environment,Purpose,Description,Website,Associated College,Associated School,Parent,Children,Type,Focuses,Engagment Model,Information Verified
0,Advanced Manufacturing Pilot Facility (AMPF),Aaron Stebner,aaron.stebner@gatech.edu,Internal,Provides a platform for developing and demonst...,The Advanced Manufacturing Pilot Facility (AMP...,https://ampf.research.gatech.edu/,,,Georgia Tech Manufacturing Institute,,Research/Industry,Technology and Automation,Research,False
1,Advanced Technology Development Center (ATDC),Alex Rhodeen,,External,Supports startups in scaling their operations ...,The Advanced Technology Development Center (AT...,https://atdc.org/,,,,,External,"Technology and Automation, Supply Chain Planni...",Industry Forum/Consortium,False
2,Artificial Intelligence Institute for Advances...,Pascal Van Hentenryck,,External,Conducts research and development in AI to opt...,This NSF Artificial Intelligence (AI) Research...,https://www.ai4opt.org/,,,,,External,"Technology and Automation, Supply Chain Planni...","Research, Education Innitiaves, Industry Forum...",False
3,Atlanta Regional Commission (ARC),Anna Roach,,External,Coordinates regional planning and development ...,The Atlanta Regional Commission (ARC) is the o...,https://atlantaregional.org/,,,,,External,"Supply Chain Planning, Supply Chain Analytics",Industry Forum/Consortium,False
4,Brook Byers Institute for Sustainable Systems ...,Beril Toktay,beril.toktay@scheller.gatech.edu,Internal,Promotes sustainability through research and c...,The Brook Byers Institute for Sustainable Syst...,https://sustainability.gatech.edu/,,,Enterprise Innovation Institute (EI2),,Research/Industry,Sustainability and Green Logistics,Research,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Metro Atlanta Chamber of Commerce (MACOC),Robert Herrig,rherrig@macoc.com,External,"Represents businesses, colleges, universities,...",The Metro Atlanta Chamber (MAC) is a 165-year-...,https://www.metroatlantachamber.com/,,,,,,,,
76,Partnership for Inclusive Innovation,Rachel Cronin,Rachel.Cronin@innovate.gatech.edu,External,Promotes innovation and prosperity for all Geo...,The Partnership for Inclusive Innovation (PIN)...,https://pingeorgia.org,,,,,,,,
77,Sentient Immersive Response Networks Lab (SIRe...,,,External,Drives collaborative research and training in ...,"On November 15, 2019, IMT Mines Albi and the G...",https://www.imt-mines-albi.fr/en/actus/launchi...,,,,,,,,
78,Smart Sea Level Sensors Project,Russ Clark,russ.clark@gatech.edu,External,Enhance flood monitoring and response in Chath...,The Smart Sea Level Sensors Project aims to in...,https://www.sealevelsensors.org/about/,,,,,,,,


# Embed

In [7]:
def embed_text(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    # Pass through BERT model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract the embeddings (we'll use the [CLS] token's embedding as a summary)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.squeeze().numpy()

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Entity                 80 non-null     object
 1   Contact                69 non-null     object
 2   Contact Email          57 non-null     object
 3   Environment            80 non-null     object
 4   Purpose                79 non-null     object
 5   Description            79 non-null     object
 6   Website                80 non-null     object
 7   Associated College     28 non-null     object
 8   Associated School      17 non-null     object
 9   Parent                 12 non-null     object
 10  Children               4 non-null      object
 11  Type                   64 non-null     object
 12  Focuses                64 non-null     object
 13  Engagment Model        13 non-null     object
 14  Information Verified   64 non-null     object
 15  Purpose_Embedding      80

In [9]:
df['Purpose_Embedding'] = df['Purpose'].astype(str).apply(embed_text)
df['Description_Embedding'] = df['Description'].astype(str).apply(embed_text)

                                               Entity  \
0        Advanced Manufacturing Pilot Facility (AMPF)   
1       Advanced Technology Development Center (ATDC)   
2   Artificial Intelligence Institute for Advances...   
3                   Atlanta Regional Commission (ARC)   
4   Brook Byers Institute for Sustainable Systems ...   
..                                                ...   
75          Metro Atlanta Chamber of Commerce (MACOC)   
76               Partnership for Inclusive Innovation   
77  Sentient Immersive Response Networks Lab (SIRe...   
78                    Smart Sea Level Sensors Project   
79             The Logistics Institute – Asia Pacific   

                                              Purpose  \
0   Provides a platform for developing and demonst...   
1   Supports startups in scaling their operations ...   
2   Conducts research and development in AI to opt...   
3   Coordinates regional planning and development ...   
4   Promotes sustainability th

In [11]:
# Display the DataFrame with embeddings
df[['Entity', 'Purpose', 'Purpose_Embedding', 'Description', 'Description_Embedding']]

Unnamed: 0,Entity,Purpose,Purpose_Embedding,Description,Description_Embedding
0,Advanced Manufacturing Pilot Facility (AMPF),Provides a platform for developing and demonst...,"[-0.7475531, -0.28025112, -0.5108066, -0.09811...",The Advanced Manufacturing Pilot Facility (AMP...,"[-0.57778853, -0.2628846, -0.6567326, -0.44782..."
1,Advanced Technology Development Center (ATDC),Supports startups in scaling their operations ...,"[-0.7059361, -0.014833092, -0.519372, -0.11807...",The Advanced Technology Development Center (AT...,"[-0.8001564, -0.41133833, -0.7856436, 0.248099..."
2,Artificial Intelligence Institute for Advances...,Conducts research and development in AI to opt...,"[-0.8444699, 0.042686164, -0.92527217, -0.0121...",This NSF Artificial Intelligence (AI) Research...,"[-0.6534205, -0.22116624, -0.3180426, 0.109655..."
3,Atlanta Regional Commission (ARC),Coordinates regional planning and development ...,"[-0.9401546, -0.2008859, -0.8348193, -0.138634...",The Atlanta Regional Commission (ARC) is the o...,"[-0.5959657, 0.004410204, -0.7242155, -0.17405..."
4,Brook Byers Institute for Sustainable Systems ...,Promotes sustainability through research and c...,"[-0.4857487, 0.15720437, -0.36733863, -0.27873...",The Brook Byers Institute for Sustainable Syst...,"[-0.6693135, -0.14842133, -0.5588188, -0.12038..."
...,...,...,...,...,...
75,Metro Atlanta Chamber of Commerce (MACOC),"Represents businesses, colleges, universities,...","[-0.8119428, -0.16800997, -0.65266424, -0.2421...",The Metro Atlanta Chamber (MAC) is a 165-year-...,"[-0.4659681, -0.43856305, -0.20382541, -0.3299..."
76,Partnership for Inclusive Innovation,Promotes innovation and prosperity for all Geo...,"[-0.806397, -0.08502055, -1.0574926, -0.420601...",The Partnership for Inclusive Innovation (PIN)...,"[-0.5760279, -0.3649536, -0.9441599, -0.242495..."
77,Sentient Immersive Response Networks Lab (SIRe...,Drives collaborative research and training in ...,"[-0.82316875, 0.09686072, -0.31321028, 0.03096...","On November 15, 2019, IMT Mines Albi and the G...","[-0.65241796, -0.13074902, -0.33428797, -0.093..."
78,Smart Sea Level Sensors Project,Enhance flood monitoring and response in Chath...,"[-1.0763615, -0.4297121, -0.7915945, -0.108039...",The Smart Sea Level Sensors Project aims to in...,"[-0.93810964, -0.528288, -0.5730109, -0.102771..."


# Analysis

In [18]:
from sklearn.decomposition import PCA

# Reduce Purpose embeddings to 2D
df['Purpose_Embedding_2D'] = PCA(n_components=2).fit_transform(list(df['Purpose_Embedding']))

# Reduce Description embeddings to 2D
df['Description_Embedding_2D'] = PCA(n_components=2).fit_transform(list(df['Description_Embedding']))

ValueError: Expected a 1D array, got an array with shape (80, 2)

In [12]:
# Cosine similarity matrix for Purpose embeddings
similarity_matrix = cosine_similarity(list(df['Purpose_Embedding']))

In [16]:
# Set threshold for edge creation (e.g., 0.8)
threshold = 0.8
edges = np.where(similarity_matrix > threshold)
edge_list = [(i, j) for i, j in zip(edges[0], edges[1]) if i != j]

In [21]:
import networkx as nx
import plotly.graph_objects as go

G = nx.Graph()
for idx, entity in enumerate(df['Entity']):
    G.add_node(idx, label=entity)

# Add edges based on similarity
G.add_edges_from(edge_list)

# Use NetworkX spring layout (force-directed layout)
pos = nx.spring_layout(G)

# Extract positions for nodes
x_coords = [pos[i][0] for i in range(len(G.nodes))]
y_coords = [pos[i][1] for i in range(len(G.nodes))]

# Plot nodes in Plotly
node_trace = go.Scatter(
    x=x_coords,
    y=y_coords,
    mode='markers+text',
    marker=dict(size=10, color='skyblue'),
    text=[G.nodes[i]['label'] for i in range(len(G.nodes))],
    textposition="bottom center"
)

# Plot edges in Plotly
edge_trace = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_trace.append(
        go.Scatter(
            x=(x0, x1),
            y=(y0, y1),
            mode='lines',
            line=dict(width=0.5, color='gray')
        )
    )

In [23]:
fig = go.Figure(data=[node_trace] + edge_trace)
fig.update_layout(title="Network Graph of Embeddings (Without PCA)", showlegend=False)

fig.show()