In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import re
import torch
from itertools import combinations
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, util
import spacy
from pyvis.network import Network

In [28]:
def load_csv(path):
    """Load CSV robustly with fallback encodings."""
    for enc in ['utf-8', 'latin1', 'ISO-8859-1']:
        try:
            return pd.read_csv(path, encoding=enc, on_bad_lines='skip')
        except Exception as e:
            print(f"⚠️ Failed with {enc}: {e}")
    return pd.read_csv(path, encoding='latin1', on_bad_lines='skip')

# Load all CSVs
tender_df = load_csv("tender.csv")
mom_df    = load_csv("mom.csv")
work_df   = load_csv("duringWork.csv")

# Clean and unify column names
for df in [tender_df, mom_df, work_df]:
    df.columns = [c.strip().lower() for c in df.columns]

# Add missing 'subfolder' column if not present
for df in [tender_df, mom_df, work_df]:
    if 'subfolder' not in df.columns:
        df['subfolder'] = None

# Add dataset identifiers
tender_df["source"] = "Tender"
mom_df["source"]    = "MOM"
work_df["source"]   = "DuringWork"

# Keep only required columns (fill missing automatically)
common_cols = ["filename", "subfolder", "description", "file_type", "source"]
tender_df = tender_df.reindex(columns=common_cols)
mom_df    = mom_df.reindex(columns=common_cols)
work_df   = work_df.reindex(columns=common_cols)

# Merge all datasets
docs = pd.concat([tender_df, mom_df, work_df], ignore_index=True)
docs.dropna(subset=["description"], inplace=True)
docs.reset_index(drop=True, inplace=True)

print(f"✅ Loaded & normalized {len(docs)} total documents.")
print("Columns:", docs.columns.tolist())

# Optional sanity check
display(docs.sample(5))

⚠️ Failed with utf-8: 'utf-8' codec can't decode byte 0x92 in position 113: invalid start byte
✅ Loaded & normalized 259 total documents.
Columns: ['filename', 'subfolder', 'description', 'file_type', 'source']


Unnamed: 0,filename,subfolder,description,file_type,source
1,Amendment 2 to Bidding Documents,Tender Document.zip\Tender Document\Stage I\Am...,This file introduces early modifications to th...,Tender Amendment/Clarification,Tender
42,Thermac_Elec,Thermax Bid & MOMs.zip\Thermax Bid & MOMs\Post...,An Excel sheet containing electrical-related p...,Supporting Document,MOM
153,Daily Progress Report TL FGD KTPS 20.04.23.xlsx,,daily progress report of the date mentioned at...,General Project Document,DuringWork
252,TL DVC FGD WPR 31.03.23.xlsx,,This file is a Weekly Project Report for the F...,Weekly Progress Report,DuringWork
226,Letter no- 157_DVC.pdf,,This letter details major delays in FGD system...,Force Majeure Notice,DuringWork


In [29]:
print("Encoding documents with SentenceTransformer...")
model = SentenceTransformer('all-MiniLM-L6-v2')
docs["embedding"] = docs["description"].apply(lambda x: model.encode(str(x), convert_to_tensor=True))

Encoding documents with SentenceTransformer...


In [30]:
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Extract project names, issues, dates, and months."""
    doc = nlp(text)
    entities = set()

    # Named entities (ORG, GPE, DATE, EVENT)
    for ent in doc.ents:
        if ent.label_ in ["ORG", "GPE", "DATE", "EVENT"]:
            entities.add(ent.text.strip())

    # Months
    months = re.findall(r"(January|February|March|April|May|June|July|August|September|October|November|December|Jan'?\d{2,4}|Feb'?\d{2,4}|Mar'?\d{2,4}|Apr'?\d{2,4}|May'?\d{2,4}|Jun'?\d{2,4}|Jul'?\d{2,4}|Aug'?\d{2,4}|Sep'?\d{2,4}|Oct'?\d{2,4}|Nov'?\d{2,4}|Dec'?\d{2,4})", text, re.IGNORECASE)
    entities.update([m.capitalize() for m in months])

    # Issue/condition keywords
    issue_keywords = ["delay", "payment", "shortage", "strike", "rain", "monsoon",
                      "lockdown", "covid", "extension", "hindrance", "force majeure"]
    for kw in issue_keywords:
        if re.search(rf"\b{kw}\b", text, re.IGNORECASE):
            entities.add(kw.capitalize())

    return list(entities)


In [31]:
print("Building Knowledge Graph...")
G = nx.Graph()

# Add document nodes
for _, row in docs.iterrows():
    G.add_node(row["filename"], type=row["source"], desc=row["description"])

# Add extracted entity nodes
print("Extracting entities...")
for _, row in tqdm(docs.iterrows(), total=len(docs)):
    ents = extract_entities(row["description"])
    for ent in ents:
        if not G.has_node(ent):
            G.add_node(ent, type="Entity")
        G.add_edge(row["filename"], ent, label="mentions")

Building Knowledge Graph...
Extracting entities...


  0%|          | 0/259 [00:00<?, ?it/s]

In [32]:
print("Computing semantic similarities...")
threshold = 0.72  # tweak based on dataset size
for i, j in tqdm(list(combinations(range(len(docs)), 2))):
    sim = util.cos_sim(docs.iloc[i]["embedding"], docs.iloc[j]["embedding"]).item()
    if sim > threshold:
        G.add_edge(
            docs.iloc[i]["filename"],
            docs.iloc[j]["filename"],
            label=f"semantic_similarity ({sim:.2f})"
        )


Computing semantic similarities...


  0%|          | 0/33411 [00:00<?, ?it/s]

In [33]:
print("Adding project and temporal links...")

# Expanded project keyword list
project_keywords = [
    "FGD", "Flue Gas Desulphurization", "DVC", "Damodar Valley",
    "Koderma", "KTPS", "Thermax", "BHEL", "Unit", "Package", "Project",
    "Power Station", "Thermal Power", "FGD Package"
]

# Loop through all document descriptions
for _, row in docs.iterrows():
    text = row["description"]

    # Link months as Time nodes ---
    months = re.findall(
        r"(January|February|March|April|May|June|July|August|September|October|November|December)",
        text, re.IGNORECASE
    )
    for m in months:
        m = m.capitalize()
        if not G.has_node(m):
            G.add_node(m, type="Time")
        G.add_edge(row["filename"], m, label="time_related")

    # Link explicit project keywords ---
    for p in project_keywords:
        if re.search(rf"\b{p}\b", text, re.IGNORECASE):
            if not G.has_node(p):
                G.add_node(p, type="Project")
            G.add_edge(row["filename"], p, label="related_to_project")

    #Use NLP to auto-detect project/org names ---
    doc_nlp = nlp(text)
    for ent in doc_nlp.ents:
        if ent.label_ in ["ORG", "FAC", "GPE"]:
            ent_text = ent.text.strip()
            # filter out very short or generic names
            if len(ent_text) > 3 and not ent_text.isdigit():
                if not G.has_node(ent_text):
                    G.add_node(ent_text, type="Project")
                G.add_edge(row["filename"], ent_text, label="related_to_project")


Adding project and temporal links...


In [38]:
print("Adding semantic concept links (Delay, Payment, Shortage, etc.)...")

from sentence_transformers import SentenceTransformer, util

# load model once (you can reuse if already loaded earlier)
model = SentenceTransformer("all-MiniLM-L6-v2")

# define concept themes
concepts = {
    "Delay": "project delay, work hindrance, obstruction, slow progress, stoppage, lockdown, strike, force majeure, covid disruption",
    "Payment Issue": "delayed payment, fund shortage, pending bills, financial stress, cash flow issue, non-payment",
    "Material Shortage": "shortage of sand, cement, aggregates, material supply issue, delay due to materials",
    "Manpower Issue": "lack of workers, labour shortage, gate pass delay, manpower unavailability, staffing issue"
}

# encode concept vectors
concept_embeddings = {c: model.encode(desc, convert_to_tensor=True) for c, desc in concepts.items()}

# compare each document with each concept
for _, row in docs.iterrows():
    desc = str(row["description"])
    emb = model.encode(desc, convert_to_tensor=True)

    for concept, concept_emb in concept_embeddings.items():
        sim = util.cos_sim(concept_emb, emb).item()
        if sim > 0.45:  # threshold for semantic similarity
            if not G.has_node(concept):
                G.add_node(concept, type="Concept")
            G.add_edge(row["filename"], concept, label="semantically_related")

print("✅ Semantic concept linking complete!")


Adding semantic concept links (Delay, Payment, Shortage, etc.)...
✅ Semantic concept linking complete!


In [39]:
print("Total Nodes:", G.number_of_nodes())
print("Total Edges:", G.number_of_edges())

# Count node types
from collections import Counter
types = Counter(nx.get_node_attributes(G, "type").values())
print("Node types:", types)


Total Nodes: 405
Total Edges: 2895
Node types: Counter({'DuringWork': 142, 'Entity': 138, 'MOM': 79, 'Tender': 34, 'Project': 9, 'Concept': 3})


In [40]:
projects = [n for n, d in G.nodes(data=True) if d.get("type") == "Project"]
print("Project nodes:", projects)
if projects:
    print("\nDocuments connected to project:")
    for nbr in G.neighbors(projects[0]):
        print(f" - {nbr} ({G.nodes[nbr]['type']})")


Project nodes: ['Package', 'FGD Package', 'Project', 'Unit', 'Damodar Valley', 'Koderma', 'Power Station', 'Thermal Power', 'Flue Gas Desulphurization']

Documents connected to project:
 - Amendment 2 to Bidding Documents (Tender)
 - AMENDMENT-KTPS (Tender)
 - KTPS-PART A-1 (Tender)
 - KTPS-PART D (Tender)
 - KTPS-PART F (Tender)
 - SECTION VII (Tender)
 - Amendment 8 to Bidding Documents (Tender)
 - Amendment 9 to Bidding Documents (Tender)
 - Stage-II Bidding Documents (Tender)
 - Final Post_Bid_Tech & Comm. MOM_DVC Bulk (MOM)
 - MoM_QA_DVC_Thermac (MOM)
 - Thermac_Main (MOM)
 - Attachment 3K_Annexure 6_Oxidation Blowers_Aerzen Machines (MOM)
 - Attachment 3K_Annexure 17_VBF_Eimco_KCP (MOM)
 - 1_BID FORM (MOM)
 - Attachment 20 (MOM)
 - DVC-01_MPR Oct'2019.pdf (DuringWork)
 - DVC-30_MPR MAY'2022.pdf (DuringWork)
 - DVC-31_MPR JUN'2022.pdf (DuringWork)
 - DVC-32_MPR JUL'2022.pdf (DuringWork)
 - DVC-33_MPR Aug'2022.pdf (DuringWork)
 - DVC-34_MPR Sep'2022.pdf (DuringWork)
 - DVC-35_MPR O

In [41]:
print("Shortest path between 'Damodar Valley' and 'FGD Package':")
try:
    path = nx.shortest_path(G, "Damodar Valley", "FGD Package")
    print(" → ".join(path))
except nx.NetworkXNoPath:
    print("No path found.")


Shortest path between 'Damodar Valley' and 'FGD Package':
Damodar Valley → DVC-01_MPR Oct'2019.pdf → FGD → Amendment 2 to Bidding Documents → FGD Package


In [None]:
entity = "Delay"
if entity in G.nodes:
    print(f"Documents connected to '{entity}':")
    for nbr in G.neighbors(entity):
        print(f" - {nbr} ({G.nodes[nbr]['type']})")
else:
    print(f"'{entity}' not found in graph.")


📘 Documents connected to 'Delay':
 - Lockdown.pdf (DuringWork)
 - PM Letters.pdf (DuringWork)
 - Request for Issue of Form-lll for SIPPL.pdf (DuringWork)
 - 01 - Force Majeure Covid 19.pdf (DuringWork)
 - 02 - Force Majeure Covid 19.pdf (DuringWork)
 - Letter no- 159_DVC.pdf (DuringWork)


In [44]:
semantic_edges = [(u, v, d["label"]) for u, v, d in G.edges(data=True) if "semantic_similarity" in d["label"]]
print(f"Semantic edges count: {len(semantic_edges)}")
print("Sample:", semantic_edges[:5])


Semantic edges count: 1678
Sample: [('Amendment 1 to NIT', 'Amendment 2 to Bidding Documents', 'semantic_similarity (0.80)'), ('Amendment 1 to NIT', 'CLARIFICATION ON BIDDING DOCUMENTS_20122018', 'semantic_similarity (0.74)'), ('Amendment 1 to NIT', 'Amendment 3 to NIT', 'semantic_similarity (0.83)'), ('Amendment 1 to NIT', 'Reply to All Pre Bid Queries (Technical) - Bulk Tender', 'semantic_similarity (0.74)'), ('Amendment 1 to NIT', 'Amendment No. 5', 'semantic_similarity (0.83)')]


In [45]:
components = list(nx.connected_components(G))
print(f"Connected Components: {len(components)}")
largest = max(components, key=len)
print(f"Largest Component Size: {len(largest)} nodes")


Connected Components: 29
Largest Component Size: 355 nodes


In [47]:
print("Rendering interactive Knowledge Graph...")

net = Network(height="750px", width="100%", notebook=True,
              bgcolor="#ffffff", font_color="black", directed=False)
net.barnes_hut()

color_map = {
    "Tender": "#87CEEB",      # blue
    "DuringWork": "#90EE90",  # green
    "MOM": "#FFD700",         # yellow
    "Entity": "#FF7F7F",      # red
    "Project": "#FFA500",     # orange
    "Time": "#9370DB"         # purple
}

for node, data in G.nodes(data=True):
    color = color_map.get(data.get("type", ""), "#D3D3D3")
    net.add_node(node, label=node, color=color, title=data.get("desc", ""))

for src, dst, data in G.edges(data=True):
    net.add_edge(src, dst, title=data.get("label", ""))

net.show("knowledge_graph.html")
print("✅ Graph saved as knowledge_graph.html — open it in browser.")

Rendering interactive Knowledge Graph...
knowledge_graph.html
✅ Graph saved as knowledge_graph.html — open it in browser.


In [48]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# store embeddings for retrieval
doc_embeddings = []
for _, row in docs.iterrows():
    emb = model.encode(str(row["description"]), convert_to_tensor=True)
    doc_embeddings.append((row["filename"], emb, row["source"]))

print(f"Stored {len(doc_embeddings)} document embeddings.")

Stored 259 document embeddings.


In [49]:
def semantic_retrieve(query, top_k=5):
    q_emb = model.encode(query, convert_to_tensor=True)
    results = []
    for name, emb, source in doc_embeddings:
        sim = util.cos_sim(q_emb, emb).item()
        results.append((name, sim, source))
    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]


In [50]:
semantic_retrieve("reasons for work delay in August", top_k=5)


[('Letter no- 160_DVC.pdf', 0.42702797055244446, 'DuringWork'),
 ('Delay in Issuing of Gate-Passes of Workers.pdf',
  0.4032321572303772,
  'DuringWork'),
 ('Letter no- 157_DVC.pdf', 0.39488762617111206, 'DuringWork'),
 ('Regarding Delay in Vendor Approval Samal Infra Projects Pvt. Ltd..pdf',
  0.3912941813468933,
  'DuringWork'),
 ('01 - Force Majeure Covid 19.pdf', 0.38478004932403564, 'DuringWork')]

In [57]:
nx.write_graphml(G, "knowledge_graph.graphml")

print("✅ Knowledge graph exported to 'knowledge_graph.graphml'")

✅ Knowledge graph exported to 'knowledge_graph.graphml'
