# Lignin Oligomer Cheminformatics Workflow

This notebook demonstrates database construction, diversity and cluster analysis, property prediction, and similarity search for lignin oligomer datasets, followed by atom-level and monomer-level visualization and network analysis.

In [None]:
# Setup
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw, AllChem, DataStructs, rdmolops
import networkx as nx


## Load JSON Data

In [None]:
# Read the oligomer records. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open('LigninStructs_3.json', encoding='utf-8') as f:
    data = json.load(f)
df = pd.json_normalize(data['ligninchains'])

# Expand bond counts columns.
# json_normalize flattens nested dicts by default, so a `bndCnts` dict is
# expected to arrive as `bndCnts.<key>` columns rather than as one `bndCnts`
# column. Both layouts are handled; a bond type that is absent counts as 0.
for key in ['BB', 'BO4', 'B5']:
    col = f'bndCnts.{key}'
    if col in df.columns:
        counts = df[col]
    elif 'bndCnts' in df.columns:
        # `key=key` binds the current loop value inside the lambda.
        counts = df['bndCnts'].apply(
            lambda x, key=key: x.get(key, 0) if isinstance(x, dict) else 0
        )
    else:
        counts = pd.Series(0, index=df.index)
    # Missing entries become 0 and the column is stored as integers so that
    # later cells can use the counts with range() and as INTEGER values.
    df[col] = pd.to_numeric(counts, errors='coerce').fillna(0).astype(int)
df.head()

## 1. Database Construction (SQLite and pandas DataFrame)

In [None]:
import sqlite3
from contextlib import closing

# DataFrame columns written to the table, in the same order as the columns
# of the CREATE TABLE statement below.
oligomer_columns = [
    'lg_id', 'smilestring', 'molWeight', 'bndCnts.BB',
    'bndCnts.BO4', 'bndCnts.B5', 'branchingFactor', 'evaluatedDP',
]
# One plain tuple per oligomer, ready to be bound to the INSERT placeholders.
oligomer_rows = list(df[oligomer_columns].itertuples(index=False, name=None))

# closing() guarantees the connection is closed even if a statement fails;
# the inner `with conn` commits on success and rolls back on error.
with closing(sqlite3.connect('lignin.db')) as conn:
    with conn:
        conn.execute('''CREATE TABLE IF NOT EXISTS lignin_oligomers (
    lg_id TEXT PRIMARY KEY,
    smilestring TEXT,
    molWeight REAL,
    BB INTEGER,
    BO4 INTEGER,
    B5 INTEGER,
    branchingFactor REAL,
    evaluatedDP INTEGER
)''')
        # INSERT OR REPLACE keeps the cell re-runnable: rows with an existing
        # lg_id are overwritten instead of violating the primary key.
        conn.executemany(
            "INSERT OR REPLACE INTO lignin_oligomers VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            oligomer_rows,
        )

## 2. Diversity and Cluster Analysis

In [None]:
# Diversity: Bond patterns and branching
bond_columns = ['bndCnts.BB', 'bndCnts.BO4', 'bndCnts.B5']

# Each distinct combination of bond counts, listed once.
print("Unique bond patterns:")
print(df[bond_columns].drop_duplicates())

# The newline is written as an escape sequence; a string literal cannot be
# split across two physical source lines.
print("\nBranching factor distribution:")
print(df['branchingFactor'].value_counts())

# Histogram of molecular weights drawn on an explicit axes object.
fig, ax = plt.subplots()
df['molWeight'].hist(bins=10, ax=ax)
ax.set_title('Molecular Weight Distribution')
ax.set_xlabel('Molecular Weight')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Cluster analysis: Hierarchical clustering on fingerprints
from sklearn.cluster import AgglomerativeClustering

# Build one Morgan fingerprint (radius 2, 1024 bits) per oligomer.
fp_rows = []
for lg_id, smi in zip(df['lg_id'], df['smilestring']):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Fail with the offending record named instead of an opaque error
        # from the fingerprint call. Rows cannot simply be skipped here
        # because the labels are assigned back to every row of df below.
        raise ValueError(f"Cannot parse SMILES for oligomer {lg_id!r}: {smi!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    # Placeholder array; this relies on ConvertToNumpyArray resizing the
    # destination to the fingerprint length.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    fp_rows.append(arr)

# Stack into a (number of oligomers) x (fingerprint length) matrix.
fps = np.array(fp_rows)

# Split the oligomers into two clusters and record the label of each row.
clustering = AgglomerativeClustering(n_clusters=2).fit(fps)
df['cluster'] = clustering.labels_
print(df[['lg_id','cluster']])

## 3. Property Prediction (ML Template)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Features: the three bond-count columns (missing values treated as 0).
X = df[['bndCnts.BB', 'bndCnts.BO4', 'bndCnts.B5']].fillna(0)
# Target: molecular weight of each oligomer.
y = df['molWeight']

# A fixed random_state makes the fitted forest, and therefore the reported
# importances, identical on every Restart & Run All.
model = RandomForestRegressor(random_state=42).fit(X, y)
print("Feature importances:", model.feature_importances_)


## 4. Similarity Search (Tanimoto)
Find the most similar oligomers to a query structure.

In [None]:
def tanimoto_search(query_smi, all_smis, topn=3, radius=2, n_bits=1024):
    """Rank SMILES strings by Tanimoto similarity to a query structure.

    Every structure is encoded as a Morgan bit-vector fingerprint and compared
    with the fingerprint of the query.

    Args:
        query_smi: SMILES string of the query structure.
        all_smis: Iterable of SMILES strings to search.
        topn: Maximum number of hits to return.
        radius: Morgan fingerprint radius (default 2).
        n_bits: Fingerprint length in bits (default 1024).

    Returns:
        A list of at most ``topn`` ``(smiles, similarity)`` tuples, ordered by
        decreasing similarity; ties keep the order of ``all_smis``.

    Raises:
        ValueError: If ``query_smi`` cannot be parsed.
    """
    def _fingerprint(smi):
        # Returns None when the SMILES cannot be parsed into a molecule.
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            return None
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    qfp = _fingerprint(query_smi)
    if qfp is None:
        raise ValueError(f"Query SMILES could not be parsed: {query_smi!r}")

    results = []
    for smi in all_smis:
        fp = _fingerprint(smi)
        if fp is None:
            # An unparsable candidate cannot be scored; leave it out of the
            # ranking rather than aborting the whole search.
            continue
        results.append((smi, DataStructs.TanimotoSimilarity(qfp, fp)))

    # The sort is stable, so equally similar hits stay in input order.
    return sorted(results, key=lambda hit: hit[1], reverse=True)[:topn]

# The first oligomer of the dataset serves as the query. It is also part of
# the searched set, so it is compared against itself as well.
query = df['smilestring'].iloc[0]
top_hits = tanimoto_search(query, df['smilestring'])
print(top_hits)

## 5. Atom-Level Visualization and Network Analysis

In [None]:
# For every oligomer: draw the structure, then summarise its atom-level graph
# (atoms as nodes, bonds as edges).
for _, chain in df.iterrows():
    mol = Chem.MolFromSmiles(chain['smilestring'])
    if mol is None or mol.GetNumAtoms() == 0:
        # Nothing to draw or analyse: an unparsable SMILES yields None, and
        # an atom-less molecule would make the connectivity check and the
        # per-node averages below fail.
        print(f"{chain['lg_id']}: no valid structure for SMILES {chain['smilestring']!r}, skipped")
        print('-'*40)
        continue

    img = Draw.MolToImage(mol, size=(400, 200))
    display(img)

    # Build an undirected graph from the atom adjacency matrix.
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()

    print(f"{chain['lg_id']}: Nodes={n_nodes}, Edges={n_edges}, Connected={nx.is_connected(G)}")
    print("Avg degree:", sum(dict(G.degree()).values()) / n_nodes)
    print("Avg clustering:", nx.average_clustering(G))
    print('-'*40)


## 6. Monomer-Level Network Analysis

In [None]:
def monomer_graph_from_counts(chain):
    """Build a schematic monomer-level graph from bond counts.

    The graph has one node per monomer, numbered ``1..DP``. Bonds are laid out
    as a linear chain: the k-th bond (taking all BB bonds first, then BO4,
    then B5) links monomer ``k`` to monomer ``k + 1`` and carries the bond
    type as its ``label``. Bonds that would reach past the last monomer are
    dropped. The layout is derived from the counts only, not from the actual
    connectivity of the molecule.

    Args:
        chain: Mapping-like record (e.g. a DataFrame row or a dict) providing
            ``evaluatedDP`` and ``bndCnts.BB`` / ``bndCnts.BO4`` /
            ``bndCnts.B5`` through ``.get``. A missing DP falls back to 3 and
            a missing bond count to 0.

    Returns:
        networkx.Graph with ``label`` attributes on nodes and edges.
    """
    def _as_count(value, default=0):
        # Counts read from a DataFrame row may be floats or NaN; range()
        # rejects floats and int() rejects NaN, so normalise them here.
        if value is None or pd.isna(value):
            return default
        return int(value)

    dp = _as_count(chain.get('evaluatedDP', 3), default=3)

    G = nx.Graph()
    for i in range(dp):
        G.add_node(i+1, label=f"Monomer {i+1}")

    # `position` is the 1-based index of the monomer the next bond starts at.
    # It advances for every bond, including those that no longer fit.
    position = 1
    for bond in ['BB','BO4','B5']:
        cnt = _as_count(chain.get(f'bndCnts.{bond}', 0))
        for _ in range(cnt):
            if position < dp:
                G.add_edge(position, position + 1, label=bond)
            position += 1
    return G

# Draw the monomer-level graph of every oligomer as its own figure.
for _, oligomer in df.iterrows():
    monomer_net = monomer_graph_from_counts(oligomer)
    nx.draw(monomer_net, node_color='orange', with_labels=True)
    plt.title(f"Oligomer {oligomer['lg_id']} (Monomer-level)")
    plt.show()
