In [1]:
import pandas as pd
import networkx as nx
import numpy as np
from utils import pruning, filtering, similarity, complex_detection, evalution

In [2]:
biogrid_ppi_dataset_path = '/Users/mthorne/git/prorank/datasets/BIOGRID-PUBLICATION-21817-5.0.251.tab3.txt'
fasta_dataset_path = '/Users/mthorne/git/prorank/datasets/uniprot_sprot.fasta'
cyc2008_dataset_path = '/Users/mthorne/git/prorank/datasets/CYC2008.txt'


### 1. Load PPI dataset

In [3]:
df = pd.read_csv(biogrid_ppi_dataset_path,sep='\t')
df = df[['Systematic Name Interactor A', 'Systematic Name Interactor B']]
print(df.head())
edges = df.values.tolist()
G = nx.Graph()
G.add_edges_from(edges)
print(f'Number of edges: {len(G.edges)}\nNumber of nodes: {len(G.nodes)}')

  Systematic Name Interactor A Systematic Name Interactor B
0                      YBR119W                      YPL178W
1                      YBR119W                      YML046W
2                      YBR119W                      YKL012W
3                      YBR119W                      YDR235W
4                      YBR119W                      YIL061C
Number of edges: 6531
Number of nodes: 1430


### 2. Prune PPI graph

In [4]:
pruning.adjustcd_iterative(G, threshold=0.125, max_iter=1)
print(f'Number of edges: {len(G.edges)}\nNumber of nodes: {len(G.nodes)}')

Number of edges: 4191
Number of nodes: 910


### 3. Identify Proteins to Filter

In [5]:
bridge_proteins = filtering.identify_bridge_proteins(G)
fjord_proteins = filtering.identify_fjord_proteins(G)
shore_proteins = filtering.identify_shore_proteins(G)

Found: 13 bridge nodes 
Found: 138 fjord nodes 
Found: 193 shore nodes 


### 4. Calculate Similarity Matrix

In [6]:
# Get Fasta seuqences for proteins in PPI Graph
ppi_fasta_sequences_path = '/Users/mthorne/git/prorank/datasets/ppi_fasta_sequences.fasta'
fasta_output_path = '/Users/mthorne/git/prorank/datasets/fasta_output.txt'
similarity.get_fasta_sequences_for_ppi(G, biogrid_ppi_dataset_path, fasta_dataset_path, ppi_fasta_sequences_path)


Created mapping for 1430 proteins
Loaded 573661 sequences from FASTA

Wrote 910 protein sequences to /Users/mthorne/git/prorank/datasets/ppi_fasta_sequences.fasta
Proteins without sequences: 0


#### Run Fasta algorithm
```
/Users/mthorne/git/prorank/fasta36/bin/fasta36 \
  -E 10000 -b 1430 -d 0 -m 8CB \
  /Users/mthorne/git/prorank/datasets/ppi_fasta_sequences.fasta \
  /Users/mthorne/git/prorank/datasets/ppi_fasta_sequences.fasta \
  > /Users/mthorne/git/prorank/datasets/fasta_output.txt
```

In [7]:
# Parse the Fasta output and create simlarity matrix
G_similarity = similarity.create_similarity_graph(G, fasta_output_path)

Parsed scores for 910 query proteins
Creating similarity matrix for 910 proteins

Filled 828100 similarity scores
Matrix coverage: 100.00%

Non-zero scores: 828100
Score range: 11.0 to 7231.0

Sample of similarity matrix:
         YAL001C  YAL002W  YAL007C  YAL013W  YAL016W
YAL001C   1703.5     15.5     14.5     16.0     17.0
YAL002W     15.5   1897.7     17.0     15.1     16.8
YAL007C     14.5     17.0    312.0     15.3     12.9
YAL013W     16.0     15.1     15.3    613.4     13.8
YAL016W     17.0     16.8     13.2     13.8    921.4
Created directed graph with 910 nodes and 828100 edges


### 5. Pagerank

In [8]:
# Run PageRank algorithm on the directed similarity graph
pagerank_scores = nx.pagerank(G_similarity, weight='weight')

# Convert to DataFrame for easier analysis
pagerank_df = pd.DataFrame.from_dict(pagerank_scores, orient='index', columns=['PageRank'])
pagerank_df = pagerank_df.sort_values('PageRank', ascending=False)

print(f"PageRank scores computed for {len(pagerank_scores)} nodes")
print(f"\nTop 5 proteins by PageRank score:")
print(pagerank_df.head(5))

PageRank scores computed for 910 nodes

Top 5 proteins by PageRank score:
         PageRank
YOL086C  0.001107
YOR063W  0.001107
YOR136W  0.001106
YGR234W  0.001106
YPR103W  0.001106


### 6. Complex Detection

In [9]:
# Concatenate bridge, fjord, and shore proteins into a single array
filtered_proteins = bridge_proteins + fjord_proteins + shore_proteins

# Find complexes using the combined filtered proteins
complexes = complex_detection.find_complexes(G, pagerank_scores, filtered_proteins)

Detected 292 complexes from 881 proteins
Excluded 344 bridge proteins


In [10]:
# Merge similar complexes
merged_complexes = complex_detection.merge_similar_complexes(complexes, similarity_threshold=0.5)

# Display summary
print(f"\nMerged complex size distribution:")
merged_sizes = sorted([len(proteins) for proteins in merged_complexes.values()], reverse=True)
print(f"  Min: {min(merged_sizes)}, Max: {max(merged_sizes)}, Mean: {np.mean(merged_sizes):.1f}, Median: {np.median(merged_sizes):.1f}")
print(f"\nTop 5 largest merged complexes:")
for i, size in enumerate(merged_sizes[:5], 1):
    print(f"  {i}. {size} proteins")

Merged 292 complexes into 292 complexes
Merged 0 complexes

Merged complex size distribution:
  Min: 1, Max: 28, Mean: 3.0, Median: 2.0

Top 5 largest merged complexes:
  1. 28 proteins
  2. 18 proteins
  3. 16 proteins
  4. 14 proteins
  5. 14 proteins


### 7. Evaluation

In [11]:
reference_complexes = evalution.load_reference_complexes(cyc2008_dataset_path)
print(f"Loaded {len(reference_complexes)} reference complexes from CYC2008")


Loaded 410 reference complexes from CYC2008


In [12]:
for i, (key, value) in enumerate(list(merged_complexes.items())[:5]):
    print(f"Complex {key}: {value}")
# Print first few items from reference_complexes
for i, (key, value) in enumerate(list(reference_complexes.items())[:5]):
    print(f"Complex {key}: {value}")
print(type(merged_complexes), type(reference_complexes))


Complex 1: ['YBR142W', 'YDR060W', 'YDR101C', 'YDR496C', 'YER006W', 'YER126C', 'YFL002C', 'YFR001W', 'YFR031C-A', 'YGL099W', 'YGR103W', 'YGR245C', 'YHR052W', 'YKR081C', 'YLL034C', 'YLR074C', 'YLR449W', 'YMR049C', 'YNL061W', 'YNL110C', 'YNR053C', 'YOL077C', 'YOL120C', 'YOR063W', 'YOR206W', 'YPL012W', 'YPL043W', 'YPR016C']
Complex 2: ['YDL055C', 'YDR127W', 'YER095W', 'YFR009W', 'YGL195W', 'YGR234W', 'YKL104C', 'YOR133W', 'YPR010C']
Complex 3: ['YBL041W', 'YDL147W', 'YER094C', 'YFL007W', 'YFR050C', 'YGL011C', 'YGR135W', 'YHR200W', 'YJL001W', 'YMR314W', 'YOL038W', 'YOR362C', 'YPR103W']
Complex 4: ['YEL036C', 'YER017C', 'YGL167C', 'YGR132C', 'YGR231C', 'YJL183W', 'YLR342W', 'YMR089C', 'YPL050C']
Complex 5: ['YML092C', 'YMR308C']
Complex 1: ['YKR068C', 'YML077W', 'YDR108W', 'YGR166W', 'YDR407C', 'YMR218C', 'YBR254C', 'YDR246W', 'YDR472W', 'YOR115C']
Complex 2: ['YNL021W', 'YDR295C', 'YPR179C']
Complex 3: ['YDR448W', 'YOR023C', 'YGR252W', 'YPL254W', 'YDR176W']
Complex 4: ['YJL065C', 'YDR121W',

In [19]:
# Evaluate the merged complexes
print("="*80)
print("EVALUATION RESULTS")
print("="*80)

results = evalution.evaluate_complexes(reference_complexes, merged_complexes, match_threshold=0.9)

print(f"\nReference complexes: {results['num_reference_complexes']}")
print(f"Predicted complexes: {results['num_predicted_complexes']}")
print(f"\nMatched reference complexes: {results['num_matched_reference']} ({results['sensitivity']*100:.1f}%)")
print(f"Matched predicted complexes: {results['num_matched_predicted']} ({results['ppv']*100:.1f}%)")
print(f"\n{'='*80}")
print(f"PERFORMANCE METRICS:")
print(f"{'='*80}")
print(f"Sensitivity (Recall):    {results['sensitivity']:.4f} ({results['sensitivity']*100:.2f}%)")
print(f"PPV (Precision):         {results['ppv']:.4f} ({results['ppv']*100:.2f}%)")
print(f"F1 Score:                {results['f1_score']:.4f} ({results['f1_score']*100:.2f}%)")

print(f"\n{'='*80}")
print(f"Top 10 matches by Jaccard index:")
print(f"{'='*80}")
sorted_matches = sorted(results['best_matches'], key=lambda x: x[2], reverse=True)
for i, (ref_idx, pred_idx, score) in enumerate(sorted_matches[:10], 1):
    ref_size = len(reference_complexes[ref_idx])
    pred_size = len(merged_complexes[pred_idx])
    print(f"{i}. Ref complex {ref_idx} ({ref_size} proteins) ↔ Predicted complex {pred_idx+1} ({pred_size} proteins): Jaccard = {score:.4f}")
    print(reference_complexes[ref_idx])
    print(merged_complexes[pred_idx])

EVALUATION RESULTS

Reference complexes: 410
Predicted complexes: 292

Matched reference complexes: 9 (2.2%)
Matched predicted complexes: 9 (3.1%)

PERFORMANCE METRICS:
Sensitivity (Recall):    0.0220 (2.20%)
PPV (Precision):         0.0308 (3.08%)
F1 Score:                0.0256 (2.56%)

Top 10 matches by Jaccard index:
1. Ref complex 101 (5 proteins) ↔ Predicted complex 54 (5 proteins): Jaccard = 1.0000
['YKL135C', 'YPR029C', 'YPL259C', 'YHL019C', 'YLR170C']
['YHL019C', 'YKL135C', 'YLR170C', 'YPL259C', 'YPR029C']
2. Ref complex 19 (7 proteins) ↔ Predicted complex 60 (7 proteins): Jaccard = 1.0000
['YIL062C', 'YLR370C', 'YKL013C', 'YNR035C', 'YBR234C', 'YDL029W', 'YJR065C']
['YBR234C', 'YDL029W', 'YIL062C', 'YJR065C', 'YKL013C', 'YLR370C', 'YNR035C']
3. Ref complex 161 (3 proteins) ↔ Predicted complex 73 (3 proteins): Jaccard = 1.0000
['YPL001W', 'YEL056W', 'YLL022C']
['YEL056W', 'YLL022C', 'YPL001W']
4. Ref complex 29 (6 proteins) ↔ Predicted complex 74 (6 proteins): Jaccard = 1.0000