# Import Libraries 

In [51]:
# Library imports 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from Bio.PDB import PDBList, PDBParser
import py3Dmol 
import requests 
import os
from scipy.stats import linregress

## Fetch PDB From AlphaFold

In [52]:
def fetch_pdb_from_alphafold(uniprot_ids,folder_name):

    # Create the folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    pdb_dict = {}  # Dictionary to store UniProt ID and PDB content
    
    for uniprot_id in uniprot_ids:
        try:
            # AlphaFold API URL to get the prediction model
            url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"

            # Send GET request to the AlphaFold API
            response = requests.get(url)

            # Check if the request was successful
            if response.status_code == 200:
                data = response.json()  # Parse JSON data
                # Extract the PDB URL from the response
                pdb_url = data[0]['pdbUrl']
                print(f"AlphaFold predicted model PDB URL for {uniprot_id}: {pdb_url}")

                # Download the PDB file content
                pdb_response = requests.get(pdb_url)
                pdb_data = pdb_response.text  # Get the PDB data as text

                #Store PDB data in dictionary
                pdb_dict[uniprot_id] = pdb_data

                # Save the PDB data to a file
                pdb_filename = os.path.join(folder_name, f'{uniprot_id}.pdb')
                with open(pdb_filename, 'w') as pdb_file:
                    pdb_file.write(pdb_data)
                print(f"PDB file saved as {pdb_filename}")

            else:
                print(f"Failed to retrieve data for UniProt ID {uniprot_id}. Status code: {response.status_code}")
        
        except Exception as e:
            print(f"An error occurred for UniProt ID {uniprot_id}: {e}")


### Import Enzymes and Transcript Factors with their UniProt_ID

In [53]:
file_path = "/home/marshal/protonotebook/Management and analysis of Biological and Medical Data/Protein Complexity - Enzymes_Protogene.csv"

# Read 100 Enzymes
enzymes = pd.read_csv(file_path)

file_path_2 = "/home/marshal/protonotebook/Management and analysis of Biological and Medical Data/Protein Complexity - Transcription Factor_Protogene.csv"

# Read 100 Transcript Factors
transcript_factor = pd.read_csv(file_path_2)


### Display view Of Enzymes

In [54]:
#View Enzymes
enzymes

Unnamed: 0,Enzyme Name,UniProt ID
0,Alcohol dehydrogenase,Q8W3K0
1,Glucose oxidase,P13006
2,Xanthine oxidase,P47989
3,Lactate dehydrogenase,P00342
4,Malate dehydrogenase,P40925
...,...,...
95,Aldehyde dehydrogenase,P00352
96,Fructose-bisphosphatase,P09083
97,Hydroxyglutaryl-CoA synthase,P54867
98,Glutamate synthase,P32245


### Overview Transcriptor Factors

In [55]:
#View TF
transcript_factor


Unnamed: 0,Transcription Factor Name,UniProt ID
0,p53,P04637
1,RUNX1,P15063
2,SP1,P08047
3,TWIST1,Q15672
4,ESR1,P03372
...,...,...
95,IRF1,P10914
96,REL,Q04864
97,SRF,P11831
98,NR4A1,P22736


# Merge my finds with Outcomes(90 Proteins)

In [None]:
file_path_3 = "/home/marshal/protonotebook/Management and analysis of Biological and Medical Data/File_enzyme.csv"
file_path_4 = "/home/marshal/protonotebook/Management and analysis of Biological and Medical Data/File_TF.csv"


# Read 90 Enzymes
enzymes_90 = pd.read_csv(file_path_3)

#Read 90 Transcriptor Factor
tf_90 = pd.read_csv(file_path_4)


### Fecth PDB For Enzymes 

In [56]:
# Fetch and visualize structures for all the given UniProt IDs

enzymes_pdb = fetch_pdb_from_alphafold(enzymes['UniProt ID'],'enzymes_pdb')

AlphaFold predicted model PDB URL for Q8W3K0: https://alphafold.ebi.ac.uk/files/AF-Q8W3K0-F1-model_v4.pdb
PDB file saved as enzymes_pdb/Q8W3K0.pdb
AlphaFold predicted model PDB URL for P13006: https://alphafold.ebi.ac.uk/files/AF-P13006-F1-model_v4.pdb
PDB file saved as enzymes_pdb/P13006.pdb
AlphaFold predicted model PDB URL for P47989: https://alphafold.ebi.ac.uk/files/AF-P47989-F1-model_v4.pdb
PDB file saved as enzymes_pdb/P47989.pdb
AlphaFold predicted model PDB URL for P00342: https://alphafold.ebi.ac.uk/files/AF-P00342-F1-model_v4.pdb
PDB file saved as enzymes_pdb/P00342.pdb
AlphaFold predicted model PDB URL for P40925: https://alphafold.ebi.ac.uk/files/AF-P40925-F1-model_v4.pdb
PDB file saved as enzymes_pdb/P40925.pdb
AlphaFold predicted model PDB URL for P08200: https://alphafold.ebi.ac.uk/files/AF-P08200-F1-model_v4.pdb
PDB file saved as enzymes_pdb/P08200.pdb
AlphaFold predicted model PDB URL for P04035: https://alphafold.ebi.ac.uk/files/AF-P04035-F1-model_v4.pdb
PDB file sav

In [86]:
# Fetch and visualize structures for all the given UniProt IDs

new_enzymes_pdb = fetch_pdb_from_alphafold(enzymes_90['Enzyme_ID'],'new_enzymes_pdb')

AlphaFold predicted model PDB URL for  P00813 : https://alphafold.ebi.ac.uk/files/AF-P00813-F1-model_v4.pdb
PDB file saved as new_enzymes_pdb/ P00813 .pdb
AlphaFold predicted model PDB URL for  P00439 : https://alphafold.ebi.ac.uk/files/AF-P00439-F1-model_v4.pdb
PDB file saved as new_enzymes_pdb/ P00439 .pdb
AlphaFold predicted model PDB URL for  P34897 : https://alphafold.ebi.ac.uk/files/AF-P34897-F1-model_v4.pdb
PDB file saved as new_enzymes_pdb/ P34897 .pdb
AlphaFold predicted model PDB URL for  P21912 : https://alphafold.ebi.ac.uk/files/AF-P21912-F1-model_v4.pdb
PDB file saved as new_enzymes_pdb/ P21912 .pdb
AlphaFold predicted model PDB URL for  P47989 : https://alphafold.ebi.ac.uk/files/AF-P47989-F1-model_v4.pdb
PDB file saved as new_enzymes_pdb/ P47989 .pdb
AlphaFold predicted model PDB URL for  P50336 : https://alphafold.ebi.ac.uk/files/AF-P50336-F1-model_v4.pdb
PDB file saved as new_enzymes_pdb/ P50336 .pdb
AlphaFold predicted model PDB URL for  P53004 : https://alphafold.ebi.

### Fetch PDB for TF

In [57]:
# Fetch and visualize structures for all the given UniProt IDs

tf_pdb = fetch_pdb_from_alphafold(transcript_factor['UniProt ID'],'TFs_pdb')

AlphaFold predicted model PDB URL for P04637: https://alphafold.ebi.ac.uk/files/AF-P04637-F1-model_v4.pdb
PDB file saved as TFs_pdb/P04637.pdb
AlphaFold predicted model PDB URL for P15063: https://alphafold.ebi.ac.uk/files/AF-P15063-F1-model_v4.pdb
PDB file saved as TFs_pdb/P15063.pdb
AlphaFold predicted model PDB URL for P08047: https://alphafold.ebi.ac.uk/files/AF-P08047-F1-model_v4.pdb
PDB file saved as TFs_pdb/P08047.pdb
AlphaFold predicted model PDB URL for Q15672: https://alphafold.ebi.ac.uk/files/AF-Q15672-F1-model_v4.pdb
PDB file saved as TFs_pdb/Q15672.pdb
AlphaFold predicted model PDB URL for P03372: https://alphafold.ebi.ac.uk/files/AF-P03372-F1-model_v4.pdb
PDB file saved as TFs_pdb/P03372.pdb
AlphaFold predicted model PDB URL for P15976: https://alphafold.ebi.ac.uk/files/AF-P15976-F1-model_v4.pdb
PDB file saved as TFs_pdb/P15976.pdb
AlphaFold predicted model PDB URL for P43694: https://alphafold.ebi.ac.uk/files/AF-P43694-F1-model_v4.pdb
PDB file saved as TFs_pdb/P43694.pdb

In [85]:
# Fetch and visualize structures for all the given UniProt IDs

new_tf_pdb = fetch_pdb_from_alphafold(tf_90['TF_ID'],'new_TFs_pdb')

AlphaFold predicted model PDB URL for  P22736 : https://alphafold.ebi.ac.uk/files/AF-P22736-F1-model_v4.pdb
PDB file saved as new_TFs_pdb/ P22736 .pdb
AlphaFold predicted model PDB URL for  Q9H334 : https://alphafold.ebi.ac.uk/files/AF-Q9H334-F1-model_v4.pdb
PDB file saved as new_TFs_pdb/ Q9H334 .pdb
AlphaFold predicted model PDB URL for  P48436 : https://alphafold.ebi.ac.uk/files/AF-P48436-F1-model_v4.pdb
PDB file saved as new_TFs_pdb/ P48436 .pdb
AlphaFold predicted model PDB URL for  P28347 : https://alphafold.ebi.ac.uk/files/AF-P28347-F1-model_v4.pdb
PDB file saved as new_TFs_pdb/ P28347 .pdb
AlphaFold predicted model PDB URL for  Q06330 : https://alphafold.ebi.ac.uk/files/AF-Q06330-F1-model_v4.pdb
PDB file saved as new_TFs_pdb/ Q06330 .pdb
AlphaFold predicted model PDB URL for  P03372 : https://alphafold.ebi.ac.uk/files/AF-P03372-F1-model_v4.pdb
PDB file saved as new_TFs_pdb/ P03372 .pdb
AlphaFold predicted model PDB URL for  P15036 : https://alphafold.ebi.ac.uk/files/AF-P15036-F1

### Calculate Fractal Dimension

In [58]:
# Step 1: Parse the PDB file and extract atom coordinates
def extract_coordinates(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("protein", pdb_file)

    atom_coordinates = []
    for atom in structure.get_atoms():
        atom_coordinates.append(atom.get_coord())

    return np.array(atom_coordinates)

# Step 2: Calculate the bounding box for the protein structure
def calculate_bounding_box(coords):
    min_coords = coords.min(axis=0)
    max_coords = coords.max(axis=0)
    return min_coords, max_coords

# Step 3: Box counting function
def box_counting(coords, min_coords, max_coords, grid_size):
    n_boxes = np.ceil((max_coords - min_coords) / grid_size).astype(int)
    occupied_boxes = set()

    # Assign atoms to grid cells (boxes)
    for coord in coords:
        box_idx = tuple(((coord - min_coords) // grid_size).astype(int))
        occupied_boxes.add(box_idx)

    return len(occupied_boxes)

# Step 4: Perform box counting over different grid sizes
def perform_box_counting(coords, min_coords, max_coords):
    # Calculate the largest bounding box dimension (max_length)
    max_length = np.max(max_coords - min_coords)  # Get the largest bounding box dimension

    # Generate grid sizes from max_length/100 to max_length/2, with 15 grid sizes
    grid_sizes = np.logspace(np.log10(max_length/100), np.log10(max_length/2), num=15)

    occupied_boxes_per_grid = []
    for grid_size in grid_sizes:
        occupied_boxes = box_counting(coords, min_coords, max_coords, grid_size)
        occupied_boxes_per_grid.append(occupied_boxes)
    
    return grid_sizes, occupied_boxes_per_grid

# Step 5: Calculate and plot fractal dimension
def calculate_fractal_dimension(grid_sizes, occupied_boxes_per_grid):
    log_occupied_boxes = np.log(occupied_boxes_per_grid)
    log_inverse_grid_size = np.log(1 / grid_sizes)

    # Perform linear regression on log-log data to find the slope (fractal dimension)
    slope, intercept, r_value, p_value, std_err = linregress(log_inverse_grid_size, log_occupied_boxes)
    fractal_dimension = slope
    print(f"Estimated Fractal Dimension: {fractal_dimension:.4f}")
    return fractal_dimension

# Step 6: Main function to estimate the fractal dimension of a protein structure
def estimate_fractal_dimension(pdb_file):
    # Extract coordinates
    coords = extract_coordinates(pdb_file)

    # Calculate bounding box
    min_coords, max_coords = calculate_bounding_box(coords)

    # Perform box counting
    grid_sizes, occupied_boxes_per_grid = perform_box_counting(coords, min_coords, max_coords)

    # Calculate and plot fractal dimension
    fractal_dimension = calculate_fractal_dimension(grid_sizes, occupied_boxes_per_grid)

    return fractal_dimension

## Fractal Dimension For all Enzymes

In [59]:

def process_pdb_files(folder_name):

    # Initialize an empty list to store enzyme names and fractal dimensions
    data = []

    # Loop over all PDB files in the folder
    for pdb_file in os.listdir(folder_name):
        if pdb_file.endswith(".pdb"):
            enzyme_name = pdb_file.split('.')[0]  # Extract enzyme name from the file name
            
            # Construct the full path to the PDB file
            pdb_path = os.path.join(folder_name, pdb_file)

            try:
                # Call your custom function to estimate fractal dimension for the pdb file
                fractal_dimension = estimate_fractal_dimension(pdb_path)
                
                # Append the result to the data list
                data.append([enzyme_name, fractal_dimension])

            except Exception as e:
                print(f"An error occurred while processing {pdb_file}: {e}")

    # Create a Pandas DataFrame with enzyme names and fractal dimensions
    df = pd.DataFrame(data, columns=['Enzyme Name', 'Fractal Dimension'])
    
    # Optionally, save the DataFrame to a CSV file
    output_csv = os.path.join(folder_name, 'fractal_dimensions.csv')
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

    return df

# Example usage: assuming 'pdb_files' folder contains downloaded PDB files
fractal_dimension_df = process_pdb_files('enzymes_pdb')
print(fractal_dimension_df)


Estimated Fractal Dimension: 1.6760
Estimated Fractal Dimension: 2.0477
Estimated Fractal Dimension: 1.6463
Estimated Fractal Dimension: 1.4035
Estimated Fractal Dimension: 1.6503
Estimated Fractal Dimension: 1.5137
Estimated Fractal Dimension: 1.9091
Estimated Fractal Dimension: 1.7295
Estimated Fractal Dimension: 1.6001
Estimated Fractal Dimension: 1.5212
Estimated Fractal Dimension: 1.1916
Estimated Fractal Dimension: 1.5139
Estimated Fractal Dimension: 1.8274
Estimated Fractal Dimension: 1.5851
Estimated Fractal Dimension: 1.7901
Estimated Fractal Dimension: 1.6742
Estimated Fractal Dimension: 1.8057
Estimated Fractal Dimension: 1.3971
Estimated Fractal Dimension: 1.8491
Estimated Fractal Dimension: 1.7238
Estimated Fractal Dimension: 1.6990
Estimated Fractal Dimension: 1.8420
Estimated Fractal Dimension: 1.6850
Estimated Fractal Dimension: 1.7378
Estimated Fractal Dimension: 1.5702
Estimated Fractal Dimension: 1.4946
Estimated Fractal Dimension: 1.7023
Estimated Fractal Dimension:

In [87]:
new_enzymes_fractal_dimension = process_pdb_files('new_enzymes_pdb')
print(new_enzymes_fractal_dimension)


Estimated Fractal Dimension: 1.6990
Estimated Fractal Dimension: 1.5054
Estimated Fractal Dimension: 1.6668
Estimated Fractal Dimension: 1.6743
Estimated Fractal Dimension: 1.7987
Estimated Fractal Dimension: 1.4946
Estimated Fractal Dimension: 1.6163
Estimated Fractal Dimension: 1.9426
Estimated Fractal Dimension: 1.5702
Estimated Fractal Dimension: 1.3416
Estimated Fractal Dimension: 1.5331
Estimated Fractal Dimension: 1.5379
Estimated Fractal Dimension: 1.1916
Estimated Fractal Dimension: 1.8093
Estimated Fractal Dimension: 1.3971
Estimated Fractal Dimension: 1.8271
Estimated Fractal Dimension: 1.8238
Estimated Fractal Dimension: 1.7378
Estimated Fractal Dimension: 1.8565
Estimated Fractal Dimension: 1.7359
Estimated Fractal Dimension: 1.6463
Estimated Fractal Dimension: 1.8360
Estimated Fractal Dimension: 1.8057
Estimated Fractal Dimension: 1.8204
Estimated Fractal Dimension: 2.0477
Estimated Fractal Dimension: 1.9608
Estimated Fractal Dimension: 1.8420
Estimated Fractal Dimension:

In [65]:
#View Fractal Dimension values for Enzymes
fractal_dimension_df

Unnamed: 0,Enzyme Name,Fractal Dimension
0,P00760,1.675976
1,P49327,2.047670
2,P23284,1.646331
3,P00918,1.403508
4,P53597,1.650328
...,...,...
85,P62877,1.509540
86,P00352,1.677307
87,P06276,1.836034
88,P00374,1.323120


### Fractal Dimension For TF

In [62]:
# Example usage: assuming 'pdb_files' folder contains downloaded PDB files
tf_fractal_dimension_df = process_pdb_files('TFs_pdb')
print(tf_fractal_dimension_df)


Estimated Fractal Dimension: 1.4720
Estimated Fractal Dimension: 1.4975
Estimated Fractal Dimension: 1.4916
Estimated Fractal Dimension: 1.5215
Estimated Fractal Dimension: 1.6725
Estimated Fractal Dimension: 1.5519
Estimated Fractal Dimension: 1.5756
Estimated Fractal Dimension: 1.6170
Estimated Fractal Dimension: 1.6158
Estimated Fractal Dimension: 1.5367
Estimated Fractal Dimension: 1.6873
Estimated Fractal Dimension: 1.7185
Estimated Fractal Dimension: 1.6383
Estimated Fractal Dimension: 1.5055
Estimated Fractal Dimension: 1.7778
Estimated Fractal Dimension: 1.5469
Estimated Fractal Dimension: 1.5954
Estimated Fractal Dimension: 1.6929
Estimated Fractal Dimension: 1.5373
Estimated Fractal Dimension: 1.6267
Estimated Fractal Dimension: 1.6586
Estimated Fractal Dimension: 1.6488
Estimated Fractal Dimension: 1.4437
Estimated Fractal Dimension: 1.5416
Estimated Fractal Dimension: 1.4563
Estimated Fractal Dimension: 1.5562
Estimated Fractal Dimension: 1.6261
Estimated Fractal Dimension:

In [88]:
# 90 Tfs
new_tf_fractal_dimension_df = process_pdb_files('new_TFs_pdb')
print(new_tf_fractal_dimension_df)

Estimated Fractal Dimension: 1.5345
Estimated Fractal Dimension: 1.5657
Estimated Fractal Dimension: 1.5545
Estimated Fractal Dimension: 1.6267
Estimated Fractal Dimension: 1.6531
Estimated Fractal Dimension: 1.4852
Estimated Fractal Dimension: 1.6170
Estimated Fractal Dimension: 1.7185
Estimated Fractal Dimension: 1.5693
Estimated Fractal Dimension: 1.8709
Estimated Fractal Dimension: 1.5197
Estimated Fractal Dimension: 1.6551
Estimated Fractal Dimension: 1.4867
Estimated Fractal Dimension: 1.6628
Estimated Fractal Dimension: 1.8572
Estimated Fractal Dimension: 1.6929
Estimated Fractal Dimension: 1.6383
Estimated Fractal Dimension: 1.5755
Estimated Fractal Dimension: 1.4734
Estimated Fractal Dimension: 1.5535
Estimated Fractal Dimension: 1.5988
Estimated Fractal Dimension: 1.7116
Estimated Fractal Dimension: 1.6644
Estimated Fractal Dimension: 1.5659
Estimated Fractal Dimension: 1.4878
Estimated Fractal Dimension: 1.6378
Estimated Fractal Dimension: 1.7803
Estimated Fractal Dimension:

In [70]:
enzymes_90

Unnamed: 0,Enzyme_ID,Shannon Entropy,Number of Transitions
0,P00813,1.219976,62
1,P00439,1.424472,77
2,P34897,1.261891,62
3,P21912,1.477573,39
4,P47989,1.419869,206
...,...,...,...
85,P06744,1.356219,77
86,P49247,1.513133,47
87,P00734,1.446652,95
88,P00700,1.584585,24


In [91]:
# Perform a left join on the two DataFrames
updated_file_enzymes = pd.merge(enzymes_90, new_enzymes_fractal_dimension, how='left', left_on='Enzyme_ID', right_on='Enzyme Name')
updated_file_enzymes

Unnamed: 0,Enzyme_ID,Shannon Entropy,Number of Transitions,Enzyme Name,Fractal Dimension
0,P00813,1.219976,62,P00813,1.513691
1,P00439,1.424472,77,P00439,1.656789
2,P34897,1.261891,62,P34897,1.734157
3,P21912,1.477573,39,P21912,1.533105
4,P47989,1.419869,206,P47989,1.909126
...,...,...,...,...,...
85,P06744,1.356219,77,P06744,1.680604
86,P49247,1.513133,47,P49247,1.577839
87,P00734,1.446652,95,P00734,1.799924
88,P00700,1.584585,24,P00700,1.241940


In [92]:
# Dropping the 'Enzyme Name' column
updated_file_enzymes.drop(columns=['Enzyme Name'], inplace=True)

file_path = "/home/marshal/protonotebook/Management and analysis of Biological and Medical Data/updated_file_enzymes.csv"

# Save the DataFrame to a CSV file
updated_file_enzymes.to_csv(file_path, index=False)


## Transcriptor Factors

In [95]:
new_tf_fractal_dimension_df

Unnamed: 0,Enzyme Name,Fractal Dimension
0,P01106,1.534458
1,O43524,1.565665
2,P15923,1.554529
3,Q08050,1.626657
4,P08047,1.653096
...,...,...
84,Q13485,1.672467
85,P49715,1.512132
86,P37275,1.712454
87,P05412,1.497539


In [98]:
# Check for NaN values in the DataFrame
nan_count = new_tf_fractal_dimension_df.isna().sum()
nan_count

Enzyme Name          0
Fractal Dimension    0
dtype: int64

In [96]:
# Perform a left join on the two DataFrames
updated_file_tfs = pd.merge(tf_90, new_tf_fractal_dimension_df, how='left', left_on='TF_ID', right_on='Enzyme Name')
updated_file_tfs

Unnamed: 0,TF_ID,Shannon Entropy,Number of Transitions,Enzyme Name,Fractal Dimension
0,P22736,1.400063,76,P22736,1.625714
1,Q9H334,1.340514,70,Q9H334,1.658552
2,P48436,1.480286,68,P48436,1.531153
3,P28347,1.358966,64,P28347,1.640730
4,Q06330,1.494394,75,Q06330,1.713782
...,...,...,...,...,...
85,O15350,1.514592,96,O15350,1.641319
86,P16220,1.335435,42,P16220,1.496568
87,Q15672,1.222014,11,Q15672,1.359496
88,P43694,1.473598,65,P43694,1.502082


In [99]:
# Check for NaN values in the DataFrame
nan_count_2 = updated_file_tfs.isna().sum()
nan_count_2

TF_ID                    0
Shannon Entropy          0
Number of Transitions    0
Fractal Dimension        1
dtype: int64

In [97]:
# Dropping the 'Enzyme Name' column
updated_file_tfs.drop(columns=['Enzyme Name'], inplace=True)

file_path = "/home/marshal/protonotebook/Management and analysis of Biological and Medical Data/updated_file_tfs.csv"

# Save the DataFrame to a CSV file
updated_file_tfs.to_csv(file_path, index=False)