In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV file
df = pd.read_csv('Triple_Overlap_DNA_RNA_Project/26_4_summary_statistics.csv')

# Display basic information about the dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

Dataset shape: (520, 21)

First few rows:
  data_type protein_id  model_number  iptm   ptm  ranking_score  \
0     RLoop     o00567             0  0.13  0.52           0.34   
1     RLoop     o00567             1  0.12  0.51           0.33   
2     RLoop     o00567             2  0.12  0.51           0.33   
3     RLoop     o00567             3  0.12  0.51           0.33   
4     RLoop     o00567             4  0.12  0.51           0.33   

   fraction_disordered  has_clash  num_recycles  chain_iptm_0  ...  \
0                 0.26          0            10          0.12  ...   
1                 0.26          0            10          0.11  ...   
2                 0.26          0            10          0.10  ...   
3                 0.26          0            10          0.11  ...   
4                 0.26          0            10          0.09  ...   

   chain_ptm_0  chain_ptm_1  chain_pair_iptm_00  chain_pair_iptm_01  \
0         0.59         0.17                0.59                

In [2]:
# Check column names and data types
print("Column names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)
print("\nBasic info:")
print(df.info())

Column names:
['data_type', 'protein_id', 'model_number', 'iptm', 'ptm', 'ranking_score', 'fraction_disordered', 'has_clash', 'num_recycles', 'chain_iptm_0', 'chain_iptm_1', 'chain_ptm_0', 'chain_ptm_1', 'chain_pair_iptm_00', 'chain_pair_iptm_01', 'chain_pair_iptm_10', 'chain_pair_iptm_11', 'chain_pair_pae_min_00', 'chain_pair_pae_min_01', 'chain_pair_pae_min_10', 'chain_pair_pae_min_11']

Data types:
data_type                 object
protein_id                object
model_number               int64
iptm                     float64
ptm                      float64
ranking_score            float64
fraction_disordered      float64
has_clash                  int64
num_recycles               int64
chain_iptm_0             float64
chain_iptm_1             float64
chain_ptm_0              float64
chain_ptm_1              float64
chain_pair_iptm_00       float64
chain_pair_iptm_01       float64
chain_pair_iptm_10       float64
chain_pair_iptm_11       float64
chain_pair_pae_min_00    float64
c

In [3]:
# Check unique values in key columns
print("Unique data types:", df['data_type'].unique())
print("Number of unique proteins:", df['protein_id'].nunique())
print("Unique proteins:", df['protein_id'].unique())
print("Model numbers:", sorted(df['model_number'].unique()))

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

Unique data types: ['RLoop' 'dsDNA' 'dsRNA' 'ssDNA']
Number of unique proteins: 26
Unique proteins: ['o00567' 'o15042' 'p06748' 'p09874' 'p11387' 'p17844' 'p46013' 'p46087'
 'q01081' 'q14498' 'q14690' 'q86v81' 'q8nfw8' 'q96pk6' 'q96sb4' 'q96t88'
 'q99575' 'q9bqg0' 'q9bze4' 'q9h0a0' 'q9h2u1' 'q9nr30' 'q9nvp1' 'q9ujv9'
 'q9y2x3' 'q9y5b9']
Model numbers: [0, 1, 2, 3, 4]

Missing values:
data_type                0
protein_id               0
model_number             0
iptm                     0
ptm                      0
ranking_score            0
fraction_disordered      0
has_clash                0
num_recycles             0
chain_iptm_0             0
chain_iptm_1             0
chain_ptm_0              0
chain_ptm_1              0
chain_pair_iptm_00       0
chain_pair_iptm_01       0
chain_pair_iptm_10       0
chain_pair_iptm_11       0
chain_pair_pae_min_00    0
chain_pair_pae_min_01    0
chain_pair_pae_min_10    0
chain_pair_pae_min_11    0
dtype: int64


In [4]:
# Filter to only keep model_number 0
df_model0 = df[df['model_number'] == 0].copy()

# Display the results
print("Original dataset shape:", df.shape)
print("Filtered dataset shape (model 0 only):", df_model0.shape)
print("\nFirst few rows of filtered dataset:")
print(df_model0.head())

# Verify we have the expected number of rows (26 proteins × 4 data types = 104 rows)
print(f"\nExpected rows: {df['protein_id'].nunique()} proteins × {df['data_type'].nunique()} data types = {df['protein_id'].nunique() * df['data_type'].nunique()}")
print("Actual rows:", len(df_model0))

# Check the distribution of data types in the filtered dataset
print("\nData type distribution in filtered dataset:")
print(df_model0['data_type'].value_counts())

Original dataset shape: (520, 21)
Filtered dataset shape (model 0 only): (104, 21)

First few rows of filtered dataset:
   data_type protein_id  model_number  iptm   ptm  ranking_score  \
0      RLoop     o00567             0  0.13  0.52           0.34   
5      RLoop     o15042             0  0.21  0.32           0.50   
10     RLoop     p06748             0  0.19  0.31           0.44   
15     RLoop     p09874             0  0.48  0.67           0.57   
20     RLoop     p11387             0  0.20  0.60           0.41   

    fraction_disordered  has_clash  num_recycles  chain_iptm_0  ...  \
0                  0.26          0            10          0.12  ...   
5                  0.53          0            10          0.22  ...   
10                 0.46          0            10          0.20  ...   
15                 0.11          0            10          0.38  ...   
20                 0.27          0            10          0.19  ...   

    chain_ptm_0  chain_ptm_1  chain_pair_ipt

In [5]:
# Separate the filtered dataset by data_type
df_rloop = df_model0[df_model0['data_type'] == 'RLoop'].copy()
df_dsdna = df_model0[df_model0['data_type'] == 'dsDNA'].copy()
df_dsrna = df_model0[df_model0['data_type'] == 'dsRNA'].copy()
df_ssdna = df_model0[df_model0['data_type'] == 'ssDNA'].copy()

# Display information about each separated dataset
print("RLoop dataset:")
print(f"Shape: {df_rloop.shape}")
print(f"Number of proteins: {df_rloop['protein_id'].nunique()}")
print("First few rows:")
print(df_rloop.head())
print("\n" + "="*50 + "\n")

print("dsDNA dataset:")
print(f"Shape: {df_dsdna.shape}")
print(f"Number of proteins: {df_dsdna['protein_id'].nunique()}")
print("First few rows:")
print(df_dsdna.head())
print("\n" + "="*50 + "\n")

print("dsRNA dataset:")
print(f"Shape: {df_dsrna.shape}")
print(f"Number of proteins: {df_dsrna['protein_id'].nunique()}")
print("First few rows:")
print(df_dsrna.head())
print("\n" + "="*50 + "\n")

print("ssDNA dataset:")
print(f"Shape: {df_ssdna.shape}")
print(f"Number of proteins: {df_ssdna['protein_id'].nunique()}")
print("First few rows:")
print(df_ssdna.head())

RLoop dataset:
Shape: (26, 21)
Number of proteins: 26
First few rows:
   data_type protein_id  model_number  iptm   ptm  ranking_score  \
0      RLoop     o00567             0  0.13  0.52           0.34   
5      RLoop     o15042             0  0.21  0.32           0.50   
10     RLoop     p06748             0  0.19  0.31           0.44   
15     RLoop     p09874             0  0.48  0.67           0.57   
20     RLoop     p11387             0  0.20  0.60           0.41   

    fraction_disordered  has_clash  num_recycles  chain_iptm_0  ...  \
0                  0.26          0            10          0.12  ...   
5                  0.53          0            10          0.22  ...   
10                 0.46          0            10          0.20  ...   
15                 0.11          0            10          0.38  ...   
20                 0.27          0            10          0.19  ...   

    chain_ptm_0  chain_ptm_1  chain_pair_iptm_00  chain_pair_iptm_01  \
0          0.59       

In [6]:
# Save all datasets to a single Excel file with multiple sheets
with pd.ExcelWriter('model0_by_data_type.xlsx', engine='openpyxl') as writer:
    df_rloop.to_excel(writer, sheet_name='RLoop', index=False)
    df_dsdna.to_excel(writer, sheet_name='dsDNA', index=False)
    df_dsrna.to_excel(writer, sheet_name='dsRNA', index=False)
    df_ssdna.to_excel(writer, sheet_name='ssDNA', index=False)

print("Excel file 'model0_by_data_type.xlsx' created with 4 sheets!")

Excel file 'model0_by_data_type.xlsx' created with 4 sheets!


In [7]:
# Create a summary sheet with all data types side by side
# First, let's create the individual dataframes with only the columns we want
rloop_summary = df_rloop[['protein_id', 'iptm', 'ptm', 'ranking_score']].copy()
dsdna_summary = df_dsdna[['protein_id', 'iptm', 'ptm', 'ranking_score']].copy()
dsrna_summary = df_dsrna[['protein_id', 'iptm', 'ptm', 'ranking_score']].copy()
ssdna_summary = df_ssdna[['protein_id', 'iptm', 'ptm', 'ranking_score']].copy()

# Rename columns to avoid conflicts
rloop_summary.columns = ['RLoop_protein_id', 'RLoop_iptm', 'RLoop_ptm', 'RLoop_ranking_score']
dsdna_summary.columns = ['dsDNA_protein_id', 'dsDNA_iptm', 'dsDNA_ptm', 'dsDNA_ranking_score']
dsrna_summary.columns = ['dsRNA_protein_id', 'dsRNA_iptm', 'dsRNA_ptm', 'dsRNA_ranking_score']
ssdna_summary.columns = ['ssDNA_protein_id', 'ssDNA_iptm', 'ssDNA_ptm', 'ssDNA_ranking_score']

# Combine all dataframes side by side
summary_df = pd.concat([rloop_summary, dsdna_summary, dsrna_summary, ssdna_summary], axis=1)

# Display the summary dataframe
print("Summary sheet preview:")
print(summary_df.head())
print(f"\nShape: {summary_df.shape}")

# Save to the same Excel file with a new sheet
with pd.ExcelWriter('model0_by_data_type.xlsx', engine='openpyxl', mode='a') as writer:
    summary_df.to_excel(writer, sheet_name='Summary', index=False)

print("\nSummary sheet added to 'model0_by_data_type.xlsx'!")

Summary sheet preview:
   RLoop_protein_id  RLoop_iptm  RLoop_ptm  RLoop_ranking_score  \
0            o00567        0.13       0.52                 0.34   
5            o15042        0.21       0.32                 0.50   
10           p06748        0.19       0.31                 0.44   
15           p09874        0.48       0.67                 0.57   
20           p11387        0.20       0.60                 0.41   

   dsDNA_protein_id  dsDNA_iptm  dsDNA_ptm  dsDNA_ranking_score  \
0               NaN         NaN        NaN                  NaN   
5               NaN         NaN        NaN                  NaN   
10              NaN         NaN        NaN                  NaN   
15              NaN         NaN        NaN                  NaN   
20              NaN         NaN        NaN                  NaN   

   dsRNA_protein_id  dsRNA_iptm  dsRNA_ptm  dsRNA_ranking_score  \
0               NaN         NaN        NaN                  NaN   
5               NaN         NaN      