## Protein Processing Filteration

In [175]:
import pandas as pd
import numpy as np 
import os


### Files to read|

In [176]:
filesList = ['IPIgGMS2','IPMWMS2']

# Folder Creation 
rootFolder = 'Dataset2 ( Stomach )'
os.makedirs(rootFolder)
# Sub Folder
part1Folder = 'Part 1'
part2Folder = 'Part 2'
finalScreenData = 'Final Screen Data'
os.makedirs(os.path.join(rootFolder, part1Folder))
os.makedirs(os.path.join(rootFolder, part2Folder))
os.makedirs(os.path.join(rootFolder, finalScreenData))



## Common:

In [177]:
# Define the path to the folder
folder_path = 'DATASETS/DATASET2_STOMACH'

# List all files in the folder
files = os.listdir(folder_path)

# Assuming you want to read the first file in the folder
if files:
#     file_to_read = files[0]
    print('reading female specimen')
    CONTROL_FILE= os.path.join(folder_path, 'IPIgGFS2.xls')
    EXPERIMENT_FILE = os.path.join(folder_path, 'IPMWFS2.xls')
    CONTROL_DATA = pd.read_excel(CONTROL_FILE)
    EXPERIMENT_DATA = pd.read_excel(EXPERIMENT_FILE)
    print(CONTROL_DATA.head())  # Displaying the first few rows of the DataFrame
    print(EXPERIMENT_DATA.head())

reading female specimen
  accession       entry  coverage  proteinProbability  totalPeptides  \
0    P01837  IGKC_MOUSE     36.45              1.0000              3   
1    P60710  ACTB_MOUSE     63.47              1.0000             21   
2    Q9CQP0  RM33_MOUSE     13.85              0.9987              1   
3    P01942   HBA_MOUSE     42.96              1.0000              6   
4    Q9CR36  GKN1_MOUSE     29.35              1.0000              4   

   uniquePeptides  razorPeptides  totalSpectralCount  uniqueSpectralCount  \
0               3              3                  20                   20   
1               1             21                  93                    2   
2               1              1                   1                    1   
3               5              6                  25                   24   
4               4              4                  31                   31   

   razorSpectralCount  ...  uniqueIntensity  razorIntensity  \
0                

## Primary Screening For exclusive proteins

In [178]:
# Extract the 'entry' column from each DataFrame
control_subset = CONTROL_DATA[['entry', 'coverage', 'totalPeptides']]
experiment_subset = EXPERIMENT_DATA[['entry', 'coverage', 'totalPeptides']]

#Difference
# Find proteins present in the experiment dataset but not in the control dataset
missing_proteins = experiment_subset[~experiment_subset['entry'].isin(control_subset['entry'])]

# Total Peptides >= 10 in missing proteins 
tPCount = missing_proteins[missing_proteins['totalPeptides'] >=10]
missing_proteins = tPCount.sort_values(by='coverage',ascending=False)
print(missing_proteins)



            entry  coverage  totalPeptides
188   TBB2A_MOUSE     39.78             16
189   TBB2B_MOUSE     39.78             16
227   GRM2B_MOUSE     31.24             10
202    TBB6_MOUSE     27.29             12
482   NCOA5_MOUSE     25.04             13
620   ASTRB_MOUSE     19.65             12
187   K2C6A_MOUSE     16.46             10
1319  MYO1A_MOUSE     14.96             17
1378   ERN2_MOUSE     13.39             10
591   MYH10_MOUSE      8.45             16
2129  AKAP9_MOUSE      8.40             29
1921   PYR1_MOUSE      8.36             17
2205  ITPR3_MOUSE      4.79             13


In [198]:
# Assuming 'missing_proteins' is your DataFrame
def saveInFolder(dataFrame,fileName,partName):
# Define the folder path relative to the root directory
    rootF = os.getcwd() # Replace '/path/to/root/folder' with the actual root folder path
    folder_path = os.path.join(rootF, rootFolder , partName)

    # Define the full file path
    file_path = os.path.join(folder_path, file_name)

    # Ensure that the folder exists, if not create it
    os.makedirs(folder_path, exist_ok=True)

    # Save the DataFrame to an Excel file in the specified folder
    dataFrame.to_excel(file_path, index=False)

    print(f"{fileName} is saved to:", file_path)


In [None]:
saveInFolder(missing_proteins,'ExclusiveFS2',rootFolder,part1Folder)

## Primary Screening For Similarity 

In [200]:

# Merge datasets based on the 'entry' column to get common entries
common_entries = pd.merge(CONTROL_DATA, EXPERIMENT_DATA, on='entry', suffixes=('_control', '_experiment'))

# Extract the 'totalPeptide' columns for common entries
common_entries_total_peptide = common_entries[['entry', 'totalPeptides_control', 'totalPeptides_experiment']]
print(common_entries_total_peptide)
saveInFolder(common_entries_total_peptide,'test21',part1Folder)
# #Similarity 
# # Finding Proteins present in Both Data Set.
# similar_proteins = experiment_subset[experiment_subset['entry'].isin(control_subset['entry'])]

# print(similar_proteins)
# similar_proteins2 = control_subset[control_subset['entry'].isin(experiment_subset['entry'])]
# print(similar_proteins2)

# # newFrame = pd.DataFrame(columns=['Control_Entry','Coverage','totalPeptides', 'Experiment_Entry','Coverage','totalPeptides'])
# # newFrame

            entry  totalPeptides_control  totalPeptides_experiment
0      IGKC_MOUSE                      3                         7
1      ACTB_MOUSE                     21                        22
2      RM33_MOUSE                      1                         1
3       HBA_MOUSE                      6                         6
4      GKN1_MOUSE                      4                         4
...           ...                    ...                       ...
2031   RET7_MOUSE                      1                         1
2032   NU1M_MOUSE                      1                         1
2033   PTTG_MOUSE                      1                         1
2034   IF3M_MOUSE                      1                         1
2035  ASGL1_MOUSE                      1                         1

[2036 rows x 3 columns]
test21 is saved to: C:\Users\Jawwad\Desktop\Zaid Bhai Research\Dataset2 ( Stomach )\Part 1\ExclusiveFS2.xlsx


In [181]:
# Assuming similar_proteins2 contains control data and similar_proteins contains experiment data

# Rename columns of similar_proteins2 (control data)

similar_proteins2_renamed = similar_proteins2.rename(columns={
    'entry': 'Control_Entry',
    'coverage': 'C-coverage',
    'totalPeptides': 'C-totalPeptides'
})

# Rename columns of similar_proteins (experiment data)
similar_proteins_renamed = similar_proteins.rename(columns={
    'entry': 'Experiment_Entry',
    'coverage': 'E-coverage',
    'totalPeptides': 'E-totalPeptides'
})
similar_proteins = similar_proteins_renamed.sort_values(by='Experiment_Entry',ascending=True)

similar_proteins2 = similar_proteins2_renamed.sort_values(by='Control_Entry',ascending=True)

tPCount = similar_proteins2_renamed[similar_proteins2_renamed['C-totalPeptides'] >=10]
similar_proteins2 = tPCount
print(similar_proteins2)

tPCount = similar_proteins_renamed[similar_proteins_renamed['E-totalPeptides'] >=10]
similar_proteins = tPCount.sort_values(by='E-coverage',ascending=False)
print(similar_proteins)



     Control_Entry  C-coverage  C-totalPeptides
1       ACTB_MOUSE       63.47               21
10      TAGL_MOUSE       73.63               21
16     NUCB1_MOUSE       24.18               10
26      ALBU_MOUSE       32.24               20
45     GSTO1_MOUSE       42.92               13
...            ...         ...              ...
1666   MYO1D_MOUSE       11.33               10
1751    PLEC_MOUSE       14.30               64
1848    DESP_MOUSE       10.65               31
1898   MY18A_MOUSE        6.63               12
2048   DYHC1_MOUSE        3.21               13

[250 rows x 3 columns]
     Experiment_Entry  E-coverage  E-totalPeptides
336        GPX1_MOUSE       70.15               12
54        K1C19_MOUSE       66.75               29
29         TAGL_MOUSE       66.67               20
4          ACTB_MOUSE       63.20               22
240        CBR1_MOUSE       61.37               14
...               ...         ...              ...
1874       ITB4_MOUSE        8.09          

In [182]:
# to rewrite:

# Concatenate control and experiment data side by side
combined_data = pd.concat([similar_proteins2, similar_proteins], axis=1)
print(combined_data)
# Create a Pandas Excel writer object



     Control_Entry  C-coverage  C-totalPeptides Experiment_Entry  E-coverage  \
1       ACTB_MOUSE       63.47             21.0              NaN         NaN   
4              NaN         NaN              NaN       ACTB_MOUSE       63.20   
10      TAGL_MOUSE       73.63             21.0              NaN         NaN   
11             NaN         NaN              NaN      ACTBL_MOUSE       28.99   
16     NUCB1_MOUSE       24.18             10.0              NaN         NaN   
...            ...         ...              ...              ...         ...   
2046           NaN         NaN              NaN      UGGG1_MOUSE        6.96   
2048   DYHC1_MOUSE        3.21             13.0              NaN         NaN   
2060           NaN         NaN              NaN      MYO5B_MOUSE        7.21   
2062           NaN         NaN              NaN      EPIPL_MOUSE       16.13   
2198           NaN         NaN              NaN       GCN1_MOUSE        7.30   

      E-totalPeptides  
1              

In [183]:
def saveInSheet(dataFrame,file_name,partName,sheetName):
    root_folder = os.getcwd() # Replace '/path/to/root/folder' with the actual root folder path
    folder_path = os.path.join(root_folder, 'Dataset2 ( Stomach )', partName)

    # Define the full file path
    file_path = os.path.join(folder_path, file_name)

    # Ensure that the folder exists, if not create it
    os.makedirs(folder_path, exist_ok=True)

    # Save the DataFrame to an Excel file in the specified folder
    
    with pd.ExcelWriter(file_name) as writer:
        # Save the missing proteins data to the first sheet
        dataFrame.to_excel(file_path, sheet_name=sheetName, index=False)

    print(f"{file_name} is saved.")
saveInSheet(combined_data,'SimilarFS2.xlsx',part1Folder,'Primary Screening')

SimilarFS2.xlsx is saved.


## Saving into two new Files on sheet 1

In [185]:
similar_proteins = experiment_subset[experiment_subset['entry'].isin(control_subset['entry'])]
tPCount = similar_proteins[similar_proteins['totalPeptides'] >=10]
simi = tPCount.sort_values(by='coverage',ascending=False)
similar_proteins_copy = simi.copy()


# # Modify the copy
similar_proteins_copy['Ratio Mw/IpG'] = experiment_subset['totalPeptides'] / control_subset['totalPeptides']
s = similar_proteins_copy[similar_proteins_copy['Ratio Mw/IpG'] >=0.5]

# Create a Pandas Excel writer object
saveInSheet(s,'SimilarFS2.xlsx',part2Folder,'Secondary Screening')
    


SimilarFS2.xlsx is saved.


            entry  coverage  totalPeptides
188   TBB2A_MOUSE     39.78             16
189   TBB2B_MOUSE     39.78             16
227   GRM2B_MOUSE     31.24             10
202    TBB6_MOUSE     27.29             12
482   NCOA5_MOUSE     25.04             13
620   ASTRB_MOUSE     19.65             12
187   K2C6A_MOUSE     16.46             10
1319  MYO1A_MOUSE     14.96             17
1378   ERN2_MOUSE     13.39             10
591   MYH10_MOUSE      8.45             16
2129  AKAP9_MOUSE      8.40             29
1921   PYR1_MOUSE      8.36             17
2205  ITPR3_MOUSE      4.79             13


FileNotFoundError: [Errno 2] No such file or directory: 'ScreenedDataDifferenceMS2.xlsx'

Unnamed: 0,entry,coverage,totalPeptides,Ratio Mw/Ipg
1,ACTB_MOUSE,70.40,25,0.961538
2,ACTS_MOUSE,66.05,21,0.840000
4,ACTH_MOUSE,66.22,21,1.615385
5,ACTBL_MOUSE,32.18,11,1.222222
6,IGG2B_MOUSE,17.08,7,2.333333
...,...,...,...,...
1389,COL_MOUSE,7.08,1,1.000000
1390,BST2_MOUSE,5.23,1,1.000000
1395,S10AB_MOUSE,7.14,1,1.000000
1400,UBP19_MOUSE,0.88,1,0.500000


In [66]:
#to save into new sheet of same excel file
with pd.ExcelWriter('ScreenedDataSimilarMS2.xlsx',engine='openpyxl', mode='a') as writer:
    # Save the missing proteins data to the first sheet
    similar_proteins.to_excel(writer, sheet_name='Secondary Screening', index=False)
    
print("ScreenedDataSimilarMS2.xlsx is Saved")

ScreenedDataSimilarMS2.xlsx is Saved


Notes:
1)Sheet 1 : Primary Screening : -
- Exclusive for difference Proteins Present in MWML2 | Total Peptide  :
- 
- Similar Experiment group TotalPeptide count >= 10: refine
- One file for  Present in both IgG/Mw | Ratio  
- file name  e.g : will be Similar ML2 and Exclusive ML2
- Coverage column needs to be there

2) Sheet 2 : Secondary Screening :
- male and female data comparasion . similar with similar and exlusive with exclusive .
- Protein Present Male  [exlusive] | Common | Protein Present Present in Female only 
- No totalPeptide
- Folder Name : Dataset 2 (lung) 
    - Part 1 
    - Part 2 
    - Final Screen Data 
- Dataset (Stomach)
- Dataset 2 (Stomach) 



## Secondary Screening 