In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas display configuration for this notebook:
#   max_columns = None -> no cap on the number of columns rendered
#   max_rows    = 100  -> cap on the number of rows rendered
pd.options.display.max_columns = None
pd.options.display.max_rows = 100


In [2]:
# Read the three input CSVs in a fixed order: B-cell, SARS, COVID.
bcell_df, sars_df, covid_df = map(
    pd.read_csv,
    (
        "../data/input_bcell.csv",
        "../data/input_sars.csv",
        "../data/input_covid.csv",  # No labels
    ),
)

# Report the (rows, columns) shape of each frame, one line per dataset
for caption, frame in (
    ("B-cell Data Shape:", bcell_df),
    ("SARS Data Shape:", sars_df),
    ("COVID Test Data Shape:", covid_df),
):
    print(caption, frame.shape)

# Preview the B-cell frame; as the last expression it becomes the cell output
bcell_df.head()


B-cell Data Shape: (14387, 14)
SARS Data Shape: (520, 14)
COVID Test Data Shape: (20312, 13)


Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target
0,A2T3T0,MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEF...,161,165,SASFT,1.016,0.703,1.018,2.22,5.810364,0.103275,-0.143829,40.2733,1
1,F0V2I4,MTIHKVAINGFGRIGRLLFRNLLSSQGVQVVAVNDVVDIKVLTHLL...,251,255,LCLKI,0.77,0.179,1.199,-3.86,6.210876,0.065476,-0.036905,24.998512,1
2,O75508,MVATCLQVVGFVTSFVGWIGVIVTTSTNDWVVTCGYTIPTCRKLDE...,145,149,AHRET,0.852,3.427,0.96,4.28,8.223938,0.091787,0.879227,27.863333,1
3,O84462,MTNSISGYQPTVTTSTSSTTSASGASGSLGASSVSTTANATVTQTA...,152,156,SNYDD,1.41,2.548,0.936,6.32,4.237976,0.044776,-0.521393,30.765373,1
4,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,85,89,DGTYR,1.214,1.908,0.937,4.64,6.867493,0.103846,-0.578846,21.684615,1


In [3]:
# Per-column count of missing entries for the two labelled datasets.
# Each print emits the heading and the counts on separate lines (sep="\n").
print("Missing values in B-cell dataset:", bcell_df.isna().sum(), sep="\n")
print("\nMissing values in SARS dataset:", sars_df.isna().sum(), sep="\n")


Missing values in B-cell dataset:
parent_protein_id      0
protein_seq            0
start_position         0
end_position           0
peptide_seq            0
chou_fasman            0
emini                  0
kolaskar_tongaonkar    0
parker                 0
isoelectric_point      0
aromaticity            0
hydrophobicity         0
stability              0
target                 0
dtype: int64

Missing values in SARS dataset:
parent_protein_id      0
protein_seq            0
start_position         0
end_position           0
peptide_seq            0
chou_fasman            0
emini                  0
kolaskar_tongaonkar    0
parker                 0
isoelectric_point      0
aromaticity            0
hydrophobicity         0
stability              0
target                 0
dtype: int64


In [4]:
# Build the training frame in one chain:
#   1. stack the B-cell and SARS rows under a fresh 0..n-1 index
#   2. shuffle every row (frac=1) with a fixed seed so the order is repeatable
#   3. discard the shuffled index in favour of a clean one
train_df = (
    pd.concat((bcell_df, sars_df), ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Final Training Dataset Shape:", train_df.shape)


Final Training Dataset Shape: (14907, 14)


In [6]:
from sklearn.preprocessing import MinMaxScaler

# Numeric feature columns that get rescaled
num_cols = [
    "isoelectric_point",
    "aromaticity",
    "hydrophobicity",
    "stability",
    "chou_fasman",
    "emini",
    "kolaskar_tongaonkar",
    "parker",
]

# Fit the min-max scaler on the training features, then write the
# transformed values back into the same columns of train_df.
scaler = MinMaxScaler().fit(train_df[num_cols])
train_df[num_cols] = scaler.transform(train_df[num_cols])

# Write the scaled training frame to disk without the index column
train_df.to_csv("../data/processed_train.csv", index=False)


In [7]:
# Persist train_df with IPython's %store magic so that another notebook/kernel
# can restore it via `%store -r train_df`.
%store train_df


Stored 'train_df' (DataFrame)
