In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the datasets
train_path = "data4-Bio/TrainSet.csv"
test_path = "data4-Bio/TestSet.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [8]:
# Convert SCm and SCd to numeric, coercing errors to NaN to detect null values properly
train_df["SCm"] = pd.to_numeric(train_df["SCm"], errors="coerce")
train_df["SCd"] = pd.to_numeric(train_df["SCd"], errors="coerce")

test_df["SCm"] = pd.to_numeric(test_df["SCm"], errors="coerce")
test_df["SCd"] = pd.to_numeric(test_df["SCd"], errors="coerce")

# Recalculate null values after conversion
train_nulls = train_df.isnull().sum()
test_nulls = test_df.isnull().sum()

# Filter to only include columns with null values
train_nulls = train_nulls[train_nulls > 0]
test_nulls = test_nulls[test_nulls > 0]

# Calculate the proportion of missing values
train_nulls_percentage = (train_nulls / len(train_df)) * 100
test_nulls_percentage = (test_nulls / len(test_df)) * 100

train_nulls, train_nulls_percentage, test_nulls, test_nulls_percentage

(Tm      2
 Td      2
 Im     55
 Id     55
 Bm      4
 Bd      4
 SCm     4
 SCd     4
 dtype: int64,
 Tm      3.333333
 Td      3.333333
 Im     91.666667
 Id     91.666667
 Bm      6.666667
 Bd      6.666667
 SCm     6.666667
 SCd     6.666667
 dtype: float64,
 Tm      1
 Td      1
 Hm      1
 Im     20
 Id     20
 Bm      2
 Bd      2
 SCm     2
 SCd     2
 dtype: int64,
 Tm       5.0
 Td       5.0
 Hm       5.0
 Im     100.0
 Id     100.0
 Bm      10.0
 Bd      10.0
 SCm     10.0
 SCd     10.0
 dtype: float64)

***OBSERVATIONS***
- We observe that columns Tm, Td, Bm, Bd, SCm, SCd have just a few null values, so we will impute them using the median.
- Columns Im and Id have a lot of null values (91.7%) so it makes no sense to keept this columns to train our models. Hence, we will remove them.
- Also note that column Cm has all the rows with the same value which is 0, both in the train and test dataset. Therefore, it makes no sense to train our models using this feature, so we will remove it.
- Also, the first column represents the name which is not informative on the training of the model.
- Then we will encode the class column in order to have it as numerical data
- Finally we will normalize all the columns since we found that some values are really small and some are really big.

In [16]:
from sklearn.preprocessing import LabelEncoder

# Impute missing values in selected columns with the median
columns_to_impute = ["Tm", "Td", "Bm", "Bd", "SCm", "SCd"]

for col in columns_to_impute:
    median_value = train_df[col].median()  # Compute median from train dataset
    train_df[col].fillna(median_value, inplace=True)
    test_df[col].fillna(median_value, inplace=True)  # Use same median for test set
test_df["Hm"].fillna(train_df["Hm"].median(), inplace=True)  # Impute with mode
# Remove columns with too many missing values
columns_to_remove = ["Im", "Id", "Cm", "File_Name"]

# Remove from both train and test datasets
train_df.drop(columns=columns_to_remove, inplace=True, errors="ignore")
test_df.drop(columns=columns_to_remove, inplace=True, errors="ignore")


# Encode the target column 'class' in the train dataset
label_encoder = LabelEncoder()
train_df["class"] = label_encoder.fit_transform(train_df["class"])

display(train_df.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(median_value, inplace=True)  # Use same median for test set
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

Unnamed: 0,Em,Ed,El,LEm,ATm,Asm,ASd,EDm,FPm,FMm,...,D7m,RSm,RSd,RRm,RRd,RTm,RTd,RGm,RGd,class
0,0.228523,0.005756,0.003345,0.563593,0.057984,3457377.0,2521368.0,5.243943,866.0,2565.374571,...,-0.000118,43.800431,21.912391,55.806736,57.279056,41.248632,21.913723,43.028042,22.84761,0
1,0.231866,0.007204,0.005113,0.529508,0.049326,6534186.0,3843431.0,7.434451,1345.0,3367.889134,...,0.000176,28.242499,12.775416,15.886451,13.327719,36.544511,13.637614,36.336147,13.062584,0
2,0.332185,0.006607,0.00447,0.493776,0.048627,7084042.0,3847330.0,7.334882,1403.0,3417.22396,...,-0.00041,48.627341,27.925648,55.329468,45.52286,41.719539,19.529773,43.087762,20.105442,0
3,0.34716,0.008825,0.004168,0.528738,0.058384,6538634.0,4441305.0,4.825737,627.0,2225.553866,...,0.000496,39.098973,16.653536,22.60641,13.184925,43.17858,13.889276,52.134777,17.766253,0
4,0.307836,0.009545,0.006795,0.624839,0.0676,10569260.0,10348900.0,3.910991,1214.0,4333.999027,...,-0.000238,70.602915,55.540702,84.173088,74.933277,89.99585,69.841522,91.929329,66.362908,0


In [17]:
# Save processed datasets
train_df.to_csv("Processed_TrainSet.csv", index=False)
test_df.to_csv("Processed_TestSet.csv", index=False)

print("Preprocessing completed. Data is cleaned and ready for feature selection.")

Preprocessing completed. Data is cleaned and ready for feature selection.
