In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Read the raw table from disk.
df = pd.read_csv("raw.csv")

# Columns that are one-hot encoded further down.
sequence_col = "30mer"           # nucleotide sequence column
categorical_col = "Target gene"  # gene label column

# The outermost columns are passed through to the output unchanged
# (the last one is treated as the target/output column).
first_col, last_col = df.columns[0], df.columns[-1]

# Single-column frames holding the two pass-through columns.
df_first = df.loc[:, [first_col]]
df_last = df.loc[:, [last_col]]

# 🔹 One-hot encode the 30mer sequences
def one_hot_encode_sequence(seq):
    """One-hot encode a nucleotide sequence into a flat integer vector.

    Each nucleotide becomes four consecutive entries in A, T, G, C order, so
    a sequence of length n yields a vector of length 4 * n (120 for a 30-mer).

    Args:
        seq: Iterable of single-character nucleotide strings (normally a
            ``str``). Upper- and lower-case letters are treated the same.

    Returns:
        1-D NumPy integer array of length ``4 * len(seq)``; an empty array
        for an empty sequence.

    Raises:
        KeyError: If ``seq`` contains a character other than A, T, G or C.
    """
    mapping = {
        'A': (1, 0, 0, 0),
        'T': (0, 1, 0, 0),
        'G': (0, 0, 1, 0),
        'C': (0, 0, 0, 1),
    }
    try:
        # Upper-case each symbol so soft-masked (lower-case) input encodes too.
        rows = [mapping[nt.upper()] for nt in seq]
    except KeyError as err:
        # Same exception type as a plain dict miss, but say which symbol and
        # which sequence caused it.
        raise KeyError(
            f"unsupported nucleotide {err.args[0]!r} in sequence {seq!r}"
        ) from None
    # reshape(-1) flattens to 1-D and also copes with an empty list of rows,
    # where np.concatenate would raise.
    return np.array(rows, dtype=int).reshape(-1)

# Encode every sequence and stack the per-row vectors into one 2-D array
# (one row per sample; 120 columns when every sequence is a 30-mer).
sequence_ohe = np.array(df[sequence_col].apply(one_hot_encode_sequence).tolist())

# Generate column names for 30mer encoding
sequence_column_names = [f"30mer_{i}" for i in range(sequence_ohe.shape[1])]

# One-hot encode the "Target gene" column
gene_encoder = OneHotEncoder(sparse_output=False)
gene_ohe = gene_encoder.fit_transform(df[[categorical_col]])

# Generate column names for one-hot encoded genes
gene_column_names = gene_encoder.get_feature_names_out([categorical_col]).tolist()

# Min-max scale every remaining column (everything except the first/last
# columns and the two one-hot encoded columns).
# sort=False keeps the columns in file order; the default sorts them
# alphabetically, which made positionally assigned names land on the wrong
# columns.
num_cols = df.columns.difference(
    [first_col, last_col, sequence_col, categorical_col], sort=False
)
scaler = MinMaxScaler()
num_normalized = scaler.fit_transform(df[num_cols])

# Name each scaled column after its source column (spaces -> underscores) so
# a label can never be attached to the wrong data and the number of names
# always equals the number of scaled columns.
num_column_names = [str(col).strip().replace(" ", "_") for col in num_cols]

# Report the raw (unscaled) range of the cut-position column.
print(df["Amino Acid Cut position"].min())
print(df["Amino Acid Cut position"].max())

# Concatenate all processed features side by side: first column, sequence
# encoding, gene encoding, scaled numeric columns, last column.
# NOTE(review): if the first or last column is non-numeric, np.hstack will
# presumably yield an object-dtype array; dtypes are re-inferred when the
# CSV is read back.
X_final = np.hstack((df_first.values, sequence_ohe, gene_ohe, num_normalized, df_last.values))

# Create final DataFrame with column names in the same order as the hstack above
final_column_names = [first_col] + sequence_column_names + gene_column_names + num_column_names + [last_col]
processed_df = pd.DataFrame(X_final, columns=final_column_names)

# Save processed dataset
processed_df.to_csv("processed_data.csv", index=False)

print("Data processing complete! Saved as processed_data.csv")
print("Final shape:", processed_df.shape)


1.0
2826.0
Data processing complete! Saved as processed_data.csv
Final shape: (5310, 141)
