## 0 - Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

## 1 - Data Preparation

In [None]:
# Load the training/validation table, the sorting-signal annotations and the
# test set from CSV files in the current working directory.
train_df = pd.read_csv("Swissprot_Train_Validation_dataset.csv")
signals_df = pd.read_csv("SortingSignalsSwissprot.csv")
test_df = pd.read_csv("hpa_testset.csv")

# Attach the sorting-signal columns to the training rows. The left join keeps
# every training row; rows with no match in signals_df get NaN in the signal
# columns.
train_df = pd.merge(train_df, signals_df, on="protein_id", how="left")

# Encode the subcellular location labels as integer class ids.
label_encoder = LabelEncoder()
train_df['location_encoded'] = label_encoder.fit_transform(train_df['location'])

# Encode the sorting signals (e.g. signal peptide, NLS, ...) as a binary
# indicator matrix with one row per training sample.
signal_columns = ['signal_peptide', 'NLS', 'PTS1', 'PTS2', 'TM_helix']
signal_encoder = MultiLabelBinarizer()
# MultiLabelBinarizer iterates over its input, and iterating a DataFrame
# yields its column labels rather than its rows. Passing the frame directly
# would therefore produce one output row per column name instead of one per
# sample, so hand it an explicit list of per-row value lists.
# NOTE(review): the "" placeholder used for missing values becomes a class of
# its own in signal_encoder.classes_ - confirm whether that is wanted.
train_signals = signal_encoder.fit_transform(
    train_df[signal_columns].fillna("").to_numpy().tolist()
)

# Amino-acid vocabulary: every residue letter listed below is mapped to a
# 1-based index, following its position in the string. Index 0 is not
# assigned to any letter.
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
vocab = dict(zip(amino_acids, range(1, len(amino_acids) + 1)))

# Encoder les séquences en indices
def encode_sequence(sequence, max_length=1000, mapping=None):
    """Encode a residue sequence as a fixed-length vector of integer indices.

    Args:
        sequence: Sliceable sequence of residue symbols (typically a str).
        max_length: Length of the returned vector. Longer sequences are
            truncated to their first ``max_length`` symbols; shorter ones
            are right-padded with 0.
        mapping: Optional dict from residue symbol to integer index. When
            omitted, the module-level ``vocab`` is used.

    Returns:
        A 1-D NumPy integer array of length ``max_length``. Symbols missing
        from the mapping are encoded as 0, the same value used for padding.
    """
    if mapping is None:
        mapping = vocab

    encoded = np.zeros(max_length, dtype=int)
    residues = sequence[:max_length]
    # Fill the leading positions in one assignment; the tail keeps the zero
    # padding from the initial allocation.
    encoded[:len(residues)] = np.fromiter(
        (mapping.get(aa, 0) for aa in residues), dtype=int
    )
    return encoded

# Assemble the model inputs and targets: one encoded index vector per
# sequence, the binarized signal matrix, and the integer location labels.
X_seq = np.array(list(map(encode_sequence, train_df['sequence'])))
X_signals = train_signals
y = train_df['location_encoded'].to_numpy()

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible across runs.
(
    X_seq_train, X_seq_val,
    X_signals_train, X_signals_val,
    y_train, y_val,
) = train_test_split(X_seq, X_signals, y, random_state=42, test_size=0.2)
