# Imports

In [8]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.preprocessing import Normalizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Data Prep (Training)


In [9]:
# Load the training table; the first CSV column becomes the row index.
dataTrain = pd.read_csv('TrainDataset2024.csv', index_col=0)

#MISSING DATA
# 999 is the sentinel this notebook treats as "missing".
dataTrain = dataTrain.replace(999, np.nan)

# Separate the outcomes BEFORE imputing. Imputing the whole table would
# replace a missing regression target with the column median, i.e. train on
# fabricated labels. Rows without a known target are dropped instead.
outcome_columns = ['pCR (outcome)', 'RelapseFreeSurvival (outcome)']
dataTrain = dataTrain.dropna(subset=['RelapseFreeSurvival (outcome)'])

# Regression target, kept as a single-column DataFrame (callers ravel it).
target = dataTrain[['RelapseFreeSurvival (outcome)']]
dataTrain = dataTrain.drop(columns=outcome_columns)

# Median-impute the feature columns only, so the fitted statistics describe
# features alone and stay applicable to any feature-only table.
imputer = SimpleImputer(strategy='median')
dataTrain = pd.DataFrame(
    imputer.fit_transform(dataTrain),
    columns=dataTrain.columns,
    index=dataTrain.index,
)

# ER / HER2 / Gene are set aside here and re-attached after feature
# selection, so they bypass normalisation and RFE.
key_features = dataTrain[['ER', 'HER2', 'Gene']]
dataTrain = dataTrain.drop(columns=['ER', 'HER2', 'Gene'])


In [10]:

#NORMALISATION
# Normalizer rescales each sample (row) to unit norm; it works row by row
# rather than standardising columns.
normalizer = Normalizer()
vector_normalized_data_train = normalizer.fit(dataTrain).transform(dataTrain)


In [11]:

#FEATURE REDUCTION
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by a random forest, keeping 10
# features. The forest is seeded: without a fixed random_state the selected
# features (and everything downstream) change from run to run.
rfe_class = RFE(
    estimator=RandomForestRegressor(random_state=42),
    n_features_to_select=10,
)
# np.ravel flattens the single-column target frame into the 1-D vector
# the estimator expects.
data_reduced = rfe_class.fit_transform(vector_normalized_data_train, np.ravel(target))

# Rebuild a frame on the original row index (columns are positional 0..9),
# then re-attach the key features that were held out of selection.
pca_complete_train = pd.DataFrame(data_reduced, index=dataTrain.index)
pca_complete_train = pd.concat([pca_complete_train, key_features], axis=1)


# Data Prep (Test)


In [12]:
# Load the final test table; the first CSV column becomes the row index.
dataTest = pd.read_csv('FinalTestDataset2024.csv', index_col=0)

#MISSING DATA
# 999 is the sentinel this notebook treats as "missing".
dataTest = dataTest.replace(999, np.nan)

# Fill gaps with the medians learned on the TRAINING data instead of
# re-fitting an imputer on the test set: preprocessing statistics must come
# from training only so train and test are transformed consistently.
# The medians are looked up by column name, so any extra columns the
# training imputer saw (e.g. outcomes) are simply ignored.
train_medians = pd.Series(imputer.statistics_, index=imputer.feature_names_in_)
dataTest = dataTest.fillna(train_medians).astype(float)
assert not dataTest.isna().any().any(), "test set has missing values with no training median"

# ER / HER2 / Gene are set aside here and re-attached after feature
# selection, mirroring the training preparation.
key_features = dataTest[['ER', 'HER2', 'Gene']]
dataTest = dataTest.drop(columns=['ER', 'HER2', 'Gene'])


In [13]:

#NORMALISATION
# Same per-sample (row-wise) rescaling as used for the training features.
normalizer = Normalizer()
vector_normalized_data_test = normalizer.fit(dataTest).transform(dataTest)


In [14]:

#FEATURE REDUCTION
# Reuse the selector fitted on the training data so the test set keeps
# exactly the same columns.
data_reduced_test = rfe_class.transform(vector_normalized_data_test)

# Selected features on the test row index, with the held-out key features
# appended as extra columns.
pca_complete_test = pd.concat(
    [pd.DataFrame(data_reduced_test, index=dataTest.index), key_features],
    axis=1,
)


# Training and Prediction

In [15]:
# RBF-kernel support vector regressor for the relapse-free-survival target.
model = SVR(kernel='rbf', C=1,epsilon=0.2)

# Column labels are a mix of positional integers (selected features) and
# strings (ER / HER2 / Gene); cast them all to str so they are uniform.
train_features = pca_complete_train.rename(str, axis="columns")

# Split data into 80% training and 20% hold-out
X_train, X_test, y_train, y_test = train_test_split(
    train_features, target, test_size=0.2, random_state=42
)

# Train the model on the training portion
model.fit(X_train, np.ravel(y_train))

# Score the hold-out portion so the split is actually used: mean absolute
# error, in the target's own units.
holdout_predictions = model.predict(X_test)
holdout_mae = np.mean(np.abs(np.ravel(y_test) - holdout_predictions))
print(f"Hold-out MAE: {holdout_mae:.3f}")

# Predict on the final (unlabelled) test dataset
data = pca_complete_test.rename(str, axis="columns")
predictions = model.predict(data)

# One row per test sample: its index label and the predicted value.
results = pd.DataFrame({
    'Index': data.index,
    'Prediction': predictions,
})

results.to_csv('RFSPrediction.csv', index=False)