In [6]:
# In test_preprocess.ipynb

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler

# --- Step 1: Load Raw Data ---
# Read both splits in one pass: train.csv first, then test.csv.
# The training split is loaded here as well because the scaler further down
# is fitted on it.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# --- Step 2: Preprocess Both DataFrames ---
def _encode_yes_no(frame, column, frame_name):
    """Return ``frame[column]`` with 'Yes' encoded as 1 and 'No' as 0.

    ``Series.map`` with a dict yields NaN for every value that has no entry
    in the dict, so a stray label (e.g. 'yes' or ' No') would silently become
    a missing value and flow into the scaled output. This helper raises
    instead, listing the labels it could not encode. Values that are already
    missing in the input are left as missing, as before.

    Parameters
    ----------
    frame : pandas.DataFrame
        DataFrame that holds the column to encode. It is not modified.
    column : str
        Name of the 'Yes'/'No' column.
    frame_name : str
        Label for ``frame`` used only in the error message.

    Returns
    -------
    pandas.Series
        The encoded column, indexed like ``frame``.

    Raises
    ------
    ValueError
        If the column contains a non-missing value other than 'Yes' or 'No'.
    """
    raw = frame[column]
    encoded = raw.map({'Yes': 1, 'No': 0})

    # Rows that were present in the input but came out of the mapping as NaN
    # are labels the mapping does not know about.
    unmapped = raw[encoded.isna() & raw.notna()]
    if not unmapped.empty:
        raise ValueError(
            f"Unexpected values in {frame_name}[{column!r}]: "
            f"{list(unmapped.unique())!r}; expected only 'Yes' or 'No'."
        )
    return encoded


# Map 'Lifestyle Activities' in both dataframes from categorical to numerical
train_df['Lifestyle Activities'] = _encode_yes_no(train_df, 'Lifestyle Activities', 'train_df')
test_df['Lifestyle Activities'] = _encode_yes_no(test_df, 'Lifestyle Activities', 'test_df')

# --- Step 3: Fit Scaler and Transform Test Data ---
# Columns that are passed through the scaler, in output order
feature_cols = [
    'Therapy Hours',
    'Initial Health Score',
    'Lifestyle Activities',
    'Average Sleep Hours',
    'Follow-Up Sessions',
]

# Build the StandardScaler and fit it ONLY on the training data.
# fit() returns the fitted scaler, so construction and fitting are chained.
scaler = StandardScaler().fit(train_df[feature_cols])

# Scale the test features with the statistics learned from the training data
test_data_scaled = scaler.transform(test_df[feature_cols])

# --- Step 4: Create and Save the Processed Test DataFrame ---
# Convert the scaled test data back to a DataFrame.
# The frame reuses test_df's index on purpose: insert() below aligns the 'Id'
# Series on index labels, so a fresh 0..n-1 index would only pair rows with the
# right Id while test_df itself still has a default index. Sharing the index
# keeps every scaled row attached to its own Id regardless.
processed_test_df = pd.DataFrame(
    test_data_scaled,
    columns=feature_cols,
    index=test_df.index,
)

# IMPORTANT: Add the 'Id' column back for the submission file
processed_test_df.insert(0, 'Id', test_df['Id'])

# Save the final processed test data to a new CSV file (index is not written)
processed_test_df.to_csv('processed_test.csv', index=False)

print("'processed_test.csv' created successfully.")
# Last expression of the cell: display the first rows as a quick visual check
processed_test_df.head()

'processed_test.csv' created successfully.


Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions
0,6253,0.006455,-0.030618,-0.989307,0.858683,-0.91452
1,4685,-1.149747,-1.356828,1.010808,-1.496494,1.182603
2,1732,0.777257,-0.780215,1.010808,0.269888,0.134041
3,4743,0.391856,-1.587474,1.010808,0.858683,0.134041
4,4522,0.777257,-0.953199,-0.989307,-1.496494,0.483562
