In [1]:
import pandas as pd
import numpy as np
import itertools


In [2]:
# Load the raw training and test CSVs into DataFrames.
try:
    df_train = pd.read_csv('../data/train.csv')
    df_test = pd.read_csv('../data/test.csv')
except FileNotFoundError:
    print("Error: files not found")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel
    # to shut down, and as a plain script it exits with status 0, so the
    # failure (and the path that was missing) would be hidden. Re-raising
    # stops execution here and keeps the original traceback.
    raise

# Column dtypes / non-null counts, then a preview of the first rows.
df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Id                    8000 non-null   int64 
 1   Therapy Hours         8000 non-null   int64 
 2   Initial Health Score  8000 non-null   int64 
 3   Lifestyle Activities  8000 non-null   object
 4   Average Sleep Hours   8000 non-null   int64 
 5   Follow-Up Sessions    8000 non-null   int64 
 6   Recovery Index        8000 non-null   int64 
dtypes: int64(6), object(1)
memory usage: 437.6+ KB


Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,5,49,No,7,5,36
1,1562,2,48,Yes,7,6,25
2,1671,2,81,No,7,2,59
3,6088,2,46,No,6,1,22
4,6670,8,47,No,9,0,40


In [3]:
# Keep the test 'Id' column aside; it is dropped from the feature frames below.
test_ids = df_test['Id']

# drop() returns new frames, so df_train / df_test themselves are not modified.
df_train_id_dropped = df_train.drop('Id', axis=1)
df_test_id_dropped = df_test.drop('Id', axis=1)

# Encode 'Lifestyle Activities' as 1 for 'Yes' and 0 for every other value.
# A vectorized comparison replaces the per-row lambda; the explicit int64 cast
# pins the resulting dtype instead of leaving it to inference.
for df in [df_train_id_dropped, df_test_id_dropped]:
    df['Lifestyle Activities'] = df['Lifestyle Activities'].eq('Yes').astype('int64')

df_train_id_dropped.info()
df_train_id_dropped.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Therapy Hours         8000 non-null   int64
 1   Initial Health Score  8000 non-null   int64
 2   Lifestyle Activities  8000 non-null   int64
 3   Average Sleep Hours   8000 non-null   int64
 4   Follow-Up Sessions    8000 non-null   int64
 5   Recovery Index        8000 non-null   int64
dtypes: int64(6)
memory usage: 375.1 KB


Unnamed: 0,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,5,49,0,7,5,36
1,2,48,1,7,6,25
2,2,81,0,7,2,59
3,2,46,0,6,1,22
4,8,47,0,9,0,40


In [4]:
# --- Feature Engineering: Add Squared Features ---

print("Original data shape:", df_train_id_dropped.shape)

# Numeric columns that receive a squared companion column; the 0/1
# 'Lifestyle Activities' flag is deliberately left out.
features_to_square = [
    'Therapy Hours',
    'Initial Health Score',
    'Average Sleep Hours',
    'Follow-Up Sessions',
]

# Visit every (frame, column) pair: training frame first, then test frame,
# columns in list order.
for df, col in itertools.product(
    [df_train_id_dropped, df_test_id_dropped], features_to_square
):
    new_col_name = col + "_sq"  # e.g. "Therapy Hours_sq"
    df[new_col_name] = df[col].pow(2)

print("New features added.")
print("New data shape:", df_train_id_dropped.shape)

# Preview the training frame to confirm the new columns are present.
print("\n--- Training Data with New Squared Features ---")
display(df_train_id_dropped.head())

# Persist both processed frames (without the index column), training first.
train_output_path = '../data/train_processed_v2.csv'
test_output_path = '../data/test_processed_v2.csv'

df_train_id_dropped.to_csv(train_output_path, index=False)
print(f"Processed training data saved to: {train_output_path}")

df_test_id_dropped.to_csv(test_output_path, index=False)
print(f"Processed test data saved to: {test_output_path}")

Original data shape: (8000, 6)
New features added.
New data shape: (8000, 10)

--- Training Data with New Squared Features ---


Unnamed: 0,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index,Therapy Hours_sq,Initial Health Score_sq,Average Sleep Hours_sq,Follow-Up Sessions_sq
0,5,49,0,7,5,36,25,2401,49,25
1,2,48,1,7,6,25,4,2304,49,36
2,2,81,0,7,2,59,4,6561,49,4
3,2,46,0,6,1,22,4,2116,36,1
4,8,47,0,9,0,40,64,2209,81,0


Processed training data saved to: ../data/train_processed_v2.csv
Processed test data saved to: ../data/test_processed_v2.csv


In [5]:
# --- Feature Engineering: Add 5C2 Interaction Features ---

print("Original data shape:", df_train_id_dropped.shape)

# The 5 base features whose pairwise products are added as new columns
features = ['Therapy Hours', 'Initial Health Score', 'Lifestyle Activities', 'Average Sleep Hours', 'Follow-Up Sessions']

# Every unordered pair of base features (5 choose 2 = 10 pairs)
all_combinations = list(itertools.combinations(features, 2))
print(f"Generating {len(all_combinations)} new interaction features...")

# Apply the same transformation to both the training and test DataFrames
for df in [df_train_id_dropped, df_test_id_dropped]:
    # Each pair looks like ('Therapy Hours', 'Initial Health Score')
    for col1, col2 in all_combinations:

        # Spaces become underscores, e.g. "Therapy_Hours_x_Initial_Health_Score"
        new_col_name = f"{col1}_x_{col2}".replace(' ', '_')

        # The interaction feature is the element-wise product of the two columns
        df[new_col_name] = df[col1] * df[col2]

# Report the computed pair count rather than a hard-coded 10, so the message
# stays correct if `features` is edited.
print(f"All {len(all_combinations)} new interaction features added.")
print("New data shape:", df_train_id_dropped.shape)

# Show the first few rows to confirm the new columns
print("\n--- Training Data with New Interaction Features ---")
display(df_train_id_dropped.head())

Original data shape: (8000, 10)
Generating 10 new interaction features...
All 10 new interaction features added.
New data shape: (8000, 20)

--- Training Data with New Interaction Features ---


Unnamed: 0,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index,Therapy Hours_sq,Initial Health Score_sq,Average Sleep Hours_sq,Follow-Up Sessions_sq,Therapy_Hours_x_Initial_Health_Score,Therapy_Hours_x_Lifestyle_Activities,Therapy_Hours_x_Average_Sleep_Hours,Therapy_Hours_x_Follow-Up_Sessions,Initial_Health_Score_x_Lifestyle_Activities,Initial_Health_Score_x_Average_Sleep_Hours,Initial_Health_Score_x_Follow-Up_Sessions,Lifestyle_Activities_x_Average_Sleep_Hours,Lifestyle_Activities_x_Follow-Up_Sessions,Average_Sleep_Hours_x_Follow-Up_Sessions
0,5,49,0,7,5,36,25,2401,49,25,245,0,35,25,0,343,245,0,0,35
1,2,48,1,7,6,25,4,2304,49,36,96,2,14,12,48,336,288,7,6,42
2,2,81,0,7,2,59,4,6561,49,4,162,0,14,4,0,567,162,0,0,14
3,2,46,0,6,1,22,4,2116,36,1,92,0,12,2,0,276,46,0,0,6
4,8,47,0,9,0,40,64,2209,81,0,376,0,72,0,0,423,0,0,0,0


In [6]:
# Destination files for the processed frames and the test Ids.
train_output_path = '../data/train_processed_v4.csv'
test_output_path = '../data/test_processed_v4.csv'
ids_output_path = '../data/test_ids.csv'

# Each job: (object to write, destination path, confirmation message).
# Order matters only for the printed log: training, test, then the Ids.
save_jobs = [
    (df_train_id_dropped, train_output_path,
     f"Processed training data saved to: {train_output_path}"),
    (df_test_id_dropped, test_output_path,
     f"Processed test data saved to: {test_output_path}"),
    (test_ids, ids_output_path,
     f"Test IDs saved to: {ids_output_path}"),
]

for payload, destination, confirmation in save_jobs:
    # index=False keeps the row index out of the written CSV.
    payload.to_csv(destination, index=False)
    print(confirmation)

Processed training data saved to: ../data/train_processed_v4.csv
Processed test data saved to: ../data/test_processed_v4.csv
Test IDs saved to: ../data/test_ids.csv
