In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
from utils.pca_utils import k_components
import os

In [9]:
# Load the pre-split train / validation / test tables from the working directory.
# (Plain string literals: the original f-prefixes had no placeholders.)
df_train = pd.read_csv('train.csv')
df_val = pd.read_csv('val.csv')
df_test = pd.read_csv('test.csv')

# Name of the label column; it is dropped from the features further down.
target = 'What is your intake?'

In [10]:
# Separate the label column from the feature columns in each split.
y_train = df_train[target]
X_train = df_train.drop(columns=target)

y_val = df_val[target]
X_val = df_val.drop(columns=target)

y_test = df_test[target]
X_test = df_test.drop(columns=target)

# Fit the scaler on the training features only, then reuse that fitted scaler
# unchanged for the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_val, X_test)
)

# Number of principal components to keep, as chosen by the project helper.
# NOTE(review): k_components is given the raw (unscaled) df_train and 0.9 --
# presumably a 90% explained-variance threshold; confirm it standardises
# internally so that k matches the scaled data PCA is fitted on below.
k = k_components(target, df_train, 0.9)

# Pin the seed: scikit-learn's PCA uses random_state whenever the 'arpack' or
# 'randomized' solver is selected, so without it repeated runs may differ.
PCA_RANDOM_STATE = 42

pca = PCA(n_components=k, random_state=PCA_RANDOM_STATE)
# Fit on the training split only; validation/test are projected with that fit.
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

def _to_pca_frame(components, labels, label_name):
    """Wrap a projected feature matrix in a DataFrame and append its labels.

    Args:
        components: 2-D array returned by the PCA projection.
        labels: pandas Series holding the label for each row of ``components``.
        label_name: column name under which ``labels`` is stored.

    Returns:
        DataFrame with columns ``PC1..PCn`` (n = number of columns actually
        present in ``components``) followed by the ``label_name`` column.
    """
    # Name columns from the matrix's real width instead of an external counter,
    # so the names always match the data.
    frame = pd.DataFrame(
        components,
        columns=[f'PC{i+1}' for i in range(components.shape[1])],
    )
    # Discard the labels' original index so they line up positionally with the
    # fresh 0..n-1 index of the new frame.
    frame[label_name] = labels.reset_index(drop=True)
    return frame


train_pca = _to_pca_frame(X_train_pca, y_train, target)
val_pca = _to_pca_frame(X_val_pca, y_val, target)
test_pca = _to_pca_frame(X_test_pca, y_test, target)

# Write the projected splits to the working directory; index=False keeps the
# row index out of the files. (Plain literals: the f-prefixes had no placeholders.)
train_pca.to_csv("train_pca.csv", index=False)
val_pca.to_csv("val_pca.csv", index=False)
test_pca.to_csv("test_pca.csv", index=False)

# Checkpoint location for the fitted PCA object.
folder_name, model_file_name = "pca_ckpt", "pca_model.pkl"

# Make sure the checkpoint folder is there (no error if it already exists).
os.makedirs(folder_name, exist_ok=True)
path = os.path.join(folder_name, model_file_name)

# Persist both fitted transformers. The scaler is written to the working
# directory, while the PCA model goes into the checkpoint folder.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, path)


k = 14


['pca_ckpt\\pca_model.pkl']