# Split the data first

In [1]:
# Load data
import duckdb

# Open the source database read-only: this notebook only reads from it, and
# read-only mode avoids silently creating an empty database file when the
# path is wrong. The context manager closes the connection even if the
# query raises.
with duckdb.connect("../database/prediction_data.duckdb", read_only=True) as con:
    # Materialise the whole table as a pandas DataFrame before the connection closes
    df = con.sql("SELECT * FROM prediction_data").df()

In [2]:
import duckdb
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Create the output directory for the ML artefacts if it is missing
os.makedirs("../database/ML", exist_ok=True)

# Work on a copy so the frame loaded from DuckDB stays untouched
df_clean = df.copy()

# Separate the target column from the feature columns
y = df_clean['company_response']
X = df_clean.drop(columns='company_response')

# Hold out 30% of the rows as the test set; the fixed seed keeps the
# shuffled split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)

# Wrap each target Series in a single-column DataFrame
y_train = y_train.to_frame()
y_test = y_test.to_frame()

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 203816
Test size: 87351


# Encoding
- product: onehot_encoding
- sub_product: target_encoding
- issue: target_encoding
- company: onehot_encoding
- state: target_encoding
- submitted_via: onehot_encoding
- consumer_consent_provided: onehot_encoding

Response variable: company_response


In [3]:
import pandas as pd
import duckdb
import pickle
import os

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline


### Define columns for each encoding type

In [4]:
# Categorical features that will be one-hot encoded
onehot_cols = ["product", "company", "submitted_via", "consumer_consent_provided"]

# Categorical features that will be target encoded
target_cols = ["sub_product", "issue", "state"]


### Build the ColumnTransformer

In [5]:
# Single-step pipeline for one-hot encoding; categories that were not seen
# during fit are encoded as all zeros instead of raising an error
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

# Single-step pipeline for target encoding, using the encoder's default settings
target_encoder = TargetEncoder()
target_transformer = Pipeline(steps=[('target', target_encoder)])

In [6]:
# Route each column group to its encoder pipeline; columns that appear in
# neither group are passed through unchanged
encoder_steps = [
    ("ohe", onehot_transformer, onehot_cols),
    ("te", target_transformer, target_cols),
]
preprocessor = ColumnTransformer(transformers=encoder_steps, remainder="passthrough")

### Fit + Transform

In [7]:
import joblib
import pandas as pd
import os

# Fit the encoders on the training split only, then apply the fitted
# encoders to the test split
X_train_transformed = preprocessor.fit_transform(X_train, y_train)
print(f"Shape X_train: {X_train_transformed.shape}")

X_test_transformed = preprocessor.transform(X_test)
print(f"Shape X_test: {X_test_transformed.shape}")

# Output column names as reported by the fitted transformer
feature_names = preprocessor.get_feature_names_out()

# Densify sparse output so it can be wrapped in a regular DataFrame
from scipy import sparse
train_is_sparse = sparse.issparse(X_train_transformed)
if train_is_sparse:
    print("⚠️ X_train_transformed is sparse matrix!")
    X_train_transformed, X_test_transformed = (
        X_train_transformed.toarray(),
        X_test_transformed.toarray(),
    )
    print(f"Converted to dense array. New shape: {X_train_transformed.shape}")

# Both frames share the same column labels; rows get a fresh default index
X_train_final_df = pd.DataFrame(data=X_train_transformed, columns=feature_names)
X_test_final_df = pd.DataFrame(data=X_test_transformed, columns=feature_names)

Shape X_train: (203816, 36)
Shape X_test: (87351, 36)
⚠️ X_train_transformed is sparse matrix!
Converted to dense array. New shape: (203816, 36)


In [8]:
# Encode the target labels as integers for classification
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# LabelEncoder expects a 1-D array. y_train / y_test are single-column
# DataFrames at this point, so flatten them first; passing the 2-D frames
# makes scikit-learn emit a DataConversionWarning.
y_train_labels = y_train.to_numpy().ravel()
y_test_labels = y_test.to_numpy().ravel()

# Learn the class -> integer mapping on the training labels only (the sorted
# classes are mapped to 0 .. n_classes - 1), then apply that same mapping to
# the test labels.
y_train_encoded = le.fit_transform(y_train_labels)
y_test_encoded = le.transform(y_test_labels)

# NOTE: this overwrites y_train / y_test, so the cell is not idempotent:
# re-running it refits `le` on the already-encoded integers. Re-run the
# split cell first if this cell has to be executed again.
y_train = pd.DataFrame(y_train_encoded, columns=['company_response'])
y_test = pd.DataFrame(y_test_encoded, columns=['company_response'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## Save the pipeline + Train and Test data

In [9]:
# Directory that holds the fitted preprocessing artefacts
models_dir = "../src/models"
os.makedirs(models_dir, exist_ok=True)

# Persist the fitted ColumnTransformer
joblib.dump(preprocessor, f"{models_dir}/preprocessor.pkl")

# Persist the fitted target label encoder (last expression, so the written
# path list is displayed as the cell output)
joblib.dump(le, f"{models_dir}/target_label_encoder.pkl")

['../src/models/target_label_encoder.pkl']

In [10]:
# Display the current working directory; the relative "../" paths used in
# this notebook are resolved against it.
import os
os.getcwd()

'/home/hoan/VSCode/projects/local_elt_pipeline/notebooks'

In [11]:
def _save_table(db_path, table_name, frame):
    """Write a DataFrame to a DuckDB database file as a single table.

    Parameters
    ----------
    db_path : str
        Path of the DuckDB file to open (created if it does not exist).
    table_name : str
        Name of the table to create; an existing table with that name is
        replaced.
    frame : pandas.DataFrame
        Data to store.
    """
    # The context manager closes the connection even if the write fails
    with duckdb.connect(db_path) as con:
        # Register the frame under a fixed view name so the query does not
        # depend on which Python variable name the caller used
        con.register("frame_view", frame)
        con.execute(
            f'CREATE OR REPLACE TABLE "{table_name}" AS SELECT * FROM frame_view'
        )


# Save X
_save_table("../database/ML/X_train.duckdb", "X_train", X_train_final_df)
_save_table("../database/ML/X_test.duckdb", "X_test", X_test_final_df)

# Save y
_save_table("../database/ML/y_train.duckdb", "y_train", y_train)
_save_table("../database/ML/y_test.duckdb", "y_test", y_test)