In [3]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition        import TruncatedSVD
from sentence_transformers        import SentenceTransformer
from sklearn.preprocessing       import OneHotEncoder, StandardScaler
from scipy.sparse                import hstack, csr_matrix


In [4]:
# Configuration & Model Initialization

# Paths
PROJECT_ROOT   = Path.cwd().parent
CLEAN_DIR      = PROJECT_ROOT / "preprocessed_data"
OUT_DIR        = PROJECT_ROOT / "feature_matrices"
OUT_DIR.mkdir(exist_ok=True)

# Feature settings
TFIDF_MAX_FEAT = 10_000
TFIDF_NGRAMS   = (1, 2)
SVD_COMPONENTS = 300
EMBED_MODEL    = "all-MiniLM-L6-v2"

NUM_COLS = [
    "goal",
    "pledged",
    "usd_goal_real",
    "usd_pledged_real",
    "campaign_duration_days",
]
CAT_COLS = ["main_category", "state"]

# Instantiate transformers
tfidf    = TfidfVectorizer(max_features=TFIDF_MAX_FEAT,
                           ngram_range=TFIDF_NGRAMS,
                           stop_words="english")
svd      = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=42)
embedder = SentenceTransformer(EMBED_MODEL)
scaler   = StandardScaler()
#  Use `sparse_output=True` instead of `sparse=True`
ohe      = OneHotEncoder(handle_unknown="ignore", sparse_output=True)


In [6]:
# Automatic Numeric & Categorical Selection

import pandas as pd
from scipy.sparse import csr_matrix

# Inspect loaded df columns
print("All columns:", df.columns.tolist())

# 1) Identify text column we vectorized
text_col = "blurb" if "blurb" in df.columns else "name"

# 2) Numeric columns: any pandas numeric dtype, except 'id'
num_cols = [
    c for c in df.select_dtypes(include=["number"]).columns
    if c not in ("id",)
]
print("Numeric columns detected:", num_cols)

# 3) Categorical columns: any object dtype, excluding the text_col
cat_cols = [
    c for c in df.select_dtypes(include=["object"]).columns
    if c != text_col
]
print("Categorical columns detected:", cat_cols)

# 4) Scale numerics
if num_cols:
    X_num = scaler.fit_transform(df[num_cols])
    print("  • Numeric matrix shape:", X_num.shape)
else:
    X_num = np.empty((len(df), 0))
    print("  • No numeric columns.")

# 5) One-hot encode categoricals
if cat_cols:
    X_cat = ohe.fit_transform(df[cat_cols])
    print("  • Categorical matrix shape:", X_cat.shape)
else:
    X_cat = csr_matrix((len(df), 0))
    print("  • No categorical columns.")


All columns: ['id', 'name', 'category', 'main_category', 'currency', 'deadline', 'goal', 'launched', 'pledged', 'state', 'backers', 'country', 'usd_pledged']
Numeric columns detected: ['goal', 'pledged']
Categorical columns detected: ['category', 'main_category', 'currency', 'deadline', 'launched', 'state', 'backers', 'country', 'usd_pledged']
  • Numeric matrix shape: (323750, 2)
  • Categorical matrix shape: (323750, 717078)


In [7]:
# Combine SVD, Embeddings, Numeric & Categorical into X_all

# 1) Sparse block: SVD components (dense→sparse) + one-hot cats
X_sparse = hstack([csr_matrix(X_svd), X_cat]).tocsr()

# 2) Dense block: numeric + embeddings → convert to sparse
X_dense_sparse = csr_matrix(np.hstack([X_num, X_emb]))

# 3) Final concatenation
X_all = hstack([X_sparse, X_dense_sparse]).tocsr()
print("Combined feature matrix shape:", X_all.shape)


Combined feature matrix shape: (323750, 717764)
