In [22]:
import pathlib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# =================================================================
# CONFIGURATION
# =================================================================
# Directory layout: raw inputs are read from ./data and derived artifacts are
# written to ./data/processed, which is created up front (parents included).
ROOT = pathlib.Path(".")
DATA = ROOT.joinpath("data")
PROC = DATA.joinpath("processed")
PROC.mkdir(parents=True, exist_ok=True)

# Opening banner for the run log.
for banner_line in ("="*70, "COMPLETE EMBEDDING PIPELINE - TASK 1.1 & 1.2", "="*70):
    print(banner_line)

# =================================================================
# LOAD DATA
# =================================================================
print("\n[1/8] Loading raw data...")

raw_path = DATA / "CPS_Data.csv"
# Validate the input explicitly rather than with `assert`: assertions are
# removed when Python runs with -O, which would silently disable this check.
if not raw_path.exists():
    raise FileNotFoundError(f"Expected raw file at {raw_path}")

# Read the raw CSV into the working frame used by every later step.
df = pd.read_csv(raw_path)
print(f"‚úì Loaded: {raw_path.name}")
print(f"  Rows: {len(df)}, Columns: {len(df.columns)}")


# CLEAN PERCENTAGE-LIKE STRINGS (e.g., "96.0%") -------------------
# An object column is treated as a percentage column when more than half of
# its non-null values end with '%'. Such columns are converted to floats.
for col in df.columns:
    if df[col].dtype != object:
        continue

    non_null = df[col].dropna().astype(str).str.strip()
    if non_null.empty:
        continue
    if non_null.str.endswith('%').mean() <= 0.5:
        continue

    stripped = (
        df[col]
        .astype(str)
        .str.strip()
        .str.replace('%', '', regex=False)
        .str.strip()
    )
    # Perform the numeric conversion itself (previously the '%' was removed
    # but the column stayed as strings). errors='coerce' also turns the
    # literal 'nan' that astype(str) produces for missing cells, and any
    # other non-numeric leftovers, back into real NaN values.
    df[col] = pd.to_numeric(stripped, errors='coerce')

# =================================================================
# TASK 1.1: CONSTRUCT EMBEDDINGS
# =================================================================
print("\n" + "="*70)
print("TASK 1.1: CONSTRUCTING EMBEDDINGS")
print("="*70)

# Use the first known identifier column that the loaded data contains.
school_col = next(
    (candidate for candidate in ('School ID', 'school', 'name', 'id')
     if candidate in df.columns),
    None,
)
if not school_col:
    raise ValueError("No school ID column found")

# Expose the identifier under a canonical 'school' column, as strings.
df['school'] = df[school_col].astype(str)
print(f"\n[2/8] School identifier: '{school_col}' ‚Üí 'school'")

def _to_level_code(raw):
    """Collapse a raw level label to 'ES', 'MS' or 'HS'; return it unchanged otherwise."""
    lowered = raw.lower()
    if 'elem' in lowered or lowered == 'es':
        return 'ES'
    if 'middle' in lowered or lowered == 'ms':
        return 'MS'
    if 'high' in lowered or lowered == 'hs':
        return 'HS'
    return raw


# Use the first known level column present in the data (None when absent).
level_col = next(
    (name for name in ('Elementary, Middle, or High School', 'level', 'type')
     if name in df.columns),
    None,
)

if level_col:
    # Values are stringified first, then normalised to ES/MS/HS where recognised.
    df['level'] = df[level_col].astype(str).apply(_to_level_code)
    print(f"‚úì Level column: '{level_col}' ‚Üí {df['level'].unique()}")

# =================================================================
# SELECT NUMERICAL FEATURES
# =================================================================
print(f"\n[3/8] Selecting numerical features...")

# Identifier, location and contact columns that must not become features.
exclude = [school_col, level_col, 'school', 'level', 'School ID', 'Name of School',
           'Latitude', 'Longitude', 'X_COORDINATE', 'Y_COORDINATE',
           'Street Address', 'City', 'State', 'ZIP Code', 'Phone Number',
           'RCDTS Code', 'Ward', 'Police District', 'Community Area Number',
           'College Enrollment (number of students)', 'Location', 'Link']

# A column qualifies as a feature when more than half of all rows parse as
# numbers. The parsed values are kept so each column is converted only once.
num_cols = []
numeric_values = {}
min_valid = len(df) * 0.5
for col in df.columns:
    if col in exclude:
        continue
    try:
        converted = pd.to_numeric(df[col], errors='coerce')
    except (TypeError, ValueError) as err:
        # Only conversion failures are tolerated (and reported); a bare
        # `except` would also hide interrupts and genuine programming errors.
        print(f"  Skipping '{col}': not convertible to numeric ({err})")
        continue
    if converted.notna().sum() > min_valid:
        num_cols.append(col)
        numeric_values[col] = converted

print(f"‚úì Found {len(num_cols)} numerical features")

# Store the parsed numeric values back on the working frame.
for col in num_cols:
    df[col] = numeric_values[col]

# =================================================================
# CREATE EMBEDDINGS DATAFRAME
# =================================================================
print(f"\n[4/8] Creating embeddings...")

# Start from the school identifier; the frame shares df's index.
embeddings = pd.DataFrame({'school': df['school']})

# Numerical features, with gaps filled by the column median.
for feature in num_cols:
    column = df[feature]
    embeddings[feature] = column.fillna(column.median())

# One indicator column per school level (only when a level was derived).
if 'level' in df.columns:
    for code in ('ES', 'MS', 'HS'):
        embeddings[f'level_{code}'] = df['level'].eq(code).astype(int)

# Short dashboard names mapped to the column names they may appear under.
feature_map = {
    'safety': ['Safety Score', 'safety'],
    'attendance': ['Average Student Attendance', 'attendance'],
    'misconduct': ['Rate of Misconducts (per 100 students)', 'Rate of Misconducts (per 100 students) ', 'misconduct'],
    'instr': ['Instruction Score', 'instr'],
    'teachers': ['Teachers Score', 'teachers'],
    'leaders': ['Leaders Score', 'leaders']
}

# Resolve each short name to the first candidate column that exists in df;
# short names with no matching column are left out.
found_features = {}
for short_name, candidates in feature_map.items():
    match = next((name for name in candidates if name in df.columns), None)
    if match is not None:
        found_features[short_name] = match

# Behavioral composite: median-filled safety minus ten times the
# median-filled misconduct rate; any value still missing becomes 0.
if 'safety' in found_features and 'misconduct' in found_features:
    safety_raw = df[found_features['safety']]
    misconduct_raw = df[found_features['misconduct']]
    safety_filled = safety_raw.fillna(safety_raw.median())
    misconduct_filled = misconduct_raw.fillna(misconduct_raw.median())
    vals = safety_filled - misconduct_filled * 10
    embeddings['behavioral_composite'] = vals.fillna(0)

print(f"‚úì Embedding shape: {embeddings.shape}")

# =================================================================
# NORMALIZE AND SCALE
# =================================================================
print(f"\n[5/8] Normalizing embeddings...")

# Split the identifier column from the feature matrix.
school_ids = embeddings['school'].values
X = embeddings.drop(columns=['school']).values

# Replace any remaining NaN or infinite entries with 0.
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

# Fit the scaler on the feature matrix, then apply it to the same matrix.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

print(f"‚úì Standardized to mean=0, std=1")
print(f"  Shape: {X_scaled.shape}")

# TASK 1.1 OUTPUT: the scaled features under generic feat_<i> names,
# preceded by the school id and (when available) the level.
feature_names = [f'feat_{i}' for i in range(X_scaled.shape[1])]
emb_full = pd.DataFrame(X_scaled, columns=feature_names)
if 'level' in df.columns:
    emb_full.insert(0, 'level', df['level'].values)
emb_full.insert(0, 'school', school_ids)

# Append the median-filled original values of the resolved features under
# their short names.
for short_name, full_name in found_features.items():
    source = df[full_name]
    emb_full[short_name] = source.fillna(source.median()).values

emb_full_path = PROC / "embeddings_cps.csv"
emb_full.to_csv(emb_full_path, index=False)
print(f"\n‚úì TASK 1.1 OUTPUT: {emb_full_path.name}")
print(f"  Rows: {len(emb_full)}, Columns: {len(emb_full.columns)}")

# =================================================================
# TASK 1.2: PCA PROJECTION TO 2D
# =================================================================
print("\n" + "="*70)
print("TASK 1.2: PROJECTING TO 2D WITH PCA")
print("="*70)

print(f"\n[6/8] Applying PCA...")

# Project the standardized feature matrix onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

variance_ratio = pca.explained_variance_ratio_
print(f"‚úì PCA complete")
print(f"  PC1 variance: {variance_ratio[0]*100:.2f}%")
print(f"  PC2 variance: {variance_ratio[1]*100:.2f}%")
print(f"  Total: {variance_ratio.sum()*100:.2f}%")

# 2D frame: school id first, then the two component scores as x and y.
emb_2d = pd.DataFrame(X_pca, columns=['x', 'y'])
emb_2d.insert(0, 'school', school_ids)

if 'level' in df.columns:
    emb_2d['level'] = df['level'].values

# Median-filled original features under their SHORT names for the dashboard.
for short_name, full_name in found_features.items():
    source = df[full_name]
    emb_2d[short_name] = source.fillna(source.median()).values

# TASK 1.2 OUTPUT
emb_2d_path = PROC / "embeddings_cps_2d.csv"
emb_2d.to_csv(emb_2d_path, index=False)
print(f"\n‚úì TASK 1.2 OUTPUT: {emb_2d_path.name}")
print(f"  Rows: {len(emb_2d)}, Columns: {len(emb_2d.columns)}")

# =================================================================
# ENHANCEMENT: ADD OUTLIER SCORES, CLUSTERS, ETC.
# =================================================================
print("\n" + "="*70)
print("ENHANCEMENTS")
print("="*70)

print(f"\n[7/8] Adding outlier scores...")

# Prefer the school metrics that are present; fall back to the PCA
# coordinates when none of them exist.
outlier_features = [
    name for name in ('safety', 'misconduct', 'instr') if name in emb_2d.columns
] or ['x', 'y']

X_outlier = emb_2d[outlier_features].fillna(0).values
X_outlier_scaled = StandardScaler().fit_transform(X_outlier)

# Request up to 11 neighbours, capped at the number of rows.
neighbor_count = min(11, len(emb_2d))
nbrs = NearestNeighbors(n_neighbors=neighbor_count).fit(X_outlier_scaled)
distances, _ = nbrs.kneighbors(X_outlier_scaled)

# Mean distance over every neighbour column except the first
# (presumably the first column is each point's own zero-distance match).
outlier_scores = distances[:, 1:].mean(axis=1)

# Min-max rescale to 0-100; a flat distribution gets a neutral 50.
outlier_min, outlier_max = outlier_scores.min(), outlier_scores.max()
emb_2d['outlier_score'] = (
    (outlier_scores - outlier_min) / (outlier_max - outlier_min) * 100
    if outlier_max > outlier_min
    else 50.0
)

print(f"‚úì Outlier scores calculated")

# Clusters
# Each PCA axis is split into terciles (Low/Mid/High) and the two labels are
# joined into one 'cluster' value such as 'Low-High'.
print(f"Adding cluster labels...")
try:
    emb_2d['cluster_x'] = pd.qcut(emb_2d['x'], q=3, labels=['Low', 'Mid', 'High'], duplicates='drop')
    emb_2d['cluster_y'] = pd.qcut(emb_2d['y'], q=3, labels=['Low', 'Mid', 'High'], duplicates='drop')
    emb_2d['cluster'] = emb_2d['cluster_x'].astype(str) + '-' + emb_2d['cluster_y'].astype(str)
    print(f"‚úì Created {emb_2d['cluster'].nunique()} clusters")
except (ValueError, TypeError) as err:
    # Binning can fail, e.g. when duplicate quantile edges are dropped and
    # fewer than three bins remain for the three labels. Keep the best-effort
    # fallback, but catch only binning errors and report the reason instead
    # of hiding every exception behind a bare `except`.
    print(f"  Could not derive tercile clusters ({err}); using 'Unknown'")
    emb_2d['cluster'] = 'Unknown'

# Behavior score
# safety - 10 * misconduct (+ attendance when present), min-max rescaled to
# 0-100. A neutral 50 is used when inputs are missing or all values are equal.
print(f"Calculating behavior score...")
if 'safety' not in emb_2d.columns or 'misconduct' not in emb_2d.columns:
    emb_2d['behavior_score'] = 50.0
else:
    behavior = emb_2d['safety'] - emb_2d['misconduct'] * 10
    if 'attendance' in emb_2d.columns:
        behavior = behavior + emb_2d['attendance']

    b_min, b_max = behavior.min(), behavior.max()
    emb_2d['behavior_score'] = (
        (behavior - b_min) / (b_max - b_min) * 100 if b_max > b_min else 50.0
    )
    score = emb_2d['behavior_score']
    print(f"‚úì Behavior score: {score.min():.1f} to {score.max():.1f}")

# Academic score
# The median-filled instruction value, or a neutral 50 when it is absent.
if 'instr' not in emb_2d.columns:
    emb_2d['academic_score'] = 50.0
else:
    instr = emb_2d['instr']
    emb_2d['academic_score'] = instr.fillna(instr.median())
    print(f"‚úì Academic score added")

# Deviations by level
# For each available metric, store how far a school sits from the mean of
# its own level group.
if 'level' in emb_2d.columns:
    dev_sources = [
        name for name in ('safety', 'instr', 'misconduct') if name in emb_2d.columns
    ]
    for name in dev_sources:
        level_mean = emb_2d.groupby('level')[name].transform('mean')
        emb_2d[f'{name}_dev'] = emb_2d[name] - level_mean

# Write the enhanced 2D embeddings to disk.
emb_2d_enh_path = PROC / "embeddings_cps_2d_enhanced.csv"
emb_2d.to_csv(emb_2d_enh_path, index=False)
print(f"\n‚úì ENHANCED OUTPUT: {emb_2d_enh_path.name}")
print(f"  Rows: {len(emb_2d)}, Columns: {len(emb_2d.columns)}")

# =================================================================
# SPATIAL DATA ENRICHMENT
# =================================================================
print(f"\n[8/8] Processing spatial data...")

# Optional step: runs only when a processed spatial file is already on disk.
spatial_path = PROC / "cps_spatial.csv"
if not spatial_path.exists():
    print("‚ö†Ô∏è  No spatial file found, skipping")
else:
    spatial = pd.read_csv(spatial_path)

    # Join keys are compared as strings on both sides.
    spatial['school'] = spatial['school'].astype(str)

    # Carry over only those enhancement columns that were actually produced.
    wanted = ('school', 'cluster', 'outlier_score', 'behavior_score', 'academic_score')
    merge_cols = [name for name in wanted if name in emb_2d.columns]

    # Left join: every spatial row is kept, enriched where a school matches.
    spatial_enh = pd.merge(spatial, emb_2d[merge_cols], on='school', how='left')

    spatial_enh_path = PROC / "cps_spatial_enhanced.csv"
    spatial_enh.to_csv(spatial_enh_path, index=False)
    print(f"‚úì SPATIAL OUTPUT: {spatial_enh_path.name}")
    print(f"  Rows: {len(spatial_enh)}, Columns: {len(spatial_enh.columns)}")

# =================================================================
# SUMMARY
# =================================================================
print("\n" + "="*70)
print("COMPLETE!")
print("="*70)

# Recap of the generated artifacts, printed one line at a time.
n_features = X_scaled.shape[1]
total_variance_pct = pca.explained_variance_ratio_.sum()*100
report_lines = [
    f"\nüìÅ Output files in: {PROC.absolute()}",
    "\n‚úÖ TASK 1.1: embeddings_cps.csv",
    f"   - High-dimensional embeddings with {n_features} features",
    "\n‚úÖ TASK 1.2: embeddings_cps_2d.csv",
    "   - 2D PCA projection (x, y coordinates)",
    "   - Method: PCA",
    f"   - Variance explained: {total_variance_pct:.2f}%",
    "\n‚úÖ ENHANCED: embeddings_cps_2d_enhanced.csv",
    "   - All 2D data plus outlier_score, cluster, behavior_score, etc.",
]
for report_line in report_lines:
    print(report_line)

# Per-level means of whichever summary metrics are available.
if 'level' in emb_2d.columns:
    print("\nüìä Summary by level:")
    summary_cols = [
        name for name in ('safety', 'misconduct', 'instr', 'behavior_score')
        if name in emb_2d.columns
    ]
    if summary_cols:
        level_means = emb_2d.groupby('level')[summary_cols].mean()
        print(level_means.round(1))

# Suggested follow-up commands; the spatial copy is listed only when the
# spatial input file exists.
print("\nüéØ Next: Copy enhanced files to web/data/ folder")
print(f"   cp {emb_2d_enh_path} web/data/")
if spatial_path.exists():
    print(f"   cp {spatial_enh_path} web/data/")

print("\n‚úÖ Pipeline complete!")

COMPLETE EMBEDDING PIPELINE - TASK 1.1 & 1.2

[1/8] Loading raw data...
‚úì Loaded: CPS_Data.csv
  Rows: 566, Columns: 79

TASK 1.1: CONSTRUCTING EMBEDDINGS

[2/8] School identifier: 'School ID' ‚Üí 'school'
‚úì Level column: 'Elementary, Middle, or High School' ‚Üí ['ES' 'HS' 'MS']

[3/8] Selecting numerical features...
‚úì Found 30 numerical features

[4/8] Creating embeddings...
‚úì Embedding shape: (566, 35)

[5/8] Normalizing embeddings...
‚úì Standardized to mean=0, std=1
  Shape: (566, 34)

‚úì TASK 1.1 OUTPUT: embeddings_cps.csv
  Rows: 566, Columns: 41

TASK 1.2: PROJECTING TO 2D WITH PCA

[6/8] Applying PCA...
‚úì PCA complete
  PC1 variance: 29.18%
  PC2 variance: 8.34%
  Total: 37.52%

‚úì TASK 1.2 OUTPUT: embeddings_cps_2d.csv
  Rows: 566, Columns: 9

ENHANCEMENTS

[7/8] Adding outlier scores...
‚úì Outlier scores calculated
Adding cluster labels...
‚úì Created 9 clusters
Calculating behavior score...
‚úì Behavior score: 0.0 to 100.0
‚úì Academic score added

‚úì ENHANCED 