# 🚀 Simple ML Template - Clean Data Fast

## Purpose
Quick template to get from raw data to clean, model-ready data using our preprocessing classes.

**Steps:**
1. Configure preprocessing parameters
2. Run ProductionCarMatcher + DataPreprocessor  
3. Get clean X_train, X_test, y_train, y_test
4. Start modeling!

In [1]:
import pandas as pd
import numpy as np
import os
import json
import kagglehub
import warnings
warnings.filterwarnings('ignore')

# Load preprocessing classes
os.chdir('/Users/leonardodicaterina/Documents/GitHub/ML_group_45')
from utils.preprocessing.CarDatabase import ProductionCarMatcher, create_optimized_database
from utils.preprocessing.Preprocessor_divided import DataPreprocessor

print(" SIMPLE ML TEMPLATE - FAST CLEAN DATA")
print("=" * 45)

# CONFIGURATION - Change these as needed
CONFIG = {
    'random_state': 42,
    'target_column': 'price',
    
    # Preprocessing options
    'outlier_removal': True,          # Remove outliers?
    'feature_scaling': True,          # Scale features?
    'encoding_method': 'mean',        # 'mean', 'onehot'
    'missing_strategy': 'smart',      # 'smart', 'median', 'mean'
    
    # Feature transformations
    'log_transform_mileage': True,    # Log transform mileage?
    'convert_year_to_age': True,      # Convert year to car age?
    'normalize_paint': True,          # Normalize paint quality to 0-1?
    'normalize_engine': True,         # Convert engine size units?
}

print(f" Configuration: {CONFIG}")

  from .autonotebook import tqdm as notebook_tqdm


 SIMPLE ML TEMPLATE - FAST CLEAN DATA
 Configuration: {'random_state': 42, 'target_column': 'price', 'outlier_removal': True, 'feature_scaling': True, 'encoding_method': 'mean', 'missing_strategy': 'smart', 'log_transform_mileage': True, 'convert_year_to_age': True, 'normalize_paint': True, 'normalize_engine': True}


In [2]:
import pandas as pd
import numpy as np
import os
import json
import kagglehub
import warnings
warnings.filterwarnings('ignore')

# Load preprocessing classes
os.chdir('/Users/leonardodicaterina/Documents/GitHub/ML_group_45')
from utils.preprocessing.CarDatabase import ProductionCarMatcher, create_optimized_database
from utils.preprocessing.Preprocessor_divided import DataPreprocessor

print("🚀 SIMPLE ML TEMPLATE - FAST CLEAN DATA")
print("=" * 45)

# CONFIGURATION - Change these as needed
CONFIG = {
    'random_state': 42,
    'target_column': 'price',
    
    # Preprocessing options
    'outlier_removal': True,          # Remove outliers?
    'feature_scaling': True,          # Scale features?
    'encoding_method': 'mean',        # 'mean', 'onehot'
    'missing_strategy': 'smart',      # 'smart', 'median', 'mean'
    
    # Feature transformations
    'log_transform_mileage': True,    # Log transform mileage?
    'convert_year_to_age': True,      # Convert year to car age?
    'normalize_paint': True,          # Normalize paint quality to 0-1?
    'normalize_engine': True,         # Convert engine size units?
}

print(f"📋 Configuration: {CONFIG}")

🚀 SIMPLE ML TEMPLATE - FAST CLEAN DATA
📋 Configuration: {'random_state': 42, 'target_column': 'price', 'outlier_removal': True, 'feature_scaling': True, 'encoding_method': 'mean', 'missing_strategy': 'smart', 'log_transform_mileage': True, 'convert_year_to_age': True, 'normalize_paint': True, 'normalize_engine': True}


In [3]:
def load_and_preprocess_data(config=CONFIG):
    """
    Complete pipeline: raw data → clean data ready for ML
    """
    
    print("\n🔄 STEP 1: Loading raw data...")
    # Load datasets
    train_data = pd.read_csv('Data/train.csv')
    test_data = pd.read_csv('Data/test.csv')
    print(f"   Train: {train_data.shape}, Test: {test_data.shape}")
    
    print("\n🔄 STEP 2: Brand standardization...")
    # Download car database
    try:
        path = kagglehub.dataset_download("bourzamraid/global-car-make-and-model-list")
        with open(os.path.join(path, 'vehicle models.json'), 'r') as f:
            kaggle_data = json.load(f)
        
        # Create matcher and clean brands
        optimized_db = create_optimized_database(kaggle_data, max_model_words=3)
        matcher = ProductionCarMatcher(optimized_db)
        
        # Clean brands
        train_data['Brand_cleaned'] = [matcher.find_best_make_match(brand)[0] for brand in train_data['Brand']]
        test_data['Brand_cleaned'] = [matcher.find_best_make_match(brand)[0] for brand in test_data['Brand']]
        
        print(f"   ✅ Brands: {train_data['Brand'].nunique()} → {train_data['Brand_cleaned'].nunique()}")
    except:
        print("   ⚠️ Using original brands (database download failed)")
        train_data['Brand_cleaned'] = train_data['Brand']
        test_data['Brand_cleaned'] = test_data['Brand']
    
    print("\n🔄 STEP 3: Feature preprocessing...")
    
    # Create preprocessor
    preprocessor = DataPreprocessor(target_column=config['target_column'])
    
    # Define transformations
    def log_transform(x):
        return np.log1p(np.abs(x)) if config['log_transform_mileage'] else x
    
    def year_to_age(year_series):
        return 2025 - year_series if config['convert_year_to_age'] else year_series
    
    def normalize_paint(paint_series):
        return np.clip(paint_series, 0, 100) / 100 if config['normalize_paint'] else paint_series
    
    def normalize_engine(engine_series):
        if config['normalize_engine']:
            result = engine_series.copy()
            small_engines = engine_series <= 10
            result[small_engines] = engine_series[small_engines] * 1000
            return result
        return engine_series
    
    # Configure features
    feature_configs = {
        'mileage': {
            'missing_strategy': 'median',
            'outlier_method': 'iqr' if config['outlier_removal'] else None,
            'transform_func': log_transform if config['log_transform_mileage'] else None,
            'scaling_method': 'standard' if config['feature_scaling'] else None
        },
        'year': {
            'missing_strategy': 'mean',
            'transform_func': year_to_age if config['convert_year_to_age'] else None,
            'scaling_method': 'minmax' if config['feature_scaling'] else None
        },
        'engineSize': {
            'missing_strategy': 1600,
            'transform_func': normalize_engine if config['normalize_engine'] else None,
            'scaling_method': 'robust' if config['feature_scaling'] else None
        },
        'tax': {
            'missing_strategy': 'median',
            'scaling_method': 'standard' if config['feature_scaling'] else None
        },
        'mpg': {
            'missing_strategy': 'median',
            'outlier_method': 'iqr' if config['outlier_removal'] else None,
            'scaling_method': 'standard' if config['feature_scaling'] else None
        },
        'paintQuality%': {
            'missing_strategy': 'median',
            'transform_func': normalize_paint if config['normalize_paint'] else None,
            'scaling_method': 'minmax' if config['feature_scaling'] else None
        },
        'previousOwners': {
            'missing_strategy': 'median',
            'transform_func': lambda x: np.log1p(np.abs(np.floor(x))),
            'scaling_method': 'standard' if config['feature_scaling'] else None
        },
        'hasDamage': {
            'missing_strategy': 1,
            'scaling_method': None
        },
        'Brand_cleaned': {
            'missing_strategy': 'mode',
            'encoding_method': config['encoding_method']
        },
        'transmission': {
            'missing_strategy': 'mode',
            'encoding_method': config['encoding_method']
        },
        'fuelType': {
            'missing_strategy': 'mode',
            'encoding_method': config['encoding_method']
        }
    }
    
    # Add features to preprocessor
    for feature, feature_config in feature_configs.items():
        if feature in train_data.columns:
            preprocessor.add_feature_pipeline(feature, **feature_config)
    
    # Fit and transform
    preprocessor.fit(train_data)
    X_train, y_train = preprocessor.transform(train_data)
    X_test, y_test = preprocessor.transform(test_data)
    
    print(f"   ✅ Features: {len(feature_configs)} → {X_train.shape[1]}")
    print(f"   ✅ Missing values: {X_train.isnull().sum().sum()}")
    
    return X_train, X_test, y_train, y_test, preprocessor

# Run preprocessing
X_train, X_test, y_train, y_test, preprocessor = load_and_preprocess_data()

print(f"\n✅ CLEAN DATA READY!")
print(f"   X_train: {X_train.shape}")
print(f"   X_test: {X_test.shape}")
print(f"   Features: {list(X_train.columns)}")
print(f"\n🚀 Ready for modeling!")


🔄 STEP 1: Loading raw data...
   Train: (75973, 14), Test: (32567, 13)

🔄 STEP 2: Brand standardization...
Analyzing word frequencies...
Created optimized database with 146 makes
   ✅ Brands: 72 → 12

🔄 STEP 3: Feature preprocessing...
✓ Fitting pipeline for 'mileage'
✓ Fitting pipeline for 'year'
✓ Fitting pipeline for 'engineSize'
✓ Fitting pipeline for 'tax'
✓ Fitting pipeline for 'mpg'
✓ Fitting pipeline for 'paintQuality%'
✓ Fitting pipeline for 'previousOwners'
✓ Fitting pipeline for 'hasDamage'
✓ Fitting pipeline for 'Brand_cleaned'
✓ Fitting pipeline for 'transmission'
✓ Fitting pipeline for 'fuelType'
✓ Transforming 'mileage'
✓ Transforming 'year'
✓ Transforming 'engineSize'
✓ Transforming 'tax'
✓ Transforming 'mpg'
✓ Transforming 'paintQuality%'
✓ Transforming 'previousOwners'
✓ Transforming 'hasDamage'
✓ Transforming 'Brand_cleaned'
✓ Transforming 'transmission'
✓ Transforming 'fuelType'
✓ Transforming 'mileage'
✓ Transforming 'year'
✓ Transforming 'engineSize'
✓ Transformi

In [5]:
# Quick data validation
print("🔍 QUICK VALIDATION")
print("=" * 20)

print(f"✅ Data shapes match: {X_train.shape[1] == X_test.shape[1]}")
print(f"✅ No missing values: {X_train.isnull().sum().sum() == 0}")
print(f"✅ Target is numeric: {y_train.dtype in ['int64', 'float64']}")
print(f"✅ Features are numeric: {X_train.select_dtypes(include=[np.number]).shape[1] == X_train.shape[1]}")

# Quick stats
print(f"\n📊 Quick Stats:")
print(f"   Samples: {len(X_train):,}")
print(f"   Features: {X_train.shape[1]}")
print(f"   Target range: [{y_train.min():.0f}, {y_train.max():.0f}]")


🔍 QUICK VALIDATION
✅ Data shapes match: True
✅ No missing values: False
✅ Target is numeric: True
✅ Features are numeric: False

📊 Quick Stats:
   Samples: 75,973
   Features: 14
   Target range: [450, 159999]


## 🎯 Data is Clean - Start Modeling!

Your data is now clean and ready. Here's what you have:

- **X_train, X_test**: Feature matrices (scaled, encoded, no missing values)
- **y_train, y_test**: Target vectors  
- **preprocessor**: Fitted preprocessor (for new data)

**Next steps:** Add your ML models below!

In [10]:
# EXAMPLE: Quick model test
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# split train and valdation
X_train_, X_val_ = X_train.iloc[:int(0.8*len(X_train))], X_train.iloc[int(0.8*len(X_train)):]
y_train_ , y_val_ = y_train.iloc[:int(0.8*len(y_train))], y_train.iloc[int(0.8*len(y_train)):]
X_train_ = X_train_[numerical_features]
X_val_ = X_val_[numerical_features]
# Quick model test
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_, y_train_)

predictions = model.predict(X_val_)
rmse = np.sqrt(mean_squared_error(y_val_, predictions))
r2 = r2_score(y_val_, predictions)

print(f"🔥 QUICK MODEL TEST:")
print(f"   Random Forest RMSE: {rmse:.0f}")
print(f"   Random Forest R²: {r2:.3f}")
print(f"\n👆 Replace this with your models!")

🔥 QUICK MODEL TEST:
   Random Forest RMSE: 2832
   Random Forest R²: 0.917

👆 Replace this with your models!


In [9]:
predictions 


array([19393.22, 22720.85, 13948.53, ..., 34663.54, 19656.24, 14220.67])