In [1]:
# Feature Engineering for Air Quality Prediction
# This notebook creates advanced features for time-series forecasting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_regression
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
%matplotlib inline

print("=" * 70)
print("FEATURE ENGINEERING - AIR QUALITY PREDICTION")
print("=" * 70)

FEATURE ENGINEERING - AIR QUALITY PREDICTION


In [2]:
# =============================================================================
# 1. LOAD PROCESSED DATA
# =============================================================================

print("\nüìÇ LOADING DATA...")
print("-" * 70)

# Load corrected data with temporal features
data_path = Path('../data/processed/air_quality_with_features.csv')

if not data_path.exists():
    # Fallback to corrected historical data
    data_path = Path('../data/processed/corrected_air_quality_historical_20251129.csv')

df = pd.read_csv(data_path)
df['timestamp'] = pd.to_datetime(df['timestamp'])

print(f"‚úì Loaded {len(df):,} records")
print(f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"  Columns: {len(df.columns)}")

# Sort by city and timestamp for proper lag calculations
df = df.sort_values(['city_name', 'timestamp']).reset_index(drop=True)

print("‚úì Data sorted by city and timestamp")


üìÇ LOADING DATA...
----------------------------------------------------------------------
‚úì Loaded 22,556 records
  Date range: 2025-08-31 18:00:00 to 2025-12-03 16:00:00
  Columns: 31
‚úì Data sorted by city and timestamp


In [3]:
# =============================================================================
# 2. CREATE LAG FEATURES (Historical Values)
# =============================================================================

print("\n" + "=" * 70)
print("üîô CREATING LAG FEATURES")
print("=" * 70)

# Define lag periods (in hours)
lag_hours = [1, 3, 6, 12, 24, 48, 72, 168]  # 1h to 1 week

# Features to create lags for
lag_features = ['aqi', 'pm25', 'pm10', 'temperature', 'humidity', 
                'pressure', 'wind_speed']

lag_features_available = [f for f in lag_features if f in df.columns]

print(f"Creating lag features for: {lag_features_available}")
print(f"Lag periods: {lag_hours} hours")

# Create lag features by city (to avoid cross-city contamination)
for feature in lag_features_available:
    for lag in lag_hours:
        col_name = f'{feature}_lag_{lag}h'
        df[col_name] = df.groupby('city_name')[feature].shift(lag)
        
print(f"\n‚úì Created {len(lag_features_available) * len(lag_hours)} lag features")

# Check how many records have complete lag data
complete_mask = df[[f'{f}_lag_{lag_hours[-1]}h' for f in lag_features_available]].notna().all(axis=1)
print(f"‚úì Records with complete lag data: {complete_mask.sum():,} ({complete_mask.sum()/len(df)*100:.1f}%)")



üîô CREATING LAG FEATURES
Creating lag features for: ['aqi', 'pm25', 'pm10', 'temperature', 'humidity', 'pressure', 'wind_speed']
Lag periods: [1, 3, 6, 12, 24, 48, 72, 168] hours

‚úì Created 56 lag features
‚úì Records with complete lag data: 20,876 (92.6%)


In [4]:
# =============================================================================
# 3. CREATE ROLLING STATISTICS
# =============================================================================

print("\n" + "=" * 70)
print("üìä CREATING ROLLING STATISTICS")
print("=" * 70)

# Define rolling windows (in hours)
windows = [3, 6, 12, 24, 48, 72]

# Statistics to calculate
stats = {
    'mean': 'mean',
    'std': 'std',
    'min': 'min',
    'max': 'max'
}

# Features for rolling stats
rolling_features = ['aqi', 'pm25', 'pm10', 'temperature', 'humidity']
rolling_features_available = [f for f in rolling_features if f in df.columns]

print(f"Creating rolling statistics for: {rolling_features_available}")
print(f"Windows: {windows} hours")
print(f"Statistics: {list(stats.keys())}")

feature_count = 0
for feature in rolling_features_available:
    for window in windows:
        for stat_name, stat_func in stats.items():
            col_name = f'{feature}_rolling_{window}h_{stat_name}'
            df[col_name] = df.groupby('city_name')[feature].transform(
                lambda x: x.rolling(window=window, min_periods=1).agg(stat_func)
            )
            feature_count += 1

print(f"\n‚úì Created {feature_count} rolling statistics features")


üìä CREATING ROLLING STATISTICS
Creating rolling statistics for: ['aqi', 'pm25', 'pm10', 'temperature', 'humidity']
Windows: [3, 6, 12, 24, 48, 72] hours
Statistics: ['mean', 'std', 'min', 'max']

‚úì Created 120 rolling statistics features


In [5]:
# =============================================================================
# 4. CREATE RATE OF CHANGE FEATURES
# =============================================================================

print("\n" + "=" * 70)
print("üìà CREATING RATE OF CHANGE FEATURES")
print("=" * 70)

# Time periods for rate of change
change_periods = [1, 3, 6, 12, 24]

change_features = ['aqi', 'pm25', 'temperature', 'pressure']
change_features_available = [f for f in change_features if f in df.columns]

print(f"Creating rate of change for: {change_features_available}")
print(f"Periods: {change_periods} hours")

for feature in change_features_available:
    for period in change_periods:
        # Absolute change
        col_name = f'{feature}_change_{period}h'
        df[col_name] = df.groupby('city_name')[feature].diff(period)
        
        # Percentage change
        col_name_pct = f'{feature}_pct_change_{period}h'
        df[col_name_pct] = df.groupby('city_name')[feature].pct_change(period) * 100

print(f"\n‚úì Created {len(change_features_available) * len(change_periods) * 2} rate of change features")



üìà CREATING RATE OF CHANGE FEATURES
Creating rate of change for: ['aqi', 'pm25', 'temperature', 'pressure']
Periods: [1, 3, 6, 12, 24] hours

‚úì Created 40 rate of change features


In [6]:
# =============================================================================
# 5. CREATE TEMPORAL FEATURES (CYCLICAL ENCODING)
# =============================================================================

print("\n" + "=" * 70)
print("‚è∞ CREATING CYCLICAL TEMPORAL FEATURES")
print("=" * 70)

# Extract temporal components if not already present
if 'hour' not in df.columns:
    df['hour'] = df['timestamp'].dt.hour
if 'day_of_week' not in df.columns:
    df['day_of_week'] = df['timestamp'].dt.dayofweek
if 'month' not in df.columns:
    df['month'] = df['timestamp'].dt.month
if 'day_of_year' not in df.columns:
    df['day_of_year'] = df['timestamp'].dt.dayofyear

# Cyclical encoding for hour (0-23)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Cyclical encoding for day of week (0-6)
df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Cyclical encoding for month (1-12)
df['month_sin'] = np.sin(2 * np.pi * (df['month'] - 1) / 12)
df['month_cos'] = np.cos(2 * np.pi * (df['month'] - 1) / 12)

# Cyclical encoding for day of year (1-365)
df['day_of_year_sin'] = np.sin(2 * np.pi * (df['day_of_year'] - 1) / 365)
df['day_of_year_cos'] = np.cos(2 * np.pi * (df['day_of_year'] - 1) / 365)

print("‚úì Created cyclical encodings:")
print("  - Hour (sin/cos)")
print("  - Day of week (sin/cos)")
print("  - Month (sin/cos)")
print("  - Day of year (sin/cos)")

# Additional temporal features
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
df['is_rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19, 20]).astype(int)
df['is_night'] = df['hour'].isin(range(0, 6)).astype(int)
df['is_peak_pollution'] = df['hour'].isin([19, 20, 21]).astype(int)

print("‚úì Created categorical temporal features:")
print("  - is_weekend, is_rush_hour, is_night, is_peak_pollution")


‚è∞ CREATING CYCLICAL TEMPORAL FEATURES
‚úì Created cyclical encodings:
  - Hour (sin/cos)
  - Day of week (sin/cos)
  - Month (sin/cos)
  - Day of year (sin/cos)
‚úì Created categorical temporal features:
  - is_weekend, is_rush_hour, is_night, is_peak_pollution


In [7]:
# =============================================================================
# 6. CREATE WEATHER INTERACTION FEATURES
# =============================================================================

print("\n" + "=" * 70)
print("üå§Ô∏è CREATING WEATHER INTERACTION FEATURES")
print("=" * 70)

weather_features = ['temperature', 'humidity', 'pressure', 'wind_speed']
available_weather = [f for f in weather_features if f in df.columns and df[f].notna().sum() > 1000]

if len(available_weather) >= 2:
    # Temperature √ó Humidity
    if 'temperature' in available_weather and 'humidity' in available_weather:
        df['temp_humidity_interaction'] = df['temperature'] * df['humidity']
        print("‚úì Created: temp_humidity_interaction")
    
    # Temperature squared (non-linear effects)
    if 'temperature' in available_weather:
        df['temperature_squared'] = df['temperature'] ** 2
        print("‚úì Created: temperature_squared")
    
    # Humidity squared
    if 'humidity' in available_weather:
        df['humidity_squared'] = df['humidity'] ** 2
        print("‚úì Created: humidity_squared")
    
    # Wind chill approximation
    if 'temperature' in available_weather and 'wind_speed' in available_weather:
        df['wind_chill'] = df['temperature'] - (df['wind_speed'] * 0.5)
        print("‚úì Created: wind_chill")
    
    # Pressure change (indicates weather fronts)
    if 'pressure' in available_weather:
        df['pressure_change_3h'] = df.groupby('city_name')['pressure'].diff(3)
        print("‚úì Created: pressure_change_3h")
    
    # Comfort index (temperature + humidity)
    if 'temperature' in available_weather and 'humidity' in available_weather:
        df['comfort_index'] = df['temperature'] + (0.4 * df['humidity'])
        print("‚úì Created: comfort_index")

print(f"\n‚úì Created weather interaction features")


üå§Ô∏è CREATING WEATHER INTERACTION FEATURES
‚úì Created: temp_humidity_interaction
‚úì Created: temperature_squared
‚úì Created: humidity_squared
‚úì Created: wind_chill
‚úì Created: pressure_change_3h
‚úì Created: comfort_index

‚úì Created weather interaction features


In [8]:
# =============================================================================
# 7. CREATE POLLUTANT RATIOS
# =============================================================================

print("\n" + "=" * 70)
print("üè≠ CREATING POLLUTANT RATIO FEATURES")
print("=" * 70)

# PM2.5 to PM10 ratio (indicator of fine vs coarse particles)
if 'pm25' in df.columns and 'pm10' in df.columns:
    df['pm25_pm10_ratio'] = df['pm25'] / (df['pm10'] + 1e-6)  # Avoid division by zero
    print("‚úì Created: pm25_pm10_ratio")

# NO2 to O3 ratio (indicator of photochemical activity)
if 'no2' in df.columns and 'o3' in df.columns:
    df['no2_o3_ratio'] = df['no2'] / (df['o3'] + 1e-6)
    print("‚úì Created: no2_o3_ratio")

# Total particulate matter
if 'pm25' in df.columns and 'pm10' in df.columns:
    df['total_pm'] = df['pm25'] + df['pm10']
    print("‚úì Created: total_pm")


üè≠ CREATING POLLUTANT RATIO FEATURES
‚úì Created: pm25_pm10_ratio
‚úì Created: no2_o3_ratio
‚úì Created: total_pm


In [9]:
# =============================================================================
# 8. ENCODE CATEGORICAL FEATURES
# =============================================================================

print("\n" + "=" * 70)
print("üè∑Ô∏è ENCODING CATEGORICAL FEATURES")
print("=" * 70)

# Label encode city names
if 'city_name' in df.columns:
    le_city = LabelEncoder()
    df['city_encoded'] = le_city.fit_transform(df['city_name'])
    print(f"‚úì Encoded city_name: {len(le_city.classes_)} unique cities")
    
    # Save mapping
    city_mapping = dict(zip(le_city.classes_, le_city.transform(le_city.classes_)))
    print("  City encoding:", city_mapping)

# Label encode country
if 'country_code' in df.columns:
    le_country = LabelEncoder()
    df['country_encoded'] = le_country.fit_transform(df['country_code'])
    print(f"‚úì Encoded country_code: {len(le_country.classes_)} unique countries")

# One-hot encode AQI category (if exists)
if 'aqi_category' in df.columns:
    df_category_dummies = pd.get_dummies(df['aqi_category'], prefix='aqi_cat')
    df = pd.concat([df, df_category_dummies], axis=1)
    print(f"‚úì One-hot encoded aqi_category: {len(df_category_dummies.columns)} categories")


üè∑Ô∏è ENCODING CATEGORICAL FEATURES
‚úì Encoded city_name: 10 unique cities
  City encoding: {'Beijing': np.int64(0), 'Cairo': np.int64(1), 'Delhi': np.int64(2), 'London': np.int64(3), 'Los Angeles': np.int64(4), 'Mexico City': np.int64(5), 'Mumbai': np.int64(6), 'New York': np.int64(7), 'S√£o Paulo': np.int64(8), 'Tokyo': np.int64(9)}
‚úì Encoded country_code: 8 unique countries
‚úì One-hot encoded aqi_category: 6 categories


In [10]:
# =============================================================================
# 9. FEATURE SUMMARY
# =============================================================================

print("\n" + "=" * 70)
print("üìã FEATURE ENGINEERING SUMMARY")
print("=" * 70)

# Count features by type
feature_types = {
    'Original Features': len([c for c in df.columns if not any(x in c for x in ['lag', 'rolling', 'change', 'sin', 'cos', 'interaction', 'ratio', 'encoded', 'aqi_cat'])]),
    'Lag Features': len([c for c in df.columns if 'lag' in c]),
    'Rolling Statistics': len([c for c in df.columns if 'rolling' in c]),
    'Rate of Change': len([c for c in df.columns if 'change' in c]),
    'Cyclical Temporal': len([c for c in df.columns if 'sin' in c or 'cos' in c]),
    'Weather Interactions': len([c for c in df.columns if 'interaction' in c or 'squared' in c or 'chill' in c or 'comfort' in c]),
    'Pollutant Ratios': len([c for c in df.columns if 'ratio' in c or 'total_pm' in c]),
    'Encoded Features': len([c for c in df.columns if 'encoded' in c or 'aqi_cat_' in c])
}

print("\nFeature counts by type:")
for ftype, count in feature_types.items():
    print(f"  {ftype:.<30} {count:>4}")

print(f"\n{'Total columns:':<30} {len(df.columns):>4}")
print(f"{'Total records:':<30} {len(df):>7,}")

# Check for missing values
missing_summary = df.isnull().sum()
missing_pct = (missing_summary / len(df) * 100).round(2)
high_missing = missing_summary[missing_pct > 50]

if len(high_missing) > 0:
    print(f"\n‚ö†Ô∏è  Columns with >50% missing data: {len(high_missing)}")
    print("   (These will be dropped)")
else:
    print("\n‚úì No columns with excessive missing data")


üìã FEATURE ENGINEERING SUMMARY

Feature counts by type:
  Original Features.............   39
  Lag Features..................   56
  Rolling Statistics............  120
  Rate of Change................   40
  Cyclical Temporal.............    8
  Weather Interactions..........    5
  Pollutant Ratios..............    3
  Encoded Features..............    8

Total columns:                  275
Total records:                  22,556

‚ö†Ô∏è  Columns with >50% missing data: 4
   (These will be dropped)


In [11]:
# =============================================================================
# 10. VISUALIZE FEATURE IMPORTANCE (PRELIMINARY)
# =============================================================================

print("\n" + "=" * 70)
print("üìä PRELIMINARY FEATURE IMPORTANCE")
print("=" * 70)

# Select numeric features only
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

# Remove target and non-predictive features
exclude_features = ['aqi', 'timestamp', 'date', 'latitude', 'longitude', 
                   'hour', 'day_of_week', 'month', 'day_of_year']
feature_cols = [c for c in numeric_features if c not in exclude_features and not c.startswith('aqi_cat_')]

# Use complete cases for preliminary analysis
df_complete = df[feature_cols + ['aqi']].dropna()

print(f"Using {len(df_complete):,} complete records for importance analysis")
print(f"Analyzing {len(feature_cols)} features")

if len(df_complete) > 100 and len(feature_cols) > 0:
    # Sample if dataset is too large
    if len(df_complete) > 10000:
        df_sample = df_complete.sample(10000, random_state=42)
    else:
        df_sample = df_complete
    
    # Calculate mutual information
    X = df_sample[feature_cols]
    y = df_sample['aqi']
    
    mi_scores = mutual_info_regression(X, y, random_state=42)
    mi_scores = pd.Series(mi_scores, index=feature_cols).sort_values(ascending=False)
    
    # Plot top 20
    fig, ax = plt.subplots(figsize=(12, 8))
    mi_scores.head(20).sort_values().plot(kind='barh', ax=ax, color='steelblue')
    ax.set_title('Top 20 Features by Mutual Information', fontweight='bold', fontsize=14)
    ax.set_xlabel('Mutual Information Score')
    ax.set_ylabel('Feature')
    ax.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()
    
    print("\n‚úì Top 10 most important features:")
    for i, (feature, score) in enumerate(mi_scores.head(10).items(), 1):
        print(f"  {i:2d}. {feature:<40} {score:.4f}")


üìä PRELIMINARY FEATURE IMPORTANCE
Using 0 complete records for importance analysis
Analyzing 251 features


In [12]:
# =============================================================================
# 11. HANDLE MISSING VALUES
# =============================================================================

print("\n" + "=" * 70)
print("üîß HANDLING MISSING VALUES")
print("=" * 70)

# Drop columns with >80% missing
threshold = 0.8
missing_pct = df.isnull().sum() / len(df)
cols_to_drop = missing_pct[missing_pct > threshold].index.tolist()

if cols_to_drop:
    print(f"Dropping {len(cols_to_drop)} columns with >{threshold*100}% missing:")
    for col in cols_to_drop[:10]:  # Show first 10
        print(f"  - {col}")
    df = df.drop(columns=cols_to_drop)

# For lag features, forward fill within each city (up to 3 periods)
lag_cols = [c for c in df.columns if 'lag' in c or 'rolling' in c]
for col in lag_cols:
    df[col] = df.groupby('city_name')[col].fillna(method='ffill', limit=3)

print(f"\n‚úì Filled lag/rolling features within cities")

# Fill weather features with city-specific medians
weather_cols = [c for c in df.columns if any(w in c for w in ['temperature', 'humidity', 'pressure', 'wind'])]
for col in weather_cols:
    if col in df.columns and df[col].isnull().sum() > 0:
        df[col] = df.groupby('city_name')[col].transform(lambda x: x.fillna(x.median()))

print(f"‚úì Filled weather features with city medians")

# Check remaining missing
remaining_missing = df.isnull().sum().sum()
print(f"\nRemaining missing values: {remaining_missing:,} ({remaining_missing/(len(df)*len(df.columns))*100:.2f}%)")


üîß HANDLING MISSING VALUES
Dropping 4 columns with >80.0% missing:
  - location
  - city
  - country
  - state

‚úì Filled lag/rolling features within cities
‚úì Filled weather features with city medians

Remaining missing values: 12,040 (0.20%)


In [13]:
# =============================================================================
# 12. CREATE TRAIN/VALIDATION/TEST SPLITS
# =============================================================================

print("\n" + "=" * 70)
print("‚úÇÔ∏è CREATING TRAIN/VAL/TEST SPLITS")
print("=" * 70)

# Sort by timestamp
df = df.sort_values('timestamp').reset_index(drop=True)

# Use temporal split (important for time series)
train_size = int(0.7 * len(df))
val_size = int(0.15 * len(df))

df_train = df.iloc[:train_size].copy()
df_val = df.iloc[train_size:train_size+val_size].copy()
df_test = df.iloc[train_size+val_size:].copy()

print(f"Train set: {len(df_train):,} records ({len(df_train)/len(df)*100:.1f}%)")
print(f"  Date range: {df_train['timestamp'].min()} to {df_train['timestamp'].max()}")

print(f"\nValidation set: {len(df_val):,} records ({len(df_val)/len(df)*100:.1f}%)")
print(f"  Date range: {df_val['timestamp'].min()} to {df_val['timestamp'].max()}")

print(f"\nTest set: {len(df_test):,} records ({len(df_test)/len(df)*100:.1f}%)")
print(f"  Date range: {df_test['timestamp'].min()} to {df_test['timestamp'].max()}")


‚úÇÔ∏è CREATING TRAIN/VAL/TEST SPLITS
Train set: 15,789 records (70.0%)
  Date range: 2025-08-31 18:00:00 to 2025-11-05 15:00:00

Validation set: 3,383 records (15.0%)
  Date range: 2025-11-05 15:00:00 to 2025-11-19 17:00:00

Test set: 3,384 records (15.0%)
  Date range: 2025-11-19 17:00:00 to 2025-12-03 16:00:00


In [14]:
# =============================================================================
# 13. SAVE ENGINEERED FEATURES
# =============================================================================

print("\n" + "=" * 70)
print("üíæ SAVING ENGINEERED FEATURES")
print("=" * 70)

output_dir = Path('../data/processed')
output_dir.mkdir(exist_ok=True)

# Save full dataset
full_path = output_dir / 'features_engineered_full.csv'
df.to_csv(full_path, index=False)
print(f"‚úì Saved full dataset: {full_path}")

# Save train/val/test sets
train_path = output_dir / 'features_train.csv'
val_path = output_dir / 'features_val.csv'
test_path = output_dir / 'features_test.csv'

df_train.to_csv(train_path, index=False)
df_val.to_csv(val_path, index=False)
df_test.to_csv(test_path, index=False)

print(f"‚úì Saved train set: {train_path}")
print(f"‚úì Saved validation set: {val_path}")
print(f"‚úì Saved test set: {test_path}")

# Save feature list
feature_list = {
    'all_features': df.columns.tolist(),
    'numeric_features': df.select_dtypes(include=[np.number]).columns.tolist(),
    'categorical_features': df.select_dtypes(include=['object']).columns.tolist(),
    'lag_features': [c for c in df.columns if 'lag' in c],
    'rolling_features': [c for c in df.columns if 'rolling' in c],
    'temporal_features': [c for c in df.columns if any(t in c for t in ['sin', 'cos', 'hour', 'dow', 'month', 'weekend', 'rush', 'night'])]
}

import json
feature_list_path = output_dir / 'feature_list.json'
with open(feature_list_path, 'w') as f:
    json.dump(feature_list, f, indent=2)

print(f"‚úì Saved feature list: {feature_list_path}")

print("\n" + "=" * 70)
print("‚úÖ FEATURE ENGINEERING COMPLETE!")
print("=" * 70)

print("\nüìä Final Dataset Statistics:")
print(f"  Total records: {len(df):,}")
print(f"  Total features: {len(df.columns)}")
print(f"  Train records: {len(df_train):,}")
print(f"  Val records: {len(df_val):,}")
print(f"  Test records: {len(df_test):,}")
print(f"  Missing data: {df.isnull().sum().sum()/(len(df)*len(df.columns))*100:.2f}%")

print("\nüéØ Next Steps:")
print("1. Review feature importance analysis")
print("2. Proceed to model training (Phase 5)")
print("3. Use features_train.csv, features_val.csv, features_test.csv")


üíæ SAVING ENGINEERED FEATURES
‚úì Saved full dataset: ..\data\processed\features_engineered_full.csv
‚úì Saved train set: ..\data\processed\features_train.csv
‚úì Saved validation set: ..\data\processed\features_val.csv
‚úì Saved test set: ..\data\processed\features_test.csv
‚úì Saved feature list: ..\data\processed\feature_list.json

‚úÖ FEATURE ENGINEERING COMPLETE!

üìä Final Dataset Statistics:
  Total records: 22,556
  Total features: 271
  Train records: 15,789
  Val records: 3,383
  Test records: 3,384
  Missing data: 0.20%

üéØ Next Steps:
1. Review feature importance analysis
2. Proceed to model training (Phase 5)
3. Use features_train.csv, features_val.csv, features_test.csv
