<a href="https://colab.research.google.com/github/Ilian10Janopullo/Machine-Learning-Project/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from datetime import datetime

def preprocess_ev_data(
    file_path: str = '/content/Electric_Vehicle_Population_Data.csv',
    output_dir: str = 'processed_data',
) -> pd.DataFrame:
    """Clean, encode and scale the EV population CSV, then save the results.

    Steps, in order: strip column names, drop duplicate rows, convert
    'Electric Range' from miles to kilometres (renaming the column to
    'Electric Range (km)'), drop identifier/location columns, impute numeric
    columns with their median, add engineered features, one-hot encode the
    categorical columns, min-max scale the numeric predictors, and drop any
    rows that still contain NaN.

    Every column-specific step is skipped when its column is absent, so a
    CSV with a reduced schema is processed as far as possible instead of
    failing with a KeyError.

    Side effects:
        * Prints progress information to stdout.
        * Writes 'feature_correlation.png' to the current working directory,
          and 'feature_importance.png' too when the target column exists.
        * Writes 'processed_ev_data.csv' into ``output_dir`` (created if
          needed), plus 'train_data.csv' / 'test_data.csv' (80/20 split,
          random_state=42) when the target column exists.

    Args:
        file_path: Path of the raw CSV to load.
        output_dir: Directory that receives the processed CSV files.

    Returns:
        The fully processed DataFrame (the same data written to
        'processed_ev_data.csv').
    """
    target = 'Electric Range (km)'

    # Load data
    data = pd.read_csv(file_path)
    print("Initial shape:", data.shape)
    print("\nOriginal columns:", data.columns.tolist())

    # Clean column names
    data.columns = data.columns.str.strip()

    # Drop duplicates (done before the ID columns are removed, so rows that
    # differ only by identifier are still treated as distinct)
    data.drop_duplicates(inplace=True)
    print("\nAfter dropping duplicates:", data.shape)

    # Convert electric range from miles to kilometers (1 mile = 1.60934 km)
    if 'Electric Range' in data.columns:
        data['Electric Range'] = data['Electric Range'] * 1.60934
        data.rename(columns={'Electric Range': target}, inplace=True)

    # Drop irrelevant columns
    drop_cols = [
        'VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location',
        '2020 Census Tract', 'Postal Code', 'City', 'State',
        'Legislative District'  # Often not useful for prediction
    ]
    data.drop(columns=[col for col in drop_cols if col in data.columns], inplace=True)

    # Handle missing values: coerce to numeric (unparseable -> NaN), then
    # fill with the column median
    numeric_features = ['Base MSRP', 'Model Year']
    for feature in numeric_features:
        if feature in data.columns:
            data[feature] = pd.to_numeric(data[feature], errors='coerce')
            median_value = data[feature].median()
            data[feature] = data[feature].fillna(median_value)

    # For electric range, assume missing means 0 (for PHEVs)
    if target in data.columns:
        data[target] = data[target].fillna(0)

    # Feature engineering. Each feature is only derived when its source
    # column exists, matching the guards used everywhere else in this
    # function.
    if 'Model Year' in data.columns:
        # Age is relative to the year the code runs, so it changes from
        # one calendar year to the next.
        current_year = datetime.now().year
        data['Vehicle Age'] = current_year - data['Model Year']

    # Create luxury brand indicator
    if 'Make' in data.columns:
        luxury_brands = ['Tesla', 'BMW', 'Audi', 'Mercedes-Benz', 'Porsche', 'Jaguar', 'Lexus']
        data['Is Luxury'] = data['Make'].isin(luxury_brands).astype(int)

    # Create EV type binary indicator
    if 'Electric Vehicle Type' in data.columns:
        data['Is Full Electric'] = (data['Electric Vehicle Type'] == 'Battery Electric Vehicle (BEV)').astype(int)

    # Handle categorical variables
    categorical_features = [
        'Make', 'Model', 'County', 'Electric Utility',
        'Clean Alternative Fuel Vehicle (CAFV) Eligibility',
        'Electric Vehicle Type'
    ]

    # One-hot encode categoricals (better for linear models); drop_first
    # removes one level per feature to avoid perfectly collinear dummies
    data = pd.get_dummies(data, columns=[col for col in categorical_features if col in data.columns], drop_first=True)

    # Normalize only numerical features (excluding target).
    # NOTE(review): the scaler is fit on the full dataset before the
    # train/test split below, so test rows influence the scaling range.
    # Left as-is to keep the saved files unchanged; consider fitting on the
    # training split only.
    numerical_features = ['Model Year', 'Base MSRP', 'Vehicle Age']
    scaler = MinMaxScaler()
    for feature in numerical_features:
        if feature in data.columns:
            data[feature] = scaler.fit_transform(data[[feature]])

    # Check for remaining missing values
    print("\nMissing values after preprocessing:")
    print(data.isnull().sum())

    # Drop any remaining rows with NaN (should be very few if any)
    data.dropna(inplace=True)

    # Correlation analysis
    plt.figure(figsize=(15, 12))
    sns.heatmap(data.corr(), cmap='coolwarm', center=0)
    plt.title("Feature Correlation Matrix")
    plt.tight_layout()
    plt.savefig("feature_correlation.png")
    plt.close()

    # Split predictors/target once; both the mutual-information ranking and
    # the train/test export below use the same X and y.
    has_target = target in data.columns
    if has_target:
        X = data.drop(columns=[target])
        y = data[target]

        # Mutual information with target
        mi_scores = mutual_info_regression(X, y, random_state=42)
        mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
        mi_scores = mi_scores.sort_values(ascending=False)

        plt.figure(figsize=(10, 8))
        mi_scores.head(20).plot(kind='barh')
        plt.title("Top 20 Features by Mutual Information with Electric Range")
        plt.tight_layout()
        plt.savefig("feature_importance.png")
        plt.close()

    # Save processed data
    os.makedirs(output_dir, exist_ok=True)

    # Save full dataset
    output_path = os.path.join(output_dir, 'processed_ev_data.csv')
    data.to_csv(output_path, index=False)

    # Also save train/test splits
    if has_target:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        pd.concat([X_train, y_train], axis=1).to_csv(
            os.path.join(output_dir, 'train_data.csv'), index=False
        )
        pd.concat([X_test, y_test], axis=1).to_csv(
            os.path.join(output_dir, 'test_data.csv'), index=False
        )

    print(f"\n Preprocessing complete. Data saved to {output_dir}/")
    print("Final shape:", data.shape)
    return data

# Execute the preprocessing when this cell/module runs. The call reads the
# raw CSV, writes the processed CSVs and plots as side effects, and the
# returned DataFrame is kept in `processed_data` for use further down.
processed_data = preprocess_ev_data()

Initial shape: (113275, 17)

Original columns: ['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year', 'Make', 'Model', 'Electric Vehicle Type', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range', 'Base MSRP', 'Legislative District', 'DOL Vehicle ID', 'Vehicle Location', 'Electric Utility', '2020 Census Tract']

After dropping duplicates: (113275, 17)

Missing values after preprocessing:
Model Year                                                                                                        0
Electric Range (km)                                                                                               0
Base MSRP                                                                                                         0
Vehicle Age                                                                                                       0
Is Luxury                                                                                                      