In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('../Weather/merged_weather_tyre_data_2019.csv')

# Define features and target
X = data[['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'GP', 'Lap']]
y = data['Tyres']

# Handle missing values
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X[['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'Lap']])
# Convert imputed values to a DataFrame with the same columns and index
imputed_df = pd.DataFrame(
    imputed_values,
    columns=['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'Lap'],
    index=X.index
)

# Cast each column to its original dtype
for col in ['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'Lap']:
    imputed_df[col] = imputed_df[col].astype(X[col].dtype)

# Assign back to the DataFrame
X.loc[:, ['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'Lap']] = imputed_df



# Drop rows where 'Tyres' is missing
X = X.dropna()
y = y.loc[X.index]

# One-hot encode the categorical column 'GP'
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[['GP']]),
    columns=encoder.get_feature_names_out(['GP']),
    index=X.index  # Ensure index alignment
)
# Add one-hot encoded columns to X and drop the original 'GP' column
X = pd.concat([X.drop(columns=['GP']), X_encoded], axis=1)

# Standardize features
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X.iloc[:, :-len(X_encoded.columns)])
scaled_columns = pd.DataFrame(
    scaled_values,
    columns=X.columns[:-len(X_encoded.columns)],
    index=X.index
)
X.loc[:, scaled_columns.columns] = scaled_columns.astype(X[scaled_columns.columns].dtypes.to_dict())


# Reset indices to ensure alignment between X and y
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize classifiers with improvements
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced'),
    "SVM": SVC(kernel='rbf', class_weight='balanced')  # Using RBF kernel for SVM
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred, zero_division=1)}")
    print("-" * 50)

# Cross-validation for robustness with parallel processing
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy', n_jobs=-1)
    print(f"Cross-validation scores for {name}: {scores}")
    print(f"Mean CV Accuracy: {scores.mean()}")
    print("-" * 50)


KeyError: "None of [Index(['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'GP', 'Lap'], dtype='object')] are in the [columns]"