In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('../Weather/merged_weather_tyre_data_2019.csv')

# Define features and target
X = data[['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'GP', 'Lap']]
y = data['Tyres']
 
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X[['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'Lap']] = imputer.fit_transform(
    X[['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'Lap']]
)

# Drop rows where 'Tyres' is missing
X = X.dropna()
y = y[X.index]

# One-hot encode the categorical column 'GP'
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[['GP']]),
    columns=encoder.get_feature_names_out(['GP']),
    index=X.index  # Ensure index alignment
)
# Add one-hot encoded columns to X and drop the original 'GP' column
X = pd.concat([X.drop(columns=['GP']), X_encoded], axis=1)

# Reset indices to ensure alignment between X and y
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize classifiers
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(kernel='rbf')  # Using RBF kernel for SVM
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    print("-" * 50)

# Cross-validation for robustness
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"Cross-validation scores for {name}: {scores}")
    print(f"Mean CV Accuracy: {scores.mean()}")
    print("-" * 50)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['AirTemp', 'Humidity', 'Rainfall', 'TrackTemp', 'Lap']] = imputer.fit_transform(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Logistic Regression
Accuracy: 0.47149927487044324


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                      precision    recall  f1-score   support

            Hard New       0.56      0.56      0.56    272176
           Hard Used       0.00      0.00      0.00     33557
    Intermediate New       0.59      0.41      0.49     57520
   Intermediate Used       0.00      0.00      0.00      3454
Lluvia Extrema Usado       0.00      0.00      0.00      3881
          Medium New       0.44      0.76      0.55    383143
         Medium Used       0.49      0.13      0.21    108748
            Soft New       0.44      0.05      0.09     96781
           Soft Used       0.42      0.29      0.34    159058
             Wet New       0.46      0.22      0.30     14583

            accuracy                           0.47   1132901
           macro avg       0.34      0.24      0.25   1132901
        weighted avg       0.46      0.47      0.43   1132901

--------------------------------------------------
Model: Decision Tree
Accuracy: 0.45868262098806517
Clas