In [13]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Read the raw table from the CSV file (path is relative to the working directory).
file_path = 'Infrared.csv'
data = pd.read_csv(file_path)

# Separate the regression target from the predictors:
# y is the 'aveOralM' column, X is every other column of the table.
y = data['aveOralM']
X = data.drop(columns='aveOralM')

# Preprocessing pipeline for categorical and numerical features.
#
# Columns are picked by dtype *family* rather than by exact dtype. Matching
# only 'float64'/'int64'/'object' would leave e.g. an int32, float32,
# 'category' or string-dtype column out of both lists, and ColumnTransformer
# discards every column it was not told about (remainder='drop' is the
# default), so such a column would silently disappear from the model.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category', 'string']).columns

# Numerical columns: fill gaps with the column median, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('scaler', StandardScaler()),  # Standardize features
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from raising when it
# meets a category that was not present in the data the encoder was fitted on.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One-hot encode categorical variables
])

# Combine preprocessing steps: each transformer is applied to its own column list.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# One seed drives both the train/test split and the decision tree, so the
# whole run is reproducible and the value only has to be changed in one place.
RANDOM_STATE = 42

# Split data into train and test sets (20 % of the rows are held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Define models for K-NN and Decision Tree.
#
# Pipeline.fit() fits its steps in place without copying them, so passing the
# very same `preprocessor` object to both pipelines would make them share one
# fitted transformer: refitting either pipeline would silently change the
# preprocessing the other one predicts with. Each pipeline therefore gets its
# own unfitted clone. `preprocessor` itself stays an unfitted template; the
# fitted copies are reachable via `<pipeline>.named_steps['preprocessor']`.
knn_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
])

dt_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('dt', DecisionTreeRegressor(random_state=RANDOM_STATE)),
])

# Fit the K-NN pipeline on the training split, then predict the held-out rows.
# Pipeline.fit() returns the pipeline itself, which is what allows the chaining.
knn_predictions = knn_model.fit(X_train, y_train).predict(X_test)

# Same two steps, in the same order, for the Decision Tree pipeline.
dt_predictions = dt_model.fit(X_train, y_train).predict(X_test)

# Evaluate models using MSE, RMSE, and R2
def evaluate_model(y_true, y_pred) -> tuple:
    """Score a set of regression predictions against the true values.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A 3-tuple ``(mse, rmse, r2)``: the mean squared error, its square
        root, and the R² score, in that order.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    # RMSE is derived from the MSE computed above instead of being recomputed.
    return squared_error, np.sqrt(squared_error), r2_score(y_true, y_pred)

# Score both sets of held-out predictions (K-NN first, then Decision Tree).
# Each result is the (MSE, RMSE, R²) tuple returned by evaluate_model.
knn_metrics, dt_metrics = (
    evaluate_model(y_test, predictions)
    for predictions in (knn_predictions, dt_predictions)
)

# Report the three metrics for each model, formatted to four decimal places.
print(f"K-NN Metrics: MSE = {knn_metrics[0]:.4f}, RMSE = {knn_metrics[1]:.4f}, R² = {knn_metrics[2]:.4f}")
print(f"Decision Tree Metrics: MSE = {dt_metrics[0]:.4f}, RMSE = {dt_metrics[1]:.4f}, R² = {dt_metrics[2]:.4f}")


K-NN Metrics: MSE = 0.0648, RMSE = 0.2545, R² = 0.6924
Decision Tree Metrics: MSE = 0.1120, RMSE = 0.3347, R² = 0.4681
