In [None]:
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Load the dataset
data = pd.read_csv('/kaggle/input/cs770-assignment-2b/train_data_insurance.csv')

# One-hot encode the categorical columns; drop_first=True keeps k-1 dummy
# columns per category instead of k.
data = pd.get_dummies(data, columns=['sex', 'smoker', 'region'], drop_first=True)

# Number of trailing rows held out for validation.
# NOTE(review): intended to be ~20% of the data -- confirm against the actual
# row count of the CSV.
validation_size = 268
if not 0 < validation_size < len(data):
    raise ValueError(
        f"validation_size={validation_size} must be between 1 and "
        f"{len(data) - 1} for a dataset of {len(data)} rows"
    )
train_size = len(data) - validation_size

# Standardize the numeric features. The scaler is fitted on the training rows
# only, so the validation rows' mean/variance do not leak into the features
# the models are trained on; the fitted transform is then applied to all rows.
numeric_columns = ['age', 'bmi', 'children']
scaler = StandardScaler()
scaler.fit(data[numeric_columns].iloc[:train_size])
data[numeric_columns] = scaler.transform(data[numeric_columns])

# Split the data into features and target
X = data.drop('charges', axis=1)
y = data['charges']

# Split into training and validation sets without randomization: the first
# train_size rows are used for training, the remaining rows for validation.
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:], y.iloc[train_size:]

# Define SVM models: one support-vector regressor per kernel
svm_models = {
    'Linear': SVR(kernel='linear'),
    'Polynomial': SVR(kernel='poly'),
    'RBF': SVR(kernel='rbf')
}

# Hyperparameter grid searched for each kernel. The 'regressor__' prefix
# routes each parameter to the SVR wrapped inside TransformedTargetRegressor.
param_grids = {
    'Linear': {'regressor__C': [0.1, 1, 10]},
    'Polynomial': {'regressor__C': [0.1, 1, 10], 'regressor__degree': [2, 3, 4]},
    'RBF': {'regressor__C': [0.1, 1, 10], 'regressor__gamma': ['scale', 'auto']},
}

best_models = {}  # Store the best model for each kernel
best_mses = {}  # Store the cross-validated MSE for the best model of each kernel

# Perform Grid Search for hyperparameters for each model
for kernel, model in svm_models.items():
    # SVR's C and epsilon are sensitive to the scale of the target. Training
    # directly on the raw 'charges' values with C <= 10 leaves the model
    # close to a constant predictor, so the target is standardized for
    # fitting. TransformedTargetRegressor inverse-transforms predictions,
    # which keeps predictions and the MSE below in the original target units.
    scaled_target_model = TransformedTargetRegressor(
        regressor=model,
        transformer=StandardScaler(),
    )

    grid_search = GridSearchCV(
        scaled_target_model,
        param_grids[kernel],
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid_search.fit(X_train, y_train)

    best_models[kernel] = grid_search.best_estimator_
    # The scorer returns negated MSE; flip the sign to store a plain MSE.
    best_mses[kernel] = -grid_search.best_score_

# Predict on the validation set with the tuned model of each kernel, one
# '<kernel>_predictions' column per model. The models are not fitted again
# here: GridSearchCV (default refit=True) has already refitted each
# best_estimator_ on the full training set.
all_predictions = pd.DataFrame({
    f'{kernel}_predictions': model.predict(X_val)
    for kernel, model in best_models.items()
})

# Evaluate model performance and print the evaluation metrics
for kernel in best_models:
    kernel_predictions = all_predictions[f'{kernel}_predictions']
    mse = mean_squared_error(y_val, kernel_predictions)
    r2 = r2_score(y_val, kernel_predictions)
    print(f"{kernel} Kernel:")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}\n")

# Add a 1-based 'Index' column (1 .. number of validation rows) as the
# first column of the output.
all_predictions.insert(0, 'Index', range(1, len(all_predictions) + 1))

# Save all predictions to a CSV file
all_predictions.to_csv('validation_predictions.csv', index=False)
Linear Kernel:
Mean Squared Error: 159370938.91931975
R-squared: -0.020917013888538394

Polynomial Kernel:
Mean Squared Error: 168844086.8912087
R-squared: -0.08160121393884445

RBF Kernel:
Mean Squared Error: 167374094.7707974
R-squared: -0.07218456636064507
