In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the data
data = pd.read_csv('UniversalBank.csv')

# Use a smaller subset of the data
data_subset = data.sample(frac=0.02, random_state=42)  # Reduced to 2%

# Drop the columns that do not contain useful information
X = data_subset.drop(columns=['ID', 'ZIP Code'])
y = data_subset['Income']

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a simpler pipeline without polynomial features
simple_model = Pipeline([
    ('scaler', StandardScaler()),  # Scaling features to improve performance
    ('svr', SVR(kernel='linear'))  # Using only linear kernel for simplicity
])

# Define the parameter grid for SVR
param_grid = {
    'svr__C': [0.1, 1, 10],  # Simplified range
    'svr__epsilon': [0.01, 0.1, 1]  # Simplified range
}

# Create a GridSearchCV object with reduced cross-validation folds
grid_search = GridSearchCV(simple_model, param_grid, cv=3, scoring='neg_mean_squared_error')  # Reduced cv to 3

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate the evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 4.5729407005903754e-05
R-squared: 0.9999999829297024
