In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the dataset
file_path = '/content/housePricing.csv'
house_data = pd.read_csv(file_path)

In [4]:
# Preprocessing: Remove columns with excessive missing values
missing_threshold = 0.5
house_data = house_data.drop(columns=house_data.columns[house_data.isnull().mean() > missing_threshold])

# Fill missing values: median for numerical and mode for categorical columns
numeric_cols = house_data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = house_data.select_dtypes(include=['object']).columns
house_data[numeric_cols] = house_data[numeric_cols].fillna(house_data[numeric_cols].median())
house_data[categorical_cols] = house_data[categorical_cols].fillna(house_data[categorical_cols].mode().iloc[0])

In [5]:
# Encoding categorical variables
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_categorical_data = encoder.fit_transform(house_data[categorical_cols])
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_cols))

# Merge encoded categorical data with numerical data
house_data_final = house_data[numeric_cols].join(encoded_categorical_df)
house_data_final['Log_SalePrice'] = np.log1p(house_data_final['SalePrice'])



In [6]:
# Separate features and target
X = house_data_final.drop('Log_SalePrice', axis=1)
y = house_data_final['Log_SalePrice']

# Split data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [7]:
# Create and train the SVM regressor
svm_regressor = SVR(kernel='rbf')
param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
grid_search = GridSearchCV(svm_regressor, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

In [8]:
# Evaluate the model on training, validation, and testing sets
y_train_pred = grid_search.predict(X_train)
y_valid_pred = grid_search.predict(X_valid)
y_test_pred = grid_search.predict(X_test)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_valid = mean_squared_error(y_valid, y_valid_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_valid = r2_score(y_valid, y_valid_pred)
r2_test = r2_score(y_test, y_test_pred)

In [9]:
# Results
results = {
    'Training MSE': mse_train,
    'Training R2': r2_train,
    'Validation MSE': mse_valid,
    'Validation R2': r2_valid,
    'Test MSE': mse_test,
    'Test R2': r2_test,
    'Best Parameters': grid_search.best_params_
}
results


{'Training MSE': 0.004803159375570645,
 'Training R2': 0.9682412189967154,
 'Validation MSE': 0.008466008266244237,
 'Validation R2': 0.9572146781595078,
 'Test MSE': 0.004551161787997899,
 'Test R2': 0.9681097468537943,
 'Best Parameters': {'C': 10, 'gamma': 'scale'}}