In [None]:
Q1. You are developing an SVM regression model to predict house prices based on several characteristics,
such as location, square footage, number of bedrooms, etc. Which regression metric would be the best
to employ in this situation?

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Q1 demo: fit an RBF-kernel SVR on synthetic housing data and report MAE and RMSE.
# The data is synthetic and for demonstration only; replace it with your actual dataset.
np.random.seed(42)
num_samples = 200
X = pd.DataFrame({
    'location': np.random.rand(num_samples) * 100,  # Example feature (not used to build y)
    'square_footage': np.random.rand(num_samples) * 3000,
    'num_bedrooms': np.random.randint(1, 6, num_samples)
})
# Price = linear function of square footage and bedrooms plus Gaussian noise.
y = (X['square_footage'] * 300 + X['num_bedrooms'] * 20000 + np.random.randn(num_samples) * 5000).values

# 1. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 2. Standardise the features (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Standardise the target as well.
#    With an RBF kernel (values in (0, 1]) and C=1.0, an SVR prediction can differ from
#    the intercept by at most n_train * C (= 140 here), while prices span hundreds of
#    thousands. Without target scaling the model is effectively a constant predictor.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()

# 4. Create the SVR regressor and train it on the scaled training data
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # epsilon is now in units of target std
svr.fit(X_train_scaled, y_train_scaled)

# 5. Predict test-set prices and map them back to the original price scale
y_pred_scaled = svr.predict(X_test_scaled)
y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# 6. Evaluate the regressor using MAE and RMSE (both in price units).
#    RMSE is computed via np.sqrt because the `squared=False` argument of
#    mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')


In [None]:
Q2. You have built an SVM regression model and are trying to decide between using MSE or R-squared as
your evaluation metric. Which metric would be more appropriate if your goal is to predict the actual price
of a house as accurately as possible?

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Q2 demo: fit an RBF-kernel SVR on synthetic housing data and compare MSE with R-squared.
# The data is synthetic and for demonstration only; replace it with your actual dataset.
np.random.seed(42)
num_samples = 200
X = pd.DataFrame({
    'location': np.random.rand(num_samples) * 100,  # Example feature (not used to build y)
    'square_footage': np.random.rand(num_samples) * 3000,
    'num_bedrooms': np.random.randint(1, 6, num_samples)
})
# Price = linear function of square footage and bedrooms plus Gaussian noise.
y = (X['square_footage'] * 300 + X['num_bedrooms'] * 20000 + np.random.randn(num_samples) * 5000).values

# 1. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 2. Standardise the features (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Standardise the target as well.
#    With an RBF kernel (values in (0, 1]) and C=1.0, an SVR prediction can differ from
#    the intercept by at most n_train * C (= 140 here), while prices span hundreds of
#    thousands. Fitting on the raw target would yield a near-constant predictor and an
#    R-squared of roughly zero or below, making the metric comparison meaningless.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()

# 4. Create the SVR regressor and train it on the scaled training data
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # epsilon is now in units of target std
svr.fit(X_train_scaled, y_train_scaled)

# 5. Predict test-set prices and map them back to the original price scale
y_pred_scaled = svr.predict(X_test_scaled)
y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# 6. Evaluate the regressor: MSE is in squared price units, R-squared is unitless
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R²): {r_squared:.2f}')


In [None]:
Q3. You have a dataset with a significant number of outliers and are trying to select an appropriate regression metric
to use with your SVM model. Which metric would be the most appropriate in this scenario?

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Q3 demo: fit an RBF-kernel SVR on synthetic housing data containing outliers and
# evaluate it with MAE, which weights every error linearly.
np.random.seed(42)
num_samples = 200

# Features
X = pd.DataFrame({
    'location': np.random.rand(num_samples) * 100,  # Example feature (not used to build y)
    'square_footage': np.random.rand(num_samples) * 3000,
    'num_bedrooms': np.random.randint(1, 6, num_samples)
})

# Target: linear function of square footage and bedrooms plus Gaussian noise
y = (X['square_footage'] * 300 + X['num_bedrooms'] * 20000 + np.random.randn(num_samples) * 5000).values

# Introduce outliers: inflate every 10th price by a random 20,000-49,999
y[::10] += np.random.randint(20000, 50000, size=y[::10].shape)

# 1. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 2. Standardise the features (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Standardise the target as well.
#    With an RBF kernel (values in (0, 1]) and C=1.0, an SVR prediction can differ from
#    the intercept by at most n_train * C (= 140 here), while prices span hundreds of
#    thousands. Without target scaling the model is effectively a constant predictor.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()

# 4. Create the SVR regressor and train it on the scaled training data
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # epsilon is now in units of target std
svr.fit(X_train_scaled, y_train_scaled)

# 5. Predict test-set prices and map them back to the original price scale
y_pred_scaled = svr.predict(X_test_scaled)
y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# 6. Evaluate the regressor using Mean Absolute Error (MAE), in price units
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Absolute Error (MAE): {mae:.2f}')


In [None]:
Q4. You have built an SVM regression model using a polynomial kernel and are trying to select the best
metric to evaluate its performance. You have calculated both MSE and RMSE and found that both values
are very close. Which metric should you choose to use in this case?

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error

# Q4 demo: fit a polynomial-kernel SVR on synthetic data and compare MSE with RMSE.
np.random.seed(42)
num_samples = 200
X = pd.DataFrame({
    'feature1': np.random.rand(num_samples) * 100,
    'feature2': np.random.rand(num_samples) * 50
})

# Target variable: linear in both features, with Gaussian noise
y = (X['feature1'] * 2 + X['feature2'] * 3 + np.random.randn(num_samples) * 10).values

# 1. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 2. Standardise the features (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Create the SVR regressor with a polynomial kernel and train it.
#    The polynomial kernel already performs the polynomial expansion implicitly, so the
#    features are passed in directly. Feeding it PolynomialFeatures output as well would
#    stack two degree-2 expansions (an effective degree-4 model).
#    coef0=1.0 makes the kernel inhomogeneous, i.e. (gamma * <x, x'> + 1)^2, so constant
#    and linear terms are modelled in addition to the quadratic ones.
svr = SVR(kernel='poly', degree=2, coef0=1.0, C=1.0, epsilon=0.1)
svr.fit(X_train_scaled, y_train)

# 4. Use the trained regressor to predict the target values of the testing data
y_pred = svr.predict(X_test_scaled)

# 5. Evaluate the performance using MSE and RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE, expressed in the target's units

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')

# RMSE is recommended because it is on the same scale as the target variable
print("Recommended Metric: RMSE")


In [None]:
Q5. You are comparing the performance of different SVM regression models using different kernels (linear,
polynomial, and RBF) and are trying to select the best evaluation metric. Which metric would be most
appropriate if your goal is to measure how well the model explains the variance in the target variable?

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Q5 demo: train one SVR per kernel on the same synthetic data and rank them by R-squared.
np.random.seed(42)
num_samples = 200
X = pd.DataFrame({
    'feature1': np.random.rand(num_samples) * 100,
    'feature2': np.random.rand(num_samples) * 50
})

# Noisy linear target built from the two features
y = (X['feature1'] * 2 + X['feature2'] * 3 + np.random.randn(num_samples) * 10).values

# 1. Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 2. Standardise the features using statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Kernels under comparison and a mapping kernel name -> test-set R-squared
kernels = ['linear', 'poly', 'rbf']
r2_scores = {}

for kernel in kernels:
    # 3. Build and fit the regressor for this kernel.
    #    `degree` only matters for the polynomial kernel; other kernels ignore it.
    poly_degree = 2 if kernel == 'poly' else 3
    svr = SVR(kernel=kernel, C=1.0, epsilon=0.1, degree=poly_degree)
    svr.fit(X_train_scaled, y_train)

    # 4. Predict the target values of the held-out data
    y_pred = svr.predict(X_test_scaled)

    # 5. Record and report the share of target variance explained
    r2 = r2_score(y_test, y_pred)
    r2_scores[kernel] = r2
    print(f'R-squared for {kernel} kernel: {r2:.4f}')

# Summary: the kernel with the highest R-squared (first one wins on ties)
best_kernel = max(kernels, key=lambda name: r2_scores[name])
print(f'\nBest performing kernel: {best_kernel} with R-squared: {r2_scores[best_kernel]:.4f}')
