# **Support Vector Machine-3**

In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [96]:
df = pd.read_csv("bengaluru_house_data.csv")

In [97]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [98]:
df['size'] = df['size'].str.split(" ").str[0]

In [99]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [100]:
df.shape

(13320, 9)

In [101]:
# Create a mapping dictionary
area_type_mapping = {value: index for index, value in enumerate(df['area_type'].unique())}

# Map the strings to numbers
df['area_type'] = df['area_type'].map(area_type_mapping)

df.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,1,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0
2,2,Ready To Move,Uttarahalli,3,,1440,2.0,3.0,62.0
3,0,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0
4,0,Ready To Move,Kothanur,2,,1200,2.0,1.0,51.0


In [102]:
# Define a function to handle the splitting
def split_availability(value):
    if ' ' not in value and '-' in value:
        return value.split('-')[1]
    return value

# Apply the function to the 'availability' column
df['availability'] = df['availability'].apply(split_availability)

df.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,1,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0
2,2,Ready To Move,Uttarahalli,3,,1440,2.0,3.0,62.0
3,0,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0
4,0,Ready To Move,Kothanur,2,,1200,2.0,1.0,51.0


In [103]:
# Custom mapping dictionary
availability_mapping = {
    'Ready To Move': 0,
    'Immediate Possession': 1,
    'Jan':2,
    'Feb': 3,
    'Mar': 4,
    'Apr': 5,
    'May': 6,
    'Jun': 7,
    'Jul': 8,
    'Aug': 9,
    'Sep': 10,
    'Oct': 11,
    'Nov': 12,
    'Dec': 13,
}

# Map the custom values to numbers
df['availability_numeric'] = df['availability'].map(availability_mapping)




In [104]:
rows_with_nan = df[df['location'].isna()]
rows_with_nan

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,availability_numeric
568,0,Ready To Move,,3,Grare S,1600,3.0,2.0,86.0,0


In [105]:
df['location'].replace('', np.nan, inplace=True)

df.dropna(subset=['location'], inplace=True)

df['location'].isnull().sum()

0

In [106]:
df.isnull().sum()

area_type                  0
availability               0
location                   0
size                      16
society                 5502
total_sqft                 0
bath                      73
balcony                  609
price                      0
availability_numeric       0
dtype: int64

In [107]:
df['size'].fillna(df['size'].mode()[0], inplace=True)


df['society'].fillna('Not Available', inplace=True)


df['bath'].fillna(df['bath'].median(), inplace=True)
df['balcony'].fillna(df['balcony'].median(), inplace=True)

In [108]:
df.isnull().sum()

area_type               0
availability            0
location                0
size                    0
society                 0
total_sqft              0
bath                    0
balcony                 0
price                   0
availability_numeric    0
dtype: int64

In [109]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price', 'availability_numeric'],
      dtype='object')

In [110]:
from sklearn.preprocessing import LabelEncoder

In [111]:
label_encoder = LabelEncoder()
df['location'] = label_encoder.fit_transform(df['area_type'])
df['society'] = label_encoder.fit_transform(df['availability'])

In [112]:
def convert_sqft_to_num(x):
    if '-' in x:
        tokens = x.split('-')
        return (float(tokens[0]) + float(tokens[1])) / 2
    try:
        return float(x)
    except:
        return None
    
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

In [113]:
df.dropna(subset=['total_sqft'],inplace=True)

In [114]:
df['total_sqft'].isnull().sum()

0

In [115]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,availability_numeric
0,0,Dec,0,2,2,1056.0,2.0,1.0,39.07,13
1,1,Ready To Move,1,4,12,2600.0,5.0,3.0,120.0,0
2,2,Ready To Move,2,3,12,1440.0,2.0,3.0,62.0,0
3,0,Ready To Move,0,3,12,1521.0,3.0,1.0,95.0,0
4,0,Ready To Move,0,2,12,1200.0,2.0,1.0,51.0,0


In [116]:
X = df[['location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'area_type',
       'availability_numeric']]

In [117]:
X

Unnamed: 0,location,size,society,total_sqft,bath,balcony,area_type,availability_numeric
0,0,2,2,1056.0,2.0,1.0,0,13
1,1,4,12,2600.0,5.0,3.0,1,0
2,2,3,12,1440.0,2.0,3.0,2,0
3,0,3,12,1521.0,3.0,1.0,0,0
4,0,2,12,1200.0,2.0,1.0,0,0
...,...,...,...,...,...,...,...,...
13315,2,5,12,3453.0,4.0,0.0,2,0
13316,0,4,12,3600.0,5.0,2.0,0,0
13317,2,2,12,1141.0,2.0,1.0,2,0
13318,0,4,7,4689.0,4.0,1.0,0,7


In [118]:
y = df['price']

In [119]:
y

0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13315    231.00
13316    400.00
13317     60.00
13318    488.00
13319     17.00
Name: price, Length: 13274, dtype: float64

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
svm_model = SVR(kernel='rbf')  


In [122]:
X.isnull().sum()

location                0
size                    0
society                 0
total_sqft              0
bath                    0
balcony                 0
area_type               0
availability_numeric    0
dtype: int64

In [123]:
svm_model.fit(X_train, y_train)

In [124]:
from sklearn.metrics import r2_score

In [125]:
y_pred = svm_model.predict(X_test)
print(r2_score(y_test,y_pred))

0.24691310496765384


In [126]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 22823.96797449101


In [127]:
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel':['linear'],
              'epsilon':[0.1,0.2,0.3]
              }

In [128]:
grid=GridSearchCV(SVR(),param_grid=param_grid,refit=True,cv=5,verbose=3)

In [129]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END C=0.1, epsilon=0.1, gamma=1, kernel=linear;, score=0.473 total time= 3.6min
[CV 2/5] END C=0.1, epsilon=0.1, gamma=1, kernel=linear;, score=0.192 total time= 4.0min
[CV 3/5] END C=0.1, epsilon=0.1, gamma=1, kernel=linear;, score=0.338 total time= 3.9min
[CV 4/5] END C=0.1, epsilon=0.1, gamma=1, kernel=linear;, score=0.227 total time= 3.3min
[CV 5/5] END C=0.1, epsilon=0.1, gamma=1, kernel=linear;, score=0.355 total time= 5.9min
[CV 1/5] END C=0.1, epsilon=0.1, gamma=0.1, kernel=linear;, score=0.473 total time= 3.5min
[CV 2/5] END C=0.1, epsilon=0.1, gamma=0.1, kernel=linear;, score=0.192 total time= 3.9min
[CV 3/5] END C=0.1, epsilon=0.1, gamma=0.1, kernel=linear;, score=0.338 total time= 3.5min
[CV 4/5] END C=0.1, epsilon=0.1, gamma=0.1, kernel=linear;, score=0.227 total time= 2.9min
[CV 5/5] END C=0.1, epsilon=0.1, gamma=0.1, kernel=linear;, score=0.355 total time= 5.2min
[CV 1/5] END C=0.1, epsilon=0.1, gamma

In [None]:
grid.best_params_

In [None]:
from sklearn.metrics import r2_score
y_pred4=grid.predict(X_test)
print(r2_score(y_test,y_pred4))


### Q1. Best Regression Metric for SVM Regression Model

**Question**: In order to predict house prices based on several characteristics using an SVM regression model, which regression metric would be the best to employ?

**Answer**: For predicting house prices with an SVM regression model, the **Mean Squared Error (MSE)** would be a suitable regression metric to employ. MSE measures the average squared difference between predicted and actual values, giving a good measure of how close the predictions are to the actual values.



### Q2. MSE vs. R-squared for Predicting House Prices

**Question**: You have built an SVM regression model and are deciding between using MSE or R-squared as your evaluation metric. Which metric would be more appropriate if your goal is to predict the actual price of a house as accurately as possible?

**Answer**: If your goal is to predict the actual price of a house as accurately as possible, then **MSE (Mean Squared Error)** would be more appropriate. MSE directly measures the average squared difference between predicted and actual values, giving a clear indication of how far off your predictions are on average.

### Q3. Regression Metric for Dataset with Outliers

**Question**: You have a dataset with a significant number of outliers and need to select an appropriate regression metric for your SVM model. Which metric would be the most appropriate in this scenario?

**Answer**: When dealing with a dataset that has significant outliers, **Mean Absolute Error (MAE)** would be the most appropriate regression metric. MAE is less sensitive to outliers compared to MSE because it measures the average absolute difference between predicted and actual values.

### Q4. Choosing Between MSE and RMSE for SVM with Polynomial Kernel

**Question**: You have built an SVM regression model using a polynomial kernel and are trying to select the best metric to evaluate its performance. Both MSE and RMSE values are very close. Which metric should you choose?

**Answer**: Since MSE and RMSE (Root Mean Squared Error) are very close in this case, **MSE (Mean Squared Error)** would generally be the preferred choice because it directly reflects the average squared difference between predicted and actual values without the additional step of taking the square root as in RMSE.

### Q5. Best Evaluation Metric for Comparing SVM Regression Models with Different Kernels

**Question**: You are comparing the performance of different SVM regression models using linear, polynomial, and RBF kernels. Which metric would be most appropriate if your goal is to measure how well the model explains the variance in the target variable?

**Answer**: When comparing models to measure how well they explain the variance in the target variable (i.e., how well they fit the data), **R-squared (Coefficient of Determination)** would be the most appropriate metric. R-squared indicates the proportion of the variance in the dependent variable that is predictable from the independent variables. A higher R-squared value indicates a better fit of the model to the data.

These answers provide a structured approach to selecting appropriate regression metrics based on different scenarios in SVM regression modeling. Adjust the SVM model parameters and data handling according to your specific dataset and requirements.