## Support Vector Machine

## Q1. In order to predict house price based on several characteristics, such as location, square footage, number of bedrooms, etc., you are developing an SVM regression model. Which regression metric in this situation would be the best to employ?

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("Bengaluru_House_Data.csv")

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

## Data cleaning

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
df[df['location'].isnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
568,Super built-up Area,Ready To Move,,3 BHK,Grare S,1600,3.0,2.0,86.0


In [7]:
df.drop(index=568,inplace=True)

In [8]:
df['location'].isnull().sum()

0

In [9]:
df['size'] = df['size'].str.split(' ').str[0]

In [10]:
df['size']

0        2
1        4
2        3
3        3
4        2
        ..
13315    5
13316    4
13317    2
13318    4
13319    1
Name: size, Length: 13319, dtype: object

In [11]:
df.head(2)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0


In [12]:
df[df['size'].isnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
579,Plot Area,Immediate Possession,Sarjapur Road,,Asiss B,1200 - 2400,,,34.185
1775,Plot Area,Immediate Possession,IVC Road,,Orana N,2000 - 5634,,,124.0
2264,Plot Area,Immediate Possession,Banashankari,,,2400,,,460.0
2809,Plot Area,Immediate Possession,Sarjapur Road,,AsdiaAr,1200 - 2400,,,28.785
2862,Plot Area,Immediate Possession,Devanahalli,,Ajleyor,1500 - 2400,,,46.8
5333,Plot Area,Immediate Possession,Devanahalli,,Emngs S,2100 - 5405,,,177.115
6423,Plot Area,Immediate Possession,Whitefield,,SRniaGa,2324,,,26.73
6636,Plot Area,Immediate Possession,Jigani,,S2enste,1500,,,25.49
6719,Plot Area,Immediate Possession,Hoskote,,SJowsn,800 - 2660,,,28.545
7680,Plot Area,Immediate Possession,Kasavanhalli,,,5000,,,400.0


In [13]:
df['size'] = df['size'].fillna(0)

In [14]:
df[df['size'].isnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price


In [15]:
df['size'] = df['size'].astype(int)

In [16]:
print(df['size'].unique())

[ 2  4  3  6  1  8  7  5 11  9  0 27 10 19 16 43 14 12 13 18]


In [17]:
df['size'].mode()

0    2
Name: size, dtype: int32

In [18]:
df.head(2)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13319 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13319 non-null  object 
 1   availability  13319 non-null  object 
 2   location      13319 non-null  object 
 3   size          13319 non-null  int32  
 4   society       7817 non-null   object 
 5   total_sqft    13319 non-null  object 
 6   bath          13246 non-null  float64
 7   balcony       12710 non-null  float64
 8   price         13319 non-null  float64
dtypes: float64(3), int32(1), object(5)
memory usage: 988.5+ KB


In [20]:
df['total_sqft'] = df['total_sqft'].str.split(' - ').str[0]

In [21]:
print(df['total_sqft'].unique())

['1056' '2600' '1440' ... '2758' '774' '4689']


In [22]:
df['total_sqft'] = df['total_sqft'].str.split('.').str[0]

In [23]:
df['total_sqft'] = df['total_sqft'].str.replace('Perch','')

In [24]:
df['total_sqft'] = df['total_sqft'].str.replace('Sq','')

In [25]:
df['total_sqft'] = df['total_sqft'].str.replace('Acres','')

In [26]:
df['total_sqft'] = df['total_sqft'].str.replace('Cents','')

In [27]:
df['total_sqft'] = df['total_sqft'].str.replace('Guntha','')

In [28]:
df['total_sqft'] = df['total_sqft'].str.replace('Grounds','')

In [29]:
df['total_sqft'] = df['total_sqft'].astype(int)

In [30]:
df.head(2)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0


In [31]:
df.corr

<bound method DataFrame.corr of                   area_type   availability                  location  size  \
0      Super built-up  Area         19-Dec  Electronic City Phase II     2   
1                Plot  Area  Ready To Move          Chikka Tirupathi     4   
2            Built-up  Area  Ready To Move               Uttarahalli     3   
3      Super built-up  Area  Ready To Move        Lingadheeranahalli     3   
4      Super built-up  Area  Ready To Move                  Kothanur     2   
...                     ...            ...                       ...   ...   
13315        Built-up  Area  Ready To Move                Whitefield     5   
13316  Super built-up  Area  Ready To Move             Richards Town     4   
13317        Built-up  Area  Ready To Move     Raja Rajeshwari Nagar     2   
13318  Super built-up  Area         18-Jun           Padmanabhanagar     4   
13319  Super built-up  Area  Ready To Move              Doddathoguru     1   

       society  total_sqft  bat

## Standardization

In [32]:
df['location'].unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], dtype=object)

In [33]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

In [34]:
## df1 = pd.DataFrame(encoder.fit_transform(df[['area_type','location']]).toarray(),columns=encoder.get_feature_names_out())

In [35]:
## df = pd.concat([df,df1],axis=1)

In [36]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2,,1200,2.0,1.0,51.0


In [37]:
df['bath'] = df['bath'].fillna(np.mean)

In [38]:
df['balcony'].unique()

array([ 1.,  3., nan,  2.,  0.])

In [39]:
df.drop(index=13316,inplace=True)

In [40]:
df['balcony'] = df['balcony'].fillna(np.mean)

In [41]:
df['bath'].unique()

array([2.0, 5.0, 3.0, 4.0, 6.0, 1.0, 9.0,
       <function mean at 0x00000219A101AEE0>, 8.0, 7.0, 11.0, 10.0, 14.0,
       27.0, 12.0, 16.0, 40.0, 15.0, 13.0, 18.0], dtype=object)

In [42]:
df['bath'] = df['bath'].str.replace('<function mean at 0x00000177CCE7BF70>','2.0')

In [43]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3,,1440,,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2,,1200,,1.0,51.0


In [44]:
x = df[['size','total_sqft']]
y = df['price']

In [45]:
df.corr()

Unnamed: 0,size,total_sqft,bath,price
size,1.0,0.345191,,0.397156
total_sqft,0.345191,1.0,,0.573143
bath,,,,
price,0.397156,0.573143,,1.0


In [46]:
from sklearn.model_selection import train_test_split
x_train,x_teat,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=12)

In [47]:
from sklearn.svm import SVR
svr = SVR()

In [48]:
svr.fit(x_train,y_train)

SVR()

In [51]:
y_pred = svr.predict(x_teat)

In [52]:
y_pred

array([ 41.09988516,  87.90420322,  55.59124341, ..., 142.41145692,
       208.81499063, 104.97327996])

In [53]:
y_test

4161      40.0
11640     90.0
746       52.0
12260    470.0
7598     250.0
         ...  
8343     320.0
3618      70.0
3570     139.0
10250    501.0
4462     100.0
Name: price, Length: 4395, dtype: float64

In [54]:
from sklearn.metrics import r2_score,mean_absolute_error
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

42.50746300391551
0.30162656287570544
