In [1]:
# Installing mlxtend
!pip install -Uqq mlxtend

In [None]:
# Import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.svm import SVC
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('Housing.csv')
df.shape

(75388, 23)

In [4]:
df.columns

Index(['SERIAL', 'DENSITY', 'OWNERSHP', 'OWNERSHPD', 'COSTELEC', 'COSTGAS',
       'COSTWATR', 'COSTFUEL', 'HHINCOME', 'ROOMS', 'BUILTYR2', 'BEDROOMS',
       'VEHICLES', 'NFAMS', 'NCOUPLES', 'PERNUM', 'PERWT', 'AGE', 'MARST',
       'BIRTHYR', 'EDUC', 'EDUCD', 'INCTOT'],
      dtype='object')

## Subsetting the data of elder people of the household

In [5]:
data = df.sort_values(['SERIAL', 'AGE'], ascending=[True, False]).drop_duplicates('SERIAL')
data = data.drop(['SERIAL', 'OWNERSHPD', 'PERNUM', 'PERWT', 'BIRTHYR', 'EDUCD'], axis=1)
data.shape

(30802, 17)

In [6]:
#housing_variables = ['DENSITY','COSTELEC','COSTGAS','COSTWATR','COSTFUEL', 'ROOMS', 'BUILTYR2', 'BEDROOMS', 'VEHICLES']
#people_variables = ['HHINCOME','AGE','MARST','EDUC','INCTOT']
#dwelling_variables = ['NFAMS', 'NCOUPLES']

In [7]:
data.columns

Index(['DENSITY', 'OWNERSHP', 'COSTELEC', 'COSTGAS', 'COSTWATR', 'COSTFUEL',
       'HHINCOME', 'ROOMS', 'BUILTYR2', 'BEDROOMS', 'VEHICLES', 'NFAMS',
       'NCOUPLES', 'AGE', 'MARST', 'EDUC', 'INCTOT'],
      dtype='object')

In [8]:
# checking for any NULL Values
data.isnull().any(axis=1).sum()

0

In [9]:
data.dtypes

DENSITY     float64
OWNERSHP      int64
COSTELEC      int64
COSTGAS       int64
COSTWATR      int64
COSTFUEL      int64
HHINCOME    float64
ROOMS         int64
BUILTYR2      int64
BEDROOMS      int64
VEHICLES      int64
NFAMS         int64
NCOUPLES      int64
AGE           int64
MARST         int64
EDUC          int64
INCTOT      float64
dtype: object

In [10]:
# Split data into X and y
X = data.drop('OWNERSHP', axis=1)
y = data['OWNERSHP']

In [11]:
# Preprocess categorical features
cat_cols = ['EDUC', 'MARST']
X_cat = X[cat_cols]
X_cat_enc = OneHotEncoder().fit_transform(X_cat)
X = X.drop(cat_cols, axis=1)
X = np.hstack((X.values, X_cat_enc.toarray()))

In [12]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale continuous features
scaler = StandardScaler().fit(X_train[:, :-4])
X_train[:, :-4] = scaler.transform(X_train[:, :-4])
X_test[:, :-4] = scaler.transform(X_test[:, :-4])

### LINEAR

##### Linear SVM with different COST values

In [13]:
C_values = [0.01, 0.1, 1, 10, 50, 100, 500, 1000]

for C in C_values:
    linear_svc = SVC(kernel='linear', C=C, cache_size=1000, verbose = True, max_iter = 1000, random_state=42)

    # Fit the model to the training data
    linear_svc.fit(X_train, y_train)

    # Evaluate the model on the testing data and print the accuracy score
    accuracy = linear_svc.score(X_test, y_test)
    print("Linear SVM with C=%s: %.2f%%" % (C, accuracy * 100))

[LibSVM]Linear SVM with C=0.01: 33.22%
[LibSVM]Linear SVM with C=0.1: 29.34%
[LibSVM]Linear SVM with C=1: 67.12%
[LibSVM]Linear SVM with C=10: 36.84%
[LibSVM]Linear SVM with C=50: 56.95%
[LibSVM]Linear SVM with C=100: 38.46%
[LibSVM]Linear SVM with C=500: 66.11%
[LibSVM]Linear SVM with C=1000: 66.11%


C = 1 gives better accuracy of 67.12%

##### Using Feature Selection

In [14]:
# Perform feature selection
k = 10  # number of top features to select
selector = SelectKBest(f_classif, k=k)
selector.fit(X_train, y_train)
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)

In [15]:
C_values = [0.01, 0.1, 1, 10, 50, 100, 500, 1000]

for C in C_values:
    linear_svc = SVC(kernel='linear', C=C, cache_size=1000, verbose = True, max_iter = 10000, random_state=42)

    # Fit the model to the training data
    linear_svc.fit(X_train_new, y_train)

    # Evaluate the model on the testing data and print the accuracy score
    accuracy = linear_svc.score(X_test_new, y_test)
    print("Linear SVM with C=%s: %.2f%%" % (C, accuracy * 100))

[LibSVM]Linear SVM with C=0.01: 84.21%
[LibSVM]Linear SVM with C=0.1: 84.32%
[LibSVM]Linear SVM with C=1: 83.26%
[LibSVM]Linear SVM with C=10: 50.29%
[LibSVM]Linear SVM with C=50: 60.30%
[LibSVM]Linear SVM with C=100: 34.08%
[LibSVM]Linear SVM with C=500: 39.30%
[LibSVM]Linear SVM with C=1000: 60.98%


Cost of 0.01 ands 0.1 gives more accuracy of 84.21%

In [None]:
linear_svc = SVC(kernel='linear', C=0.1, cache_size=1000, verbose = True, max_iter = 10000, random_state=42)

# Fit the model to the training data
linear_svc.fit(X_train_new, y_train)

# Evaluate the model on the testing data and print the accuracy score
accuracy = linear_svc.score(X_test_new, y_test)
print("Linear SVM with C=0.1: %.2f%%" % (accuracy * 100))

result = permutation_importance(linear_svc, X_train_new, y_train, n_repeats=100000, random_state=42)
sorted_idx = result.importances_mean.argsort()

[LibSVM]Linear SVM with C=0.1: 84.32%


In [None]:
# Plot feature importance
plt.barh(range(X.shape[1]), result.importances_mean[sorted_idx])
plt.yticks(range(X.shape[1]), X.columns[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.show()

# Plot decision boundary
h = .02  # step size in the mesh
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

Z = svm_model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Boundary')
plt.show()

### RADIAL

In [None]:
rbf_svc = SVC(kernel='rbf', cache_size=1000, max_iter=10000, random_state=42)
rbf_svc.fit(X_train, y_train)

y_pred_rbf = rbf_svc.predict(X_test)

accuracy_rbf = accuracy_score(y_test, y_pred_rbf)

print('Accuracy with radial kernel:', accuracy_rbf)

In [None]:
# Different cost values
C_values = [0.01, 0.1, 1, 10, 100, 1000]

for c in C_values:
    rbf_svc = SVC(kernel='rbf', C=c, cache_size=1000, verbose=True, max_iter=10000, random_state=42)

    # Fit the model to the training data
    rbf_svc.fit(X_train, y_train)

    # Evaluate the model on the testing data and print the accuracy score
    accuracy = rbf_svc.score(X_test, y_test)
    print("RBF SVM with C=%s: %.2f%%" % (c, accuracy * 100))
