In [1]:
# Importing required libraries
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('./smoke.csv') #importing dataset
df.head() # displaying first five records

Unnamed: 0.1,Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
0,0,1654733331,20.0,57.36,0,400,12306,18520,939.735,0.0,0.0,0.0,0.0,0.0,0,0
1,1,1654733332,20.015,56.67,0,400,12345,18651,939.744,0.0,0.0,0.0,0.0,0.0,1,0
2,2,1654733333,20.029,55.96,0,400,12374,18764,939.738,0.0,0.0,0.0,0.0,0.0,2,0
3,3,1654733334,20.044,55.28,0,400,12390,18849,939.736,0.0,0.0,0.0,0.0,0.0,3,0
4,4,1654733335,20.059,54.69,0,400,12403,18921,939.744,0.0,0.0,0.0,0.0,0.0,4,0


In [3]:
# replacing null values with 0
df = df.fillna(value= 0)

In [4]:
# Shuffling dataset ( to remove any biases in the dataset if any )
df = df.sample(frac=1)

In [5]:
new_df=df[['UTC', 'Temperature[C]', 'Humidity[%]', 'TVOC[ppb]', 'eCO2[ppm]', 'Raw H2','Raw Ethanol', 'Pressure[hPa]', 'PM1.0', 'PM2.5', 'NC0.5', 'NC1.0', 'NC2.5', 'CNT', 'Fire Alarm']]
new_df.head()

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
14816,1654748147,13.856,53.38,1136,465,12857,19440,938.841,1.64,1.7,11.26,1.756,0.04,14816,1
31129,1654767478,21.67,49.76,145,400,13185,20120,939.688,1.36,1.41,9.33,1.454,0.033,6135,1
26307,1654762656,17.23,49.77,0,400,13105,19988,939.719,0.46,0.48,3.19,0.497,0.011,1313,0
50979,1654903996,33.33,17.47,0,1050,13005,20846,930.926,0.97,1.0,6.65,1.037,0.023,991,1
430,1654733761,24.831,50.81,29,400,12720,19731,939.844,0.15,0.18,0.94,0.194,0.032,430,0


In [6]:
# converting type from object to string to be able to apply Label Encoder
new_df['Fire Alarm']=new_df['Fire Alarm'].astype(dtype='string',copy=True)

In [7]:
# converting catagorical column 'Fire Alarm' to numerical column by LabelEncoding
new_df['Fire Alarm'] = LabelEncoder().fit_transform(new_df['Fire Alarm'])

In [8]:
# changing data type of whole dataset to int
new_df = new_df.astype(int)

In [9]:
# splitting dataset into 70:30 ratio

# Defining train size
train_size = int(0.7 * len(new_df))

# Splitting dataset
train_set = new_df[:train_size]
test_set = new_df[train_size:]

In [10]:
# separating train_set into X and Y
X_train=train_set.drop('Fire Alarm', axis=1)
y_train=train_set['Fire Alarm']

# separating test_set into X and Y
X_test=test_set.drop('Fire Alarm', axis=1)
y_test=test_set['Fire Alarm']

In [11]:
class SVM:

	def __init__(self, kernel = "linear", learning_rate = 1e-4, regularization_strength = 1.0, max_iter = 2000):
 
		self.num_feats = int
		self.train_size = int
		self.weights = np.array 
		self.y_train = np.array 
		self.input_matrix = np.array

		self.kernel = kernel
		self.kernel_matrix = np.array
		self.support_vectors = np.array
		self.learning_rate = learning_rate 	# Learning rate for gradient descent
		self.regularization_strength = regularization_strength 	# Regularization parameter, to control bias-variance tradeoff
		self.max_iter = max_iter	# Maximum Number of iterations to run gradient descent
		self.cost_threshold = 0.1 * learning_rate  # stopping criterion for gradient descent

	def fit(self, X, y):

		""" Adjust weights to training data """

		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]
		self.input_matrix = np.append(X, np.ones(self.train_size).reshape(-1, 1), axis = 1)   #Add Column with Ones for intercept term 
		self.y_train = np.where(y == 0, -1, 1)
		self.weights = np.zeros(self.num_feats + 1) #Extra +1 for the intercept

		# optimize weights
		prev_cost = float("inf")
		for i in range(self.max_iter):
			cost = self._update_weights()
			
			if i%100 ==0 or i == self.max_iter:
				print("Cost after {} iterations is: {}".format(i, cost))
			if abs(prev_cost -cost) < self.cost_threshold*prev_cost:
				print("Cost after {} iterations is: {}".format(i, cost))
				break
			prev_cost = cost

	def _update_weights(self):

		"""
			Cost Function:
				l(w) = sum(max(0, 1 - y(wX + b))) + (lambda/2)(||w||)^2
				First Term is Hinge Loss, Second is Regularization term
			Gradient:
			    delta_w = dl/dw = (1/n) * ( if y(wX+b) < 1: -yX + lambda*w else: lambda*w )
			Gradient Descent
				w = w - (learning_rate * delta_w)
		"""
		y_pred = (self.weights * self.input_matrix).sum(axis = 1) # y_pred = wX+b


		dist = (1 - (self.y_train * y_pred))
		dist[dist < 0] = 0
		hinge_loss = np.sum(dist)/self.train_size
		regularization_term = (1/2) * self.regularization_strength * (np.dot(self.weights, self.weights))
		cost = hinge_loss  + regularization_term

		delta_w =  self.regularization_strength * (self.weights)

		for y, yhat, X in zip(self.y_train, y_pred, self.input_matrix):

			if y*yhat < 1:
				delta_w -= y*X

		delta_w /= self.train_size
		self.weights = self.weights - (self.learning_rate * delta_w)

		return cost

	def predict(self, X):

		""" Make predictions on given X using trained model """

		size = X.shape[0]
		X = np.append(X, np.ones(size).reshape(-1, 1), axis = 1)

		y_pred = np.sign((self.weights * X).sum(axis = 1))

		y_pred[np.where(y_pred == -1)] = 0.0

		return y_pred 

In [12]:
svm_clf = SVM(learning_rate = 1e-5, regularization_strength = 10.5)

svm_clf.fit(X_train, y_train)

Cost after 0 iterations is: 1.0
Cost after 100 iterations is: 875014370705.9464
Cost after 200 iterations is: 3990570685204.454
Cost after 300 iterations is: 1814433932081.0535
Cost after 400 iterations is: 2388469326843.182
Cost after 500 iterations is: 5504246675802.76
Cost after 600 iterations is: 786488104084.8137
Cost after 700 iterations is: 3902031514742.0396
Cost after 800 iterations is: 2035409992427.797
Cost after 900 iterations is: 2299936845750.3525
Cost after 1000 iterations is: 5415701197063.265
Cost after 1100 iterations is: 697962218560.6111
Cost after 1200 iterations is: 3813492725378.156
Cost after 1300 iterations is: 2256386391164.7305
Cost after 1400 iterations is: 2211404726994.403
Cost after 1500 iterations is: 5327156099423.418
Cost after 1600 iterations is: 609436714133.1702
Cost after 1700 iterations is: 3724954317112.6562
Cost after 1800 iterations is: 2477363128291.633
Cost after 1900 iterations is: 2122872914294.6018


In [13]:
print("Coefficients: {}".format(svm_clf.weights[:-1]))

Coefficients: [ 1.10667227e+04 -2.08004663e-02  3.25390442e-02 -1.51859018e+01
 -1.67441061e+00  3.42750048e-01 -1.73941696e+00  9.38540763e-03
 -9.17467643e-01 -1.51048795e+00 -4.91513774e+00 -1.65124016e+00
 -5.63054299e-01  4.65107888e+01]


In [14]:
print("Intercept: {}".format(svm_clf.weights[-1]))

Intercept: 6.923644977691104e-06


In [15]:
def accuracy_score(y_true, y_pred):

	"""	score = (y_true - y_pred) / len(y_true) """

	return round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)

In [16]:
print("Train Accuracy: {}".format(accuracy_score(y_train, svm_clf.predict(X_train))))
print("Test Accuracy: {}".format(accuracy_score(y_test, svm_clf.predict(X_test))))

Train Accuracy: 71.4
Test Accuracy: 71.61
