Importing the Libraries

In [392]:
import pandas as pd
from sklearn import svm
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn import metrics  

Read the Data File

In [393]:
read_data = pd.read_csv("/content/WIFIDATAcsv1.csv",nrows=200)

List of features in the dataset to be consider

In [394]:
applicable_features = [  
                       "ssid_name",
                       "phy_type_id",
                       "capabilities",
                       "channel_center_freq_khz",
                       "connection_mode",
                       "authentication",
                       "encryption",
                       "vendor_name",
                       "bssid",
                       "mac",
                        "result" ]

Filter out the above features in the dataset

In [395]:
read_data = read_data[applicable_features]
read_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ssid_name                200 non-null    object 
 1   phy_type_id              200 non-null    int64  
 2   capabilities             200 non-null    int64  
 3   channel_center_freq_khz  200 non-null    int64  
 4   connection_mode          200 non-null    object 
 5   authentication           200 non-null    object 
 6   encryption               200 non-null    object 
 7   vendor_name              200 non-null    object 
 8   bssid                    200 non-null    object 
 9   mac                      200 non-null    float64
 10  result                   200 non-null    int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 17.3+ KB


Review of Dataset

In [396]:
read_data.head()

Unnamed: 0,ssid_name,phy_type_id,capabilities,channel_center_freq_khz,connection_mode,authentication,encryption,vendor_name,bssid,mac,result
0,Shaun 2.4GHz,8,5393,5785000,auto,WPA2PSK,AES,Unknown,70037E3BE342,73500000000000.0,1
1,CB,7,5425,2462000,auto,WPA2,AES,"Officially Xerox, but 0:0:0:0:0:0 is more common",889E6846D879,134000000000000.0,1
2,SHAW-5C0570,7,3121,2462000,manual,open,none,XEROX CORPORATION,D8B6B72A8103,3850000000000.0,1
3,Madden Family,7,1041,2412000,auto,WPA2PSK,AES,OMRON TATEISI ELECTRONICS CO.,E4186BBE1136,59400000000000.0,1
4,Weeder 95,7,5425,2462000,auto,open,none,MATRIX CORPORATION,1033BFF55C6F,122000000000000.0,1


Feature hashing is used to reduce the dimensionality and handle sparsity of bit vectors.This will work per single Series, column name will be assumed to be a Pandas index. It also replaces blank strings with nan, and floats everything.

In [397]:
data = pd.DataFrame(read_data['authentication'])

def hash_col(df, col, N):
    cols = [col + "_" + str(i) for i in range(N)]
    def xform(x): tmp = [0 for i in range(N)]; tmp[hash(x) % N] = 1; return pd.Series(tmp,index=cols)
    df[cols] = df[col].apply(xform)
    return df.drop(col,axis=1)

print(hash_col(data, 'authentication',8))
read_data = pd.concat([read_data,data],axis=1)
print(read_data)
print(read_data.info())

     authentication_0  authentication_1  ...  authentication_6  authentication_7
0                   0                 0  ...                 0                 1
1                   0                 0  ...                 0                 0
2                   0                 0  ...                 0                 0
3                   0                 0  ...                 0                 1
4                   0                 0  ...                 0                 0
..                ...               ...  ...               ...               ...
195                 0                 0  ...                 0                 1
196                 0                 0  ...                 0                 0
197                 0                 0  ...                 0                 0
198                 0                 0  ...                 0                 1
199                 0                 0  ...                 0                 1

[200 rows x 8 columns]
    

In [398]:
data1 = pd.DataFrame(read_data['encryption'])

def hash_col(df, col, N):
    cols = [col + "_" + str(i) for i in range(N)]
    def xform(x): tmp = [0 for i in range(N)]; tmp[hash(x) % N] = 1; return pd.Series(tmp,index=cols)
    df[cols] = df[col].apply(xform)
    return df.drop(col,axis=1)

print(hash_col(data1, 'encryption',5))
read_data = pd.concat([read_data,data1],axis=1)

     encryption_0  encryption_1  encryption_2  encryption_3  encryption_4
0               0             0             0             1             0
1               0             0             0             1             0
2               0             0             0             1             0
3               0             0             0             1             0
4               0             0             0             1             0
..            ...           ...           ...           ...           ...
195             0             0             0             1             0
196             0             0             0             1             0
197             0             0             0             1             0
198             0             0             0             1             0
199             0             0             0             1             0

[200 rows x 5 columns]


In [399]:
data2 = pd.DataFrame(read_data['connection_mode'])

def hash_col(df, col, N):
    cols = [col + "_" + str(i) for i in range(N)]
    def xform(x): tmp = [0 for i in range(N)]; tmp[hash(x) % N] = 1; return pd.Series(tmp,index=cols)
    df[cols] = df[col].apply(xform)
    return df.drop(col,axis=1)

print(hash_col(data2, 'connection_mode',8))
read_data = pd.concat([read_data,data2],axis=1)

     connection_mode_0  connection_mode_1  ...  connection_mode_6  connection_mode_7
0                    0                  0  ...                  0                  0
1                    0                  0  ...                  0                  0
2                    0                  0  ...                  0                  0
3                    0                  0  ...                  0                  0
4                    0                  0  ...                  0                  0
..                 ...                ...  ...                ...                ...
195                  0                  0  ...                  0                  0
196                  0                  0  ...                  0                  0
197                  0                  0  ...                  0                  0
198                  0                  0  ...                  0                  0
199                  0                  0  ...                  0

Specifying the target data to detect outliers.

In [400]:
target=read_data['result']
outliers = target[target == 0]  
print("outliers.shape", outliers.shape)  
print("outlier fraction", outliers.shape[0]/target.shape[0])

read_data.info()

outliers.shape (77,)
outlier fraction 0.385
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ssid_name                200 non-null    object 
 1   phy_type_id              200 non-null    int64  
 2   capabilities             200 non-null    int64  
 3   channel_center_freq_khz  200 non-null    int64  
 4   connection_mode          200 non-null    object 
 5   authentication           200 non-null    object 
 6   encryption               200 non-null    object 
 7   vendor_name              200 non-null    object 
 8   bssid                    200 non-null    object 
 9   mac                      200 non-null    float64
 10  result                   200 non-null    int64  
 11  authentication           200 non-null    object 
 12  authentication_0         200 non-null    int64  
 13  authentication_1         200 non-nul

Adding the Feature hashing vectors to the features.

In [401]:
read_data.info()
applicable_features1 = [  
                      
                       "authentication_0",
                       "authentication_1",
                       "authentication_3",
                       "authentication_3",
                       "authentication_4",
                      "authentication_5",
                      "authentication_6",
                       "authentication_7",
                       "encryption_0",
                        "encryption_1",
                        "encryption_2",
                        "encryption_3",
                        "encryption_4",
                        "connection_mode_0",
                        "connection_mode_1",
                        "connection_mode_2",
                       "phy_type_id",
                       "capabilities",
                       "channel_center_freq_khz",
                       "mac",
                        "result" ]
#Normalize the data for effieciency
read_data['channel_center_freq_khz'] = read_data['channel_center_freq_khz'].astype(float)
read_data['mac'] = read_data['mac'].astype(float)
read_data['phy_type_id'] = read_data['phy_type_id'].astype(float)
read_data['capabilities'] = read_data['capabilities'].astype(float)
read_data['result'] = read_data['result'].astype(float)
read_data = read_data[applicable_features1]
read_data.info()

#Drop the target feature
read_data.drop(["result"], axis=1, inplace=True)
read_data.shape  
print(read_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ssid_name                200 non-null    object 
 1   phy_type_id              200 non-null    int64  
 2   capabilities             200 non-null    int64  
 3   channel_center_freq_khz  200 non-null    int64  
 4   connection_mode          200 non-null    object 
 5   authentication           200 non-null    object 
 6   encryption               200 non-null    object 
 7   vendor_name              200 non-null    object 
 8   bssid                    200 non-null    object 
 9   mac                      200 non-null    float64
 10  result                   200 non-null    int64  
 11  authentication           200 non-null    object 
 12  authentication_0         200 non-null    int64  
 13  authentication_1         200 non-null    int64  
 14  authentication_2         2

One Class Classification Model

In [402]:
#Split the training and test data using the train_test_split function
train_data, test_data, train_target, test_target = train_test_split(read_data, target, train_size = 0.8)  
train_data.shape  
print(test_data)

# set nu (which should be the proportion of outliers in our dataset)
nu = outliers.shape[0] / target.shape[0]  
print("The calculated values of nu is:", nu)

#Run the model for training
model = svm.OneClassSVM(nu=nu, kernel='rbf', gamma=0.00005)  
model.fit(train_data)  

#Predict the Values
values_preds = model.predict(train_data)  
values_targs = train_target

print("Training DataSET accuracy: ", 100 *  metrics.accuracy_score(values_targs, values_preds))
values_preds = model.predict(test_data)
print(values_preds)
#Reviewing the Test Target Data
print(test_target)
values_targs = test_target
print("Test DataSet Accuracy: ", 100 * metrics.accuracy_score(values_targs, values_preds))

#Store the output in the file
outputfile = 'one_class_svm_2.model'
from sklearn.externals import joblib  
joblib.dump(model, outputfile, compress=9) 

     authentication_0  authentication_1  ...  channel_center_freq_khz           mac
26                  0                 0  ...                2462000.0  2.040000e+12
121                 1                 0  ...                2462000.0  2.020000e+14
45                  0                 0  ...                2462000.0  1.010000e+14
53                  0                 0  ...                2600000.0  1.610000e+14
149                 0                 0  ...                2412000.0  1.220000e+14
154                 0                 0  ...                2462000.0  1.210000e+14
173                 0                 0  ...                2412000.0  2.730000e+14
140                 0                 0  ...                2462000.0  1.260000e+14
58                  0                 0  ...                2600000.0  1.480000e+14
193                 0                 0  ...                2437000.0  1.500000e+13
123                 0                 0  ...                2437000.0  2.470

['one_class_svm_2.model']