In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, GridSearchCV
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set_style("dark")
import time

In [2]:
train = pd.read_csv("../data/train_small.csv")

In [3]:
train.columns

Index(['temperature', 'humidity', 'light', 'co2', 'humidityratio', 'target'], dtype='object')

In [4]:
cols = ['temperature', 'humidity', 'light', 'co2', 'humidityratio']

In [5]:
#train = train[cols]
x_train = train[cols]
y_train = train["target"]

In [6]:
sizes = np.linspace(0.1,1,10)
sizes = list(sizes)
sizes

[0.1,
 0.2,
 0.30000000000000004,
 0.4,
 0.5,
 0.6,
 0.7000000000000001,
 0.8,
 0.9,
 1.0]

In [7]:
cols[:len(cols)+1]

['temperature', 'humidity', 'light', 'co2', 'humidityratio']

In [8]:
sizes[0]

0.1

In [9]:
numbers = np.random.randint(0,high=len(train), size=round(len(train)*sizes[0]))

In [10]:
typ = "linear"
model = SVC(kernel=typ)
size1 = pd.DataFrame()
for i in range(len(sizes)):
    numbers = np.random.randint(0,high=len(train), size=round(len(train)*sizes[i]))
    new_x = x_train.iloc[numbers,:].reset_index(drop=True)
    new_y = y_train.iloc[numbers]
    start = time.time()
    model.fit(new_x, new_y)
    stop = time.time()
    elapsed=stop-start
    size1.loc[i, "size"] = sizes[i]*len(train)
    size1.loc[i, "model"] = "SVC_linear"
    size1.loc[i, "runtime"] = elapsed
    #size1.loc[i,"kernel"] = typ

In [11]:
size1

Unnamed: 0,size,model,runtime
0,157.8,SVC_linear,0.003872
1,315.6,SVC_linear,0.003944
2,473.4,SVC_linear,0.004671
3,631.2,SVC_linear,0.007923
4,789.0,SVC_linear,0.010077
5,946.8,SVC_linear,0.012252
6,1104.6,SVC_linear,0.01286
7,1262.4,SVC_linear,0.016449
8,1420.2,SVC_linear,0.01581
9,1578.0,SVC_linear,0.023595


In [12]:
typ = "rbf"
model = SVC(kernel=typ)
size2 = pd.DataFrame()
for i in range(len(sizes)):
    numbers = np.random.randint(0,high=len(train), size=round(len(train)*sizes[i]))
    new_x = x_train.iloc[numbers,:].reset_index(drop=True)
    new_y = y_train.iloc[numbers]
    start = time.time()
    model.fit(new_x, new_y)
    stop = time.time()
    elapsed=stop-start
    size2.loc[i, "size"] = sizes[i]*len(train)
    size2.loc[i, "model"] = "SVC_rbf"
    size2.loc[i, "runtime"] = elapsed
    #size2.loc[i,"kernel"] = typ

In [13]:
size2

Unnamed: 0,size,model,runtime
0,157.8,SVC_rbf,0.003669
1,315.6,SVC_rbf,0.003811
2,473.4,SVC_rbf,0.005566
3,631.2,SVC_rbf,0.008467
4,789.0,SVC_rbf,0.009884
5,946.8,SVC_rbf,0.013086
6,1104.6,SVC_rbf,0.013914
7,1262.4,SVC_rbf,0.016603
8,1420.2,SVC_rbf,0.021652
9,1578.0,SVC_rbf,0.025212


In [14]:
typ = "linear"
model = SVC(kernel=typ)
feat = pd.DataFrame()
for i in range(len(cols)):
    new_x = x_train.loc[:,cols[:i+1]]
    start = time.time()
    model.fit(new_x, y_train)
    stop = time.time()
    elapsed=stop-start
    feat.loc[i, "num_features"] = i + 1
    feat.loc[i, "model"] = "SVC_linear"
    feat.loc[i, "runtime"] = elapsed
    #feat.loc[i,"kernel"] = typ

In [15]:
feat

Unnamed: 0,num_features,model,runtime
0,1.0,SVC_linear,0.055846
1,2.0,SVC_linear,0.047085
2,3.0,SVC_linear,0.022382
3,4.0,SVC_linear,0.017987
4,5.0,SVC_linear,0.021519


In [16]:
typ = "rbf"
model = SVC(kernel=typ)
feat2 = pd.DataFrame()
for i in range(len(cols)):
    new_x = x_train.loc[:,cols[:i+1]]
    start = time.time()
    model.fit(new_x, y_train)
    stop = time.time()
    elapsed=stop-start
    feat2.loc[i, "num_features"] = i + 1
    feat2.loc[i, "model"] = "SVC_rbf"
    feat2.loc[i, "runtime"] = elapsed
    #feat2.loc[i,"kernel"] = typ

In [17]:
feat2

Unnamed: 0,num_features,model,runtime
0,1.0,SVC_rbf,0.120643
1,2.0,SVC_rbf,0.078686
2,3.0,SVC_rbf,0.019115
3,4.0,SVC_rbf,0.019261
4,5.0,SVC_rbf,0.023826


In [18]:
size_df = pd.concat([size1, size2], axis = 0).reset_index(drop=True)
size_df.to_csv("../results/runtime_size/svc.csv", index=False)

In [19]:
feat_df = pd.concat([feat, feat2], axis = 0).reset_index(drop=True)
feat_df.to_csv("../results/runtime_features/svc.csv", index=False)