In [1]:
import csv
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
def read_points(dataset_name, label_col_number):
    """Load a headered, comma-separated dataset into feature and label arrays.

    The first record of the file is treated as a header and discarded.
    Every remaining non-blank record contributes one row: the cell at
    ``label_col_number`` becomes the label (kept as the raw string) and all
    other cells are converted with ``float`` to form the feature vector.

    Args:
        dataset_name: Path of the CSV file to read.
        label_col_number: Column index of the label. Negative values count
            from the end of each row, as with ordinary sequence indexing.

    Returns:
        A ``(points, labels)`` tuple of numpy arrays built from the parsed
        feature rows and the label strings respectively.

    Raises:
        ValueError: If a feature cell cannot be parsed by ``float``.
        IndexError: If a row has no column at ``label_col_number``.
    """
    points = []
    labels = []
    with open(dataset_name, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',')
        # Discard the header record; the default keeps an empty file from
        # raising StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            # csv.reader yields an empty list for a blank line; there is
            # nothing to parse, and indexing it would raise IndexError.
            if not row:
                continue
            # Resolve a negative column number to its positional index so the
            # label cell is excluded from the features instead of being fed
            # to float().
            label_index = label_col_number
            if label_index < 0:
                label_index += len(row)
            points.append(
                [float(cell) for index, cell in enumerate(row) if index != label_index]
            )
            labels.append(row[label_col_number])
    return np.array(points), np.array(labels)

In [3]:
def split(points, labels, validate_fraction=0.2, seed=None):
    """Randomly partition a dataset into training and validation parts.

    Args:
        points: Indexable collection of samples.
        labels: Indexable collection of labels, aligned with ``points``.
        validate_fraction: Share of the samples held out for validation.
            The hold-out size is ``int(len(points) * validate_fraction)``;
            the default of 0.2 keeps the original one-in-five split.
        seed: Optional seed. When given, the hold-out set is drawn from a
            private ``random.Random(seed)`` so the split is reproducible and
            the module-level generator is left untouched. When ``None``, the
            module-level ``random`` generator is used.

    Returns:
        A tuple ``(train_points, train_labels, validate_points,
        validate_labels)`` of plain lists. Training items are in ascending
        index order; validation items are in the order they were sampled.

    Raises:
        ValueError: If ``validate_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= validate_fraction <= 1:
        raise ValueError("validate_fraction must be between 0 and 1")

    total = len(points)
    rng = random if seed is None else random.Random(seed)
    validate_indexes = rng.sample(range(total), int(total * validate_fraction))

    # Membership set for O(1) lookups; iterating range() rather than a set
    # keeps the training order deterministic.
    held_out = set(validate_indexes)
    train_indexes = [i for i in range(total) if i not in held_out]

    return (
        [points[i] for i in train_indexes],
        [labels[i] for i in train_indexes],
        [points[i] for i in validate_indexes],
        [labels[i] for i in validate_indexes],
    )

In [4]:
from time import time
def chech_time(clf, train_points, train_labels, validate_points, validate_labels):
    """Fit a classifier, timing the fit, and score it on validation data.

    The (misspelled) name is kept as-is because other cells call it.

    Args:
        clf: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        train_points: Samples passed to ``clf.fit``.
        train_labels: Labels passed to ``clf.fit``.
        validate_points: Samples passed to ``clf.score``.
        validate_labels: Labels passed to ``clf.score``.

    Returns:
        A tuple ``(accuracy, seconds)`` where ``accuracy`` is
        ``clf.score(...) * 100`` rounded to the nearest whole number and
        ``seconds`` is the time spent inside ``clf.fit`` only; scoring is
        not included in the measurement.
    """
    # default_timer is the clock the timeit module uses for benchmarking and
    # is available on both Python 2 and 3; imported here so this cell does
    # not depend on a change to the notebook's import cell.
    from timeit import default_timer

    start = default_timer()
    clf.fit(train_points, train_labels)
    elapsed = default_timer() - start

    accuracy = round(clf.score(validate_points, validate_labels) * 100)
    return accuracy, elapsed

In [5]:
def add_accuracy(accuracy, time, accuracy_times, text):
    """Record one timing line under its accuracy bucket.

    Args:
        accuracy: Key of the bucket the entry is filed under.
        time: Elapsed value; it is rendered with ``str`` into the entry.
        accuracy_times: Dict mapping an accuracy to a list of entry strings.
            It is modified in place.
        text: Description placed at the start of the entry.

    Returns:
        The same ``accuracy_times`` dict that was passed in.
    """
    entry = "".join([text, ": time = ", str(time), " seconds."])
    # setdefault creates the bucket on first use, then the entry is appended.
    accuracy_times.setdefault(accuracy, []).append(entry)
    return accuracy_times

In [9]:
def compare_SVM_RandomForest_time(dataset_name, label_col_number, seed=None):
    """Train several SVMs and random forests on one dataset and report them.

    The dataset is loaded with ``read_points`` and divided with ``split``.
    Each classifier is fitted and scored through ``chech_time``, and the
    results are printed grouped by accuracy in ascending order, one line per
    classifier with its fit time.

    Args:
        dataset_name: Path of the CSV file, forwarded to ``read_points``.
        label_col_number: Label column index, forwarded to ``read_points``.
        seed: Optional seed for reproducible runs. When given, it seeds the
            module-level ``random`` generator before the split and is passed
            as ``random_state`` to every random forest. When ``None`` nothing
            is seeded.

    Returns:
        None. The report is written with ``print``.
    """
    # (report label, SVC keyword arguments), in training order. Labels are
    # kept as literals so the printed report text stays exactly the same.
    svm_configs = [
        ("SVM, C=0.01, poly, deg=2", dict(C=0.01, kernel='poly', degree=2)),
        ("SVM, C=0.01, linear", dict(C=0.01, kernel='linear')),
        ("SVM, C=0.001, poly, deg=2", dict(C=0.001, kernel='poly', degree=2)),
        ("SVM, C=0.001, linear", dict(C=0.001, kernel='linear')),
        ("SVM, C=0.0001, poly, deg=2", dict(C=0.0001, kernel='poly', degree=2)),
        ("SVM, C=0.0001, linear", dict(C=0.0001, kernel='linear')),
        ("SVM, C=1, rbf", dict(C=1, kernel='rbf')),
        ("SVM, C=10, rbf", dict(C=10, kernel='rbf')),
    ]

    if seed is not None:
        # split() draws from the module-level generator by default.
        random.seed(seed)

    points, labels = read_points(dataset_name, label_col_number)
    # (train_points, train_labels, validate_points, validate_labels)
    datasets = split(points, labels)
    accuracy_times = {}

    for label, svm_params in svm_configs:
        classifier = SVC(gamma='auto', **svm_params)
        accuracy, elapsed = chech_time(classifier, *datasets)
        accuracy_times = add_accuracy(accuracy, elapsed, accuracy_times, label)
        print("trained for " + label)

    for depth in range(1, 15):
        forest = RandomForestClassifier(max_depth=depth, n_estimators=10, random_state=seed)
        accuracy, elapsed = chech_time(forest, *datasets)
        accuracy_times = add_accuracy(accuracy, elapsed, accuracy_times, "RandomForest, depth=" + str(depth))

    for key in sorted(accuracy_times.keys()):
        # Accuracy keys are whole percentages; print them as a 0..1 fraction.
        print("Accuracy = " + str(key * 1.0 / 100))
        for elem in accuracy_times[key]:
            print("    " + elem)
        

In [10]:
# Run the SVM vs. random forest comparison on the cancer dataset, taking
# column 0 as the label column.
compare_SVM_RandomForest_time("../Datasets/cancer.csv", 0)

trained for SVM, C=0.01, poly, deg=2
trained for SVM, C=0.01, linear
trained for SVM, C=0.001, poly, deg=2
trained for SVM, C=0.001, linear
trained for SVM, C=0.0001, poly, deg=2
trained for SVM, C=0.0001, linear
trained for SVM, C=1, rbf
trained for SVM, C=10, rbf
Accuracy = 0.66
    SVM, C=1, rbf: time = 0.0122139453888 seconds.
    SVM, C=10, rbf: time = 0.0129868984222 seconds.
Accuracy = 0.92
    RandomForest, depth=1: time = 0.0239849090576 seconds.
Accuracy = 0.94
    SVM, C=0.001, linear: time = 0.00568222999573 seconds.
Accuracy = 0.95
    SVM, C=0.01, linear: time = 0.0323100090027 seconds.
    RandomForest, depth=7: time = 0.0399169921875 seconds.
    RandomForest, depth=13: time = 0.026535987854 seconds.
Accuracy = 0.96
    SVM, C=0.001, poly, deg=2: time = 4.93879008293 seconds.
    SVM, C=0.0001, poly, deg=2: time = 1.07050204277 seconds.
    SVM, C=0.0001, linear: time = 0.00377488136292 seconds.
    RandomForest, depth=2: time = 0.0238909721375 seconds.
    RandomForest

In [11]:
# Run the same comparison on the spam dataset, taking column 57 as the
# label column.
compare_SVM_RandomForest_time("../Datasets/spam.csv", 57)

trained for SVM, C=0.01, poly, deg=2
trained for SVM, C=0.01, linear
trained for SVM, C=0.001, poly, deg=2
trained for SVM, C=0.001, linear
trained for SVM, C=0.0001, poly, deg=2
trained for SVM, C=0.0001, linear
trained for SVM, C=1, rbf
trained for SVM, C=10, rbf
Accuracy = 0.76
    SVM, C=0.0001, linear: time = 0.905840158463 seconds.
Accuracy = 0.82
    SVM, C=0.0001, poly, deg=2: time = 10.0332610607 seconds.
    RandomForest, depth=1: time = 0.031347990036 seconds.
Accuracy = 0.85
    SVM, C=1, rbf: time = 0.849681854248 seconds.
    RandomForest, depth=2: time = 0.0319418907166 seconds.
Accuracy = 0.86
    SVM, C=0.001, linear: time = 1.33355903625 seconds.
Accuracy = 0.87
    SVM, C=10, rbf: time = 1.12715005875 seconds.
Accuracy = 0.88
    SVM, C=0.001, poly, deg=2: time = 1483.44936299 seconds.
Accuracy = 0.89
    RandomForest, depth=3: time = 0.036957025528 seconds.
Accuracy = 0.9
    SVM, C=0.01, linear: time = 8.16068792343 seconds.
    RandomForest, depth=4: time = 0.0392