In [1]:
import csv
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
def read_points(dataset_name, label_col_number):
    """Load a comma-delimited CSV file with a header row into features and labels.

    The first record of the file is treated as a header and discarded. For
    every following non-empty row, the cell at ``label_col_number`` is kept
    verbatim (as a string) as the label and every other cell is parsed with
    ``float`` to form the feature vector.

    Args:
        dataset_name: Path of the CSV file to read.
        label_col_number: Index of the label column. Negative values count
            from the end of the row, like ordinary Python indexing.

    Returns:
        A ``(points, labels)`` tuple of ``numpy`` arrays built from the
        per-row feature lists and the label strings, in file order.

    Raises:
        ValueError: If a non-label cell cannot be parsed by ``float``.
        IndexError: If ``label_col_number`` is out of range for a row.
    """
    points = []
    labels = []
    with open(dataset_name, 'r') as fin:
        reader = csv.reader(fin, delimiter=',')
        # Drop the header record explicitly instead of comparing line numbers.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines; there is nothing to parse.
            if not row:
                continue
            # Index first so an out-of-range column fails with IndexError.
            label = row[label_col_number]
            # Normalise negative indexes so the label cell is really excluded
            # from the features (``i != -1`` would never be false).
            label_index = label_col_number % len(row)
            points.append(
                [float(cell) for i, cell in enumerate(row) if i != label_index]
            )
            labels.append(label)
    return np.array(points), np.array(labels)

In [3]:
def split(points, labels, seed=None):
    """Randomly hold out one fifth of the samples for validation.

    ``len(points) // 5`` distinct indexes are drawn for the validation set;
    all remaining indexes form the training set, kept in ascending order.

    Args:
        points: Indexable sequence of samples.
        labels: Indexable sequence of labels, aligned with ``points``.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            draws the validation indexes so the split is reproducible and the
            global ``random`` state is left alone. When ``None`` (default) the
            module-level ``random`` generator is used, as before.

    Returns:
        A tuple ``(train_points, train_labels, validate_points,
        validate_labels)`` of four lists.
    """
    total = len(points)
    sampler = random if seed is None else random.Random(seed)
    validate_indexes = sampler.sample(range(total), total // 5)

    # Use the set only for membership tests; iterate over range() so the
    # training order never depends on set iteration order.
    held_out = set(validate_indexes)
    train_indexes = [i for i in range(total) if i not in held_out]

    return (
        [points[i] for i in train_indexes],
        [labels[i] for i in train_indexes],
        [points[i] for i in validate_indexes],
        [labels[i] for i in validate_indexes],
    )

In [4]:
from time import time
from timeit import default_timer


def chech_time(clf, train_points, train_labels, validate_points, validate_labels):
    """Fit a classifier, time the fit, and score it on the validation data.

    Only the ``clf.fit`` call is timed; scoring happens after the clock has
    been read. ``timeit.default_timer`` is used rather than wall-clock
    ``time()`` because the fits measured here can take only a few
    milliseconds.

    Args:
        clf: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        train_points: Training samples passed to ``clf.fit``.
        train_labels: Training labels passed to ``clf.fit``.
        validate_points: Samples passed to ``clf.score``.
        validate_labels: Labels passed to ``clf.score``.

    Returns:
        A tuple ``(accuracy, seconds)`` where ``accuracy`` is
        ``clf.score(...) * 100`` rounded to a whole number and ``seconds`` is
        the elapsed time of the ``fit`` call.
    """
    start = default_timer()
    clf.fit(train_points, train_labels)
    elapsed = default_timer() - start
    accuracy = round(clf.score(validate_points, validate_labels) * 100)
    return accuracy, elapsed

In [5]:
def add_accuracy(accuracy, time, accuracy_times, text):
    """Record one timing line under its accuracy bucket.

    Builds the string ``"<text>: time = <time> seconds."`` and appends it to
    the list stored at ``accuracy_times[accuracy]``, creating that list when
    the key is not present yet.

    Args:
        accuracy: Dictionary key to file the entry under.
        time: Duration to report; converted with ``str``.
        accuracy_times: Dictionary mapping accuracy to a list of text lines.
            It is modified in place.
        text: Description of the run, used as the line prefix.

    Returns:
        The same ``accuracy_times`` dictionary that was passed in.
    """
    line = "".join((text, ": time = ", str(time), " seconds."))
    accuracy_times.setdefault(accuracy, []).append(line)
    return accuracy_times

In [6]:
def compare_SVM_RandomForest_time(dataset_name, label_col_number):
    """Train several SVM and random-forest models on one dataset and print a report.

    The dataset is read with ``read_points`` and divided once with ``split``;
    every model is fitted on the same training part and scored on the same
    validation part via ``chech_time``. A progress line is printed after each
    SVM run. Finally the results are printed grouped by accuracy in ascending
    order, one indented line per model with its fit time.

    Args:
        dataset_name: Path of the CSV file, forwarded to ``read_points``.
        label_col_number: Index of the label column, forwarded to
            ``read_points``.

    Returns:
        None. All results are written to standard output.
    """
    points, labels = read_points(dataset_name, label_col_number)
    # (train_points, train_labels, validate_points, validate_labels)
    data = split(points, labels)

    # One (report label, SVC keyword arguments) pair per SVM run, in run order.
    # Labels are literals on purpose: str(0.00005) would render as '5e-05'.
    svm_runs = [
        ("SVM, C=0.0001, poly, deg=2", dict(C=0.0001, kernel='poly', degree=2, gamma='auto')),
        ("SVM, C=0.0001, linear", dict(C=0.0001, kernel='linear', gamma='auto')),
        ("SVM, C=0.00005, poly, deg=2", dict(C=0.00005, kernel='poly', degree=2, gamma='auto')),
        ("SVM, C=0.00005, linear", dict(C=0.00005, kernel='linear', gamma='auto')),
        ("SVM, C=0.00001, poly, deg=2", dict(C=0.00001, kernel='poly', degree=2, gamma='auto')),
        ("SVM, C=0.00001, linear", dict(C=0.00001, kernel='linear', gamma='auto')),
        ("SVM, C=1, rbf", dict(C=1, kernel='rbf', gamma='auto')),
        ("SVM, C=10, rbf", dict(C=10, kernel='rbf', gamma='auto')),
    ]

    accuracy_times = {}

    for label, params in svm_runs:
        accuracy, seconds = chech_time(SVC(**params), *data)
        accuracy_times = add_accuracy(accuracy, seconds, accuracy_times, label)
        print("trained for " + label)

    for depth in range(1, 15):
        forest = RandomForestClassifier(max_depth=depth, n_estimators=10)
        accuracy, seconds = chech_time(forest, *data)
        accuracy_times = add_accuracy(
            accuracy, seconds, accuracy_times, "RandomForest, depth=" + str(depth)
        )

    # Keys are whole-number percentages; print them as fractions.
    for key in sorted(accuracy_times.keys()):
        print("Accuracy = " + str(key * 1.0 / 100))
        for elem in accuracy_times[key]:
            print("    " + elem)
        

In [7]:
# Run the comparison on the cancer dataset; the label is read from column 0.
compare_SVM_RandomForest_time(
    dataset_name="../Datasets/cancer.csv",
    label_col_number=0,
)

trained for SVM, C=0.0001, poly, deg=2
trained for SVM, C=0.0001, linear
trained for SVM, C=0.00005, poly, deg=2
trained for SVM, C=0.00005, linear
trained for SVM, C=0.00001, poly, deg=2
trained for SVM, C=0.00001, linear
trained for SVM, C=1, rbf
trained for SVM, C=10, rbf
Accuracy = 0.58
    SVM, C=1, rbf: time = 0.0147621631622 seconds.
    SVM, C=10, rbf: time = 0.0140430927277 seconds.
Accuracy = 0.88
    SVM, C=0.00001, linear: time = 0.00279712677002 seconds.
Accuracy = 0.91
    SVM, C=0.0001, linear: time = 0.00375819206238 seconds.
    SVM, C=0.00005, linear: time = 0.00410294532776 seconds.
    RandomForest, depth=1: time = 0.0267360210419 seconds.
Accuracy = 0.94
    RandomForest, depth=2: time = 0.025869846344 seconds.
    RandomForest, depth=12: time = 0.0269920825958 seconds.
Accuracy = 0.95
    RandomForest, depth=5: time = 0.0295979976654 seconds.
Accuracy = 0.96
    SVM, C=0.00005, poly, deg=2: time = 0.224935054779 seconds.
    SVM, C=0.00001, poly, deg=2: time = 0.0

In [8]:
# Run the comparison on the spam dataset; the label is read from column 57.
compare_SVM_RandomForest_time(
    dataset_name="../Datasets/spam.csv",
    label_col_number=57,
)

trained for SVM, C=0.0001, poly, deg=2
trained for SVM, C=0.0001, linear
trained for SVM, C=0.00005, poly, deg=2
trained for SVM, C=0.00005, linear
trained for SVM, C=0.00001, poly, deg=2
trained for SVM, C=0.00001, linear
trained for SVM, C=1, rbf
trained for SVM, C=10, rbf
Accuracy = 0.73
    SVM, C=0.00001, linear: time = 0.92790389061 seconds.
Accuracy = 0.75
    SVM, C=0.0001, linear: time = 0.967816114426 seconds.
    SVM, C=0.00005, linear: time = 0.850450992584 seconds.
Accuracy = 0.78
    SVM, C=0.00001, poly, deg=2: time = 30.7133409977 seconds.
Accuracy = 0.83
    SVM, C=0.00005, poly, deg=2: time = 12.4004840851 seconds.
    SVM, C=1, rbf: time = 0.872599124908 seconds.
Accuracy = 0.84
    SVM, C=0.0001, poly, deg=2: time = 14.7765800953 seconds.
    SVM, C=10, rbf: time = 1.47242879868 seconds.
Accuracy = 0.85
    RandomForest, depth=1: time = 0.0362639427185 seconds.
Accuracy = 0.88
    RandomForest, depth=2: time = 0.0348241329193 seconds.
Accuracy = 0.9
    RandomForest