In [1]:
import csv
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
def read_points(dataset_name, label_col_number):
    """Load a comma-delimited dataset and separate labels from features.

    The first record of the file is treated as a header and skipped.
    Every following non-empty row yields one feature vector (all columns
    except ``label_col_number``, converted with ``float``) and one label
    (the raw string stored in column ``label_col_number``). Blank lines
    are ignored instead of aborting the load.

    Parameters
    ----------
    dataset_name : str
        Path of the CSV file to read.
    label_col_number : int
        Zero-based index of the column that holds the label.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Feature rows and labels, both in file order.

    Raises
    ------
    ValueError
        If a feature cell cannot be converted to ``float``.
    IndexError
        If a non-empty row has no column ``label_col_number``.
    """
    points = []
    labels = []
    with open(dataset_name, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',')
        # Drop the header record; the default keeps an empty file from raising.
        next(csv_reader, None)
        for row in csv_reader:
            # csv.reader yields [] for a blank line; indexing the label
            # column of such a row would raise IndexError.
            if not row:
                continue
            points.append(
                [float(cell) for i, cell in enumerate(row) if i != label_col_number]
            )
            labels.append(row[label_col_number])
    return np.array(points), np.array(labels)

In [3]:
def split(points, labels, seed=None):
    """Randomly hold out one fifth of the samples for validation.

    ``len(points) // 5`` distinct indexes are drawn at random for the
    validation part; all remaining indexes form the training part.

    Parameters
    ----------
    points : sequence
        Samples, indexable by integer position.
    labels : sequence
        Labels, indexed with the same positions as ``points``.
    seed : optional
        When given, the draw uses a private ``random.Random(seed)`` so the
        same split is produced on every call. When ``None`` (the default)
        the module-level ``random`` generator is used, as before.

    Returns
    -------
    tuple
        ``(train_points, train_labels, validate_points, validate_labels)``,
        each a list. Training items are in ascending index order;
        validation items are in the order they were drawn.
    """
    total = len(points)
    rng = random if seed is None else random.Random(seed)
    validate_indexes = rng.sample(range(total), total // 5)
    held_out = set(validate_indexes)
    # Build the training order explicitly instead of relying on the
    # iteration order of a set difference.
    train_indexes = [i for i in range(total) if i not in held_out]
    return (
        [points[i] for i in train_indexes],
        [labels[i] for i in train_indexes],
        [points[i] for i in validate_indexes],
        [labels[i] for i in validate_indexes],
    )

In [4]:
def foo(clf, train_points, train_labels):
    """Fit ``clf`` on the given training samples.

    Forwards both arguments to ``clf.fit`` and discards whatever that call
    returns, so this function always returns ``None``. It is not called by
    any of the notebook cells shown here.
    """
    fit = clf.fit
    fit(train_points, train_labels)

In [5]:
import multiprocessing

from time import time
from time import sleep
def chech_time(clf, train_points, train_labels, validate_points, validate_labels, timeout=10):
    """Measure how long ``clf`` takes to train and how well it validates.

    Training is first attempted in a separate process that is given at most
    ``timeout`` seconds. If that process is still running after the limit,
    it is terminated and the classifier is reported as too slow. Otherwise
    the classifier is fitted again in the current process; that second fit
    is the one that is timed and whose result is scored.

    Parameters
    ----------
    clf
        Object exposing ``fit(X, y)`` and ``score(X, y)``.
    train_points, train_labels
        Data passed to ``clf.fit``.
    validate_points, validate_labels
        Data passed to ``clf.score``.
    timeout : number, optional
        Seconds granted to the trial fit. Defaults to 10, the limit that was
        previously hard-coded. ``add_accuracy`` still prints "10 seconds"
        in its message regardless of the value used here.

    Returns
    -------
    tuple
        ``(accuracy, seconds)`` where ``accuracy`` is the validation score
        in percent rounded to a whole number and ``seconds`` is the duration
        of the timed fit. ``(0, 0)`` is returned when the trial fit exceeds
        ``timeout``; a genuine 0 % accuracy cannot be told apart from that.
    """
    # Trial run in a child process so that an overly slow fit can be killed.
    probe = multiprocessing.Process(target=clf.fit, args=(train_points, train_labels))
    probe.start()
    probe.join(timeout)
    if probe.is_alive():
        probe.terminate()
        probe.join()  # wait for the terminated child to exit
        return 0, 0
    # The trial finished in time: fit again here and time this run.
    # Note that this second fit is not bounded by ``timeout``.
    start = time()
    clf.fit(train_points, train_labels)
    end = time()
    accuracy = round(clf.score(validate_points, validate_labels) * 100)
    return accuracy, end - start

In [6]:
def add_accuracy(accuracy, time, accuracy_times, text):
    """Record one measurement in ``accuracy_times`` and return that dict.

    ``text`` is extended with a timing note and appended to the list stored
    under the key ``accuracy`` (the list is created on first use). An
    ``accuracy`` equal to 0 is reported as a run that took more than
    10 seconds; any other value gets the measured ``time`` in seconds.
    The dictionary is modified in place.
    """
    timed_out = accuracy == 0
    if timed_out:
        suffix = ": time is more than 10 seconds."
    else:
        suffix = ": time = " + str(time) + " seconds."
    accuracy_times.setdefault(accuracy, []).append(text + suffix)
    return accuracy_times

In [7]:
def compare_SVM_RandomForest_time(dataset_name, label_col_number):
    """Compare training time and accuracy of SVM and random-forest models.

    The dataset is loaded with ``read_points``, divided with ``split`` and
    every candidate classifier is measured with ``chech_time``. Results are
    grouped by accuracy with ``add_accuracy`` and printed in ascending
    order of accuracy. Runs reported with accuracy 0 (timed out) come
    first and are printed without an "Accuracy" heading.

    Candidates, in evaluation order:

    * ``SVC`` with a degree-2 polynomial kernel and with a linear kernel,
      for C in 1, 5, 10 (poly before linear for each C);
    * ``SVC`` with an RBF kernel for C in 1, 5, 10, 1000;
    * ``RandomForestClassifier`` with 10 estimators and ``max_depth``
      from 1 to 14.

    Each printed label is generated from the parameters actually used, so
    the RBF run with ``C=1000`` is reported as "C=1000" (it was previously
    mislabelled "C=0.1").

    Parameters
    ----------
    dataset_name : str
        Path of the CSV file, forwarded to ``read_points``.
    label_col_number : int
        Index of the label column, forwarded to ``read_points``.
    """
    points, labels = read_points(dataset_name, label_col_number)
    train_points, train_labels, validate_points, validate_labels = split(points, labels)
    accuracy_times = {}

    # (C, kernel) pairs, ordered exactly as the runs are reported.
    svm_settings = [(c, kernel) for c in (1, 5, 10) for kernel in ('poly', 'linear')]
    svm_settings += [(c, 'rbf') for c in (1, 5, 10, 1000)]

    candidates = []
    for c, kernel in svm_settings:
        if kernel == 'poly':
            # Only the polynomial kernel is given an explicit degree.
            description = "SVM, C=%d, poly, deg=2" % c
            clf = SVC(C=c, kernel='poly', degree=2, gamma='auto')
        else:
            description = "SVM, C=%d, %s" % (c, kernel)
            clf = SVC(C=c, kernel=kernel, gamma='auto')
        candidates.append((description, clf))
    for depth in range(1, 15):
        candidates.append((
            "RandomForest, depth=" + str(depth),
            RandomForestClassifier(max_depth=depth, n_estimators=10),
        ))

    for description, clf in candidates:
        accuracy, elapsed = chech_time(clf, train_points, train_labels, validate_points, validate_labels)
        accuracy_times = add_accuracy(accuracy, elapsed, accuracy_times, description)

    for key in sorted(accuracy_times.keys()):
        if key != 0:
            print("Accuracy = " + str(key * 1.0 / 100))
        for elem in accuracy_times[key]:
            print("    " + elem)
        

In [8]:
# Run the comparison on the cancer dataset; column 0 is passed as the label column.
compare_SVM_RandomForest_time("../Datasets/cancer.csv", 0)

    SVM, C=1, poly, deg=2: time is more than 10 seconds.
    SVM, C=5, poly, deg=2: time is more than 10 seconds.
    SVM, C=10, poly, deg=2: time is more than 10 seconds.
    SVM, C=10, linear: time is more than 10 seconds.
Accuracy = 0.56
    SVM, C=1, rbf: time = 0.0167980194092 seconds.
    SVM, C=5, rbf: time = 0.0227749347687 seconds.
    SVM, C=10, rbf: time = 0.0248939990997 seconds.
    SVM, C=0.1, rbf: time = 0.0266900062561 seconds.
Accuracy = 0.89
    RandomForest, depth=1: time = 0.0495510101318 seconds.
    RandomForest, depth=2: time = 0.0311138629913 seconds.
Accuracy = 0.92
    RandomForest, depth=3: time = 0.0355520248413 seconds.
Accuracy = 0.93
    SVM, C=1, linear: time = 2.42918300629 seconds.
    RandomForest, depth=9: time = 0.0577571392059 seconds.
    RandomForest, depth=11: time = 0.0444002151489 seconds.
    RandomForest, depth=14: time = 0.0352039337158 seconds.
Accuracy = 0.94
    SVM, C=5, linear: time = 7.07123208046 seconds.
    RandomForest, depth=5: t

In [9]:
# Run the comparison on the spam dataset; column 57 is passed as the label column.
compare_SVM_RandomForest_time("../Datasets/spam.csv", 57)

    SVM, C=1, poly, deg=2: time is more than 10 seconds.
    SVM, C=1, linear: time is more than 10 seconds.
    SVM, C=5, poly, deg=2: time is more than 10 seconds.
    SVM, C=5, linear: time is more than 10 seconds.
    SVM, C=10, poly, deg=2: time is more than 10 seconds.
    SVM, C=10, linear: time is more than 10 seconds.
Accuracy = 0.83
    RandomForest, depth=1: time = 0.050528049469 seconds.
Accuracy = 0.84
    SVM, C=1, rbf: time = 1.09848499298 seconds.
Accuracy = 0.85
    SVM, C=10, rbf: time = 1.42740702629 seconds.
    SVM, C=0.1, rbf: time = 1.49294018745 seconds.
Accuracy = 0.86
    SVM, C=5, rbf: time = 1.4339799881 seconds.
Accuracy = 0.89
    RandomForest, depth=2: time = 0.0498950481415 seconds.
Accuracy = 0.9
    RandomForest, depth=3: time = 0.0535809993744 seconds.
Accuracy = 0.92
    RandomForest, depth=4: time = 0.060446023941 seconds.
Accuracy = 0.93
    RandomForest, depth=5: time = 0.0767848491669 seconds.
    RandomForest, depth=6: time = 0.0896019935608 sec