In [34]:
import pandas as pd
import numpy as np
import time
import statistics

In [13]:
# %store -r ionosphere_headers
# %store -r ionosphere_export

In [14]:
%run naive_bayes.ipynb

In [15]:
%run logistic_regression.ipynb

In [16]:
# data = adult_data_export
# data_headers = adult_data_headers
# # data.shape

In [17]:
def get_new_model(model,termination_rounds=10000,learning_rate=0.05,use_gradient=False,eps=1e-2):
    """Build a fresh classifier instance selected by name.

    Parameters
    ----------
    model : str
        "LogisticRegression" selects LogisticRegression; every other value
        (not only "NaiveBayes") falls back to NaiveBayes.
    termination_rounds, learning_rate, use_gradient, eps
        Forwarded positionally, in this order, to the LogisticRegression
        constructor. They are ignored when a NaiveBayes model is built.

    Returns
    -------
    A newly constructed LogisticRegression or NaiveBayes object.
    """
    if model != "LogisticRegression":
        # Fallback branch: NaiveBayes takes no constructor arguments.
        return NaiveBayes()

    hyperparameters = (termination_rounds, learning_rate, use_gradient, eps)
    return LogisticRegression(*hyperparameters)

In [18]:
def evaluate_acc(y,yh):
    """Return the fraction of predictions that equal the true labels.

    The comparison is a plain element-wise equality, so any label encoding
    works (0/1, -1/+1, multi-class, ...). For 0/1 labels the result equals
    (true positives + true negatives) / number of samples.

    Parameters
    ----------
    y : array-like
        True labels.
    yh : array-like
        Predicted labels, one per entry of ``y``.

    Returns
    -------
    float
        Accuracy in [0, 1], or NaN when there are no samples to compare.
    """
    y_true = np.asarray(y)
    y_pred = np.asarray(yh)

    # A column vector (n, 1) compared with a flat vector (n,) would broadcast
    # to an (n, n) matrix and give a meaningless score, so flatten both when
    # they hold the same number of labels in different layouts.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()

    matches = y_true == y_pred
    if np.size(matches) == 0:
        # Nothing to score; NaN mirrors the 0/0 outcome without a warning.
        return np.nan

    return np.mean(matches)

In [19]:
def get_folds(k, data):
    """Shuffle the rows of ``data`` and split them into at most ``k`` folds.

    The shuffle is applied to a copy, so the caller's array keeps its
    original row order. Randomness comes from NumPy's global random state
    (seed it with ``np.random.seed`` for reproducible folds).

    Parameters
    ----------
    k : int
        Requested number of folds.
    data : array-like
        Samples to split along the first axis.

    Returns
    -------
    list of numpy.ndarray
        The non-empty folds. Fewer than ``k`` are returned when ``data`` has
        fewer than ``k`` rows; fold sizes differ by at most one row.
    """
    # np.array copies by default; shuffling the copy leaves the input intact.
    shuffled = np.array(data)
    np.random.shuffle(shuffled)

    # array_split pads with empty pieces when k exceeds the row count;
    # those carry no samples and are dropped.
    return [fold for fold in np.array_split(shuffled, k) if fold.size > 0]

In [37]:
def _timing_row(label, durations):
    """Return one report row: [label, mean, max, min], times formatted to 4 decimals."""
    return [
        label,
        "{:.4f}".format(statistics.mean(durations)),
        "{:.4f}".format(max(durations)),
        "{:.4f}".format(min(durations)),
    ]


def k_fold_cross_validation(k, data, model_name="LogisticRegression", data_headers=[], model_args=[]):
    """Run k-fold cross-validation and return the mean fold accuracy.

    ``data`` is split with ``get_folds``; each fold is used once as the test
    set while a fresh model from ``get_new_model`` is fitted on the remaining
    folds. The last column of ``data`` is the label, all other columns are
    features. A table with the average / maximum / minimum fit, predict and
    total time per fold is printed before returning.

    Parameters
    ----------
    k : int
        Requested number of folds.
    data : numpy.ndarray
        2-D array of samples; last column holds the labels.
    model_name : str
        Passed to ``get_new_model``. When it equals "NaiveBayes",
        ``model.set_headers(data_headers)`` is called before fitting.
    data_headers : list
        Headers handed to the NaiveBayes model (not modified here).
    model_args : list
        Extra positional arguments forwarded to ``get_new_model``
        (not modified here).

    Returns
    -------
    float
        Average accuracy over all evaluated folds.

    Raises
    ------
    ValueError
        If fewer than two non-empty folds are available, because no training
        set could be formed.
    """
    folds = get_folds(k,data)

    # get_folds drops empty folds, so there may be fewer than k of them.
    if len(folds) < 2:
        raise ValueError(
            "k-fold cross-validation needs at least 2 non-empty folds, got {}".format(len(folds))
        )

    fold_accuracies = []
    all_fit_times = []
    all_predict_times = []
    all_total_times = []

    for i, test in enumerate(folds):
        model = get_new_model(model_name,*model_args)

        # Training set = every fold except the held-out one. Concatenating a
        # plain list works even when the folds have unequal row counts.
        train = np.concatenate(
            [fold for j, fold in enumerate(folds) if j != i], axis=0
        )

        train_X = train[:, :-1]
        train_Y = train[:, -1]

        # perf_counter is monotonic and high-resolution, which suits
        # measuring short intervals better than wall-clock time.
        start_total = time.perf_counter()

        if model_name == "NaiveBayes":
            model.set_headers(data_headers)

        start_fit = time.perf_counter()
        model.fit(train_X, train_Y)
        all_fit_times.append(time.perf_counter() - start_fit)

        test_X = test[:, :-1]
        test_Y = test[:, -1]

        start_predict = time.perf_counter()
        model_results = model.predict(test_X)
        finished = time.perf_counter()

        all_predict_times.append(finished - start_predict)
        all_total_times.append(finished - start_total)

        # evaluate_acc expects (true labels, predictions).
        fold_accuracies.append(evaluate_acc(test_Y, np.asarray(model_results)))

    to_print = [
        _timing_row("Fit", all_fit_times),
        _timing_row("Predict", all_predict_times),
        _timing_row("Total", all_total_times),
    ]
    df = pd.DataFrame(to_print, columns=["Type", "Average Time (s)","Maximum Time (s)","Minimum Time (s)"])
    print(df)
    return np.average(fold_accuracies)

In [21]:
# k_fold_cross_validation(10,data, model_name="NaiveBayes", data_headers=data_headers)

In [22]:
# # np.random.shuffle(adult_data_export)

# train_X = ionosphere_export[:300, :-1]
# train_y = ionosphere_export[:300, -1]

# test_X = ionosphere_export[300:, :-1]
# test_y = ionosphere_export[300:, -1]

# lr = LogisticRegression(100000)
# lr.fit(train_X, train_y)

# y_pred = lr.predict(test_X)
# lr1_acc = evaluate_acc(y_pred, test_y)


In [23]:
# lr2 = linear_model.LogisticRegression()
# lr2.fit(train_X, train_y)

# print(lr2.coef_) # returns a matrix of weights (coefficients)

# y_pred = lr2.predict(test_X)
# lr2_acc = evaluate_acc(y_pred, test_y)

# print("lr1: ", lr1_acc)
# print("lr2: ", lr2_acc)
