In [3]:
import pandas as pd
import stumpy
import numpy as np
import datetime as dt
import random
import math
import pickle
import sys

from statistics import mean
from tqdm.auto import tqdm
from multiprocessing import Pool

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
from sklearn.model_selection import StratifiedKFold
def cross_validate(clf,X,y, topk=[1,3,5],n_splits=5):
    '''
    Perform stratified k-fold cross-validation of an sklearn-style classifier
    on training data samples and report top-k accuracy for every fold.

    Input:
        clf: classifier object exposing fit(X, y) and predict_proba(X). It is
             refit in place on every fold. When the fitted classifier exposes
             `classes_`, that attribute is used to map predict_proba columns
             back to labels; otherwise the column index itself is taken as
             the label.
        X: x values; must support indexing with an integer index array along
           the first axis (X[train_index]).
        y: y values (class labels), one per row of X.
        topk: k values for evaluation metrics; every k must be >= 1. A k
              larger than the number of probability columns simply considers
              every column.
        n_splits: Number of folds.
    Output:
        list of scores, one per fold, where each score is a list of
        len(topk) accuracies (fraction of validation samples whose true
        label is among the k most probable classes).
    Raises:
        ValueError: if any k in topk is smaller than 1.
    '''
    topk = list(topk)
    if any(k < 1 for k in topk):
        raise ValueError("every k in topk must be >= 1, got %r" % (topk,))

    # Array form gives positional indexing regardless of the container type
    # the labels arrived in.
    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits)

    cv_score_list = []
    for train_index, validate_index in skf.split(X, y):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]

        clf.fit(X_train, y_train)
        y_prob = np.asarray(clf.predict_proba(X_validate))

        # Column j of y_prob belongs to classes[j], which is not necessarily
        # the label j (labels may be non-contiguous, non-numeric, or a class
        # may be missing from this training fold).
        classes = getattr(clf, "classes_", None)
        if classes is None:
            classes = np.arange(y_prob.shape[1])
        classes = np.asarray(classes)

        # Labels of every validation row ordered from most to least probable.
        # The stable sort makes tie-breaking deterministic (lower column first).
        ranked_labels = classes[np.argsort(-y_prob, axis=1, kind="stable")]
        hits = ranked_labels == y_validate[:, None]

        # A sample is correct for k when its true label appears in the first
        # k ranked columns; slicing tolerates k > number of columns.
        cv_scores = [int(hits[:, :k].any(axis=1).sum()) / len(y_prob) for k in topk]
        cv_score_list.append(cv_scores)

    return cv_score_list

In [5]:

def classifier_performance(clf, X, y, topk=[1,3,5],bCrossValidate=True, n_splits=5, test_size=0.1, random_state=None):
    '''
    Evaluate performance of an sklearn-style classifier on data samples using
    a train/test split (90/10 by default), optionally preceded by
    cross-validation on the training part.

    Input:
        clf: classifier object exposing fit(X, y) and predict_proba(X). It is
             refit in place. When the fitted classifier exposes `classes_`,
             that attribute is used to map predict_proba columns back to
             labels; otherwise the column index itself is taken as the label.
        X: x values
        y: y values
        topk: k values for evaluation metrics; every k must be >= 1. A k
              larger than the number of probability columns simply considers
              every column.
        bCrossValidate: A boolean variable defining if cross-validation is required
        n_splits: Number of cross-validation folds
        test_size: forwarded to train_test_split (default 0.1)
        random_state: forwarded to train_test_split; the default None keeps
                      the split unseeded
    Output:
        list scores, one for each fold, where each score is of length topk
        with accuracy for validate data; -1 if bCrossValidate is false
        list of length topk with accuracy for testing data
    Raises:
        ValueError: if any k in topk is smaller than 1.
    '''
    topk = list(topk)
    if any(k < 1 for k in topk):
        raise ValueError("every k in topk must be >= 1, got %r" % (topk,))

    cv_score_list = -1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    if bCrossValidate:
        cv_score_list = cross_validate(clf, X_train, y_train, topk, n_splits)

    # Final model: trained on the whole training part, scored on the held-out part.
    clf.fit(X_train, y_train)
    y_prob = np.asarray(clf.predict_proba(X_test))

    # Positional array of the true labels; indexing y_test[i] directly would
    # be label-based (and wrong after shuffling) for a pandas Series.
    y_true = np.asarray(y_test)

    # Column j of y_prob belongs to classes[j], which is not necessarily the
    # label j.
    classes = getattr(clf, "classes_", None)
    if classes is None:
        classes = np.arange(y_prob.shape[1])
    classes = np.asarray(classes)

    # Labels of every test row ordered from most to least probable; the
    # stable sort makes tie-breaking deterministic (lower column first).
    ranked_labels = classes[np.argsort(-y_prob, axis=1, kind="stable")]
    hits = ranked_labels == y_true[:, None]

    scores = [int(hits[:, :k].any(axis=1).sum()) / len(y_prob) for k in topk]

    return cv_score_list, scores

In [6]:
import os

# Each entry is [num, size, samples]; the three values select the pickled
# X / y files produced elsewhere (see the file-name pattern below).
parameter_list = [[3,1,400000]]

print(parameter_list)

# Evaluation settings shared by every parameter set.
bCrossValidate = True
# Number of leading rows to evaluate on. NOTE(review): this is one fewer than
# the samples=400000 named in the file name - confirm that is intended.
Num_Instance = 399999
n_splits = 5

for parameters in parameter_list:

    # X and y are stored under the same file name in sibling directories.
    data_name = 'num=' + str(parameters[0]) + 'size=' + str(parameters[1]) + 'samples=' + str(parameters[2])

    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open('../results/data/X/' + data_name, 'rb') as f:
        X = pickle.load(f)

    with open('../results/data/y/' + data_name, 'rb') as f:
        y = pickle.load(f)

    # Neither the classifier nor the split is seeded here, so the printed
    # scores are expected to differ from run to run.
    clf = RandomForestClassifier()
    # n_splits is forwarded explicitly so that changing it above takes effect.
    cv_score_list, scores = classifier_performance(
        clf, X[:Num_Instance,:], y[:Num_Instance],
        bCrossValidate=bCrossValidate, n_splits=n_splits)

    # To run on entire dataset, replace the above call with the following
    #cv_score_list, scores = classifier_performance(clf, X, y,bCrossValidate=bCrossValidate,n_splits=n_splits)

    print("",cv_score_list,"\n",scores)
    print("===========================================")

    # Optional: persist the test scores next to the data.
    #outfile_name = "../results/scores/" + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1]) + 'samples=' + str(parameters[2])

    #with open(outfile_name, 'wb') as f:
    #    pickle.dump(scores, f)

[[3, 1, 400000]]
 [[0.8765, 0.9483611111111111, 0.9673055555555555], [0.8795, 0.9491944444444445, 0.9681388888888889], [0.8776805555555556, 0.9479027777777778, 0.9668194444444445], [0.8781666666666667, 0.9491944444444445, 0.9683472222222222], [0.8761232794899929, 0.9494020750288198, 0.9683467825942027]] 
 [0.896075, 0.957625, 0.97385]
