References:
https://machinelearningmastery.com/how-to-configure-k-fold-cross-validation/

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold

https://machinelearningmastery.com/cross-validation-for-imbalanced-classification/

https://www.kaggle.com/code/muhammetvarl/mlp-multiclass-classification-roc-auc/notebook

In [17]:
import pandas as pd
from smartlawdata import getSentenceTypeDataSet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Load the sentence-type dataset; the code below reads its 'text' and
# 'argumentSentenceType' columns.
df_final1 = getSentenceTypeDataSet()
#print(df_final1)

# Raw sentences as a plain Python list, in dataset row order.
los = df_final1['text'].tolist()

# Create a TF-IDF vectorizer to turn each sentence into a numeric vector that
# can be given as input to a machine learning model.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(los)
feature_names = vectorizer.get_feature_names_out()  # vocabulary terms, used as column names

# Densify once and build the frame straight from the array; there is no need
# to round-trip the matrix through nested Python lists.
dense = vectors.toarray()
df_end = pd.DataFrame(dense, columns=feature_names)

# Attach labels by POSITION. df_end has a fresh 0..n-1 index, so the label
# index is reset first; a plain Series assignment aligns on index labels and
# would yield NaN / mismatched labels if df_final1 carries any other index.
df_end['argumentSentenceType'] = df_final1['argumentSentenceType'].reset_index(drop=True)

yoriginal = df_end.argumentSentenceType
Xoriginal = df_end[feature_names]


In [26]:
from numpy import mean
from numpy import isnan
from numpy import asarray
from numpy import polyfit
from scipy.stats import pearsonr
from matplotlib import pyplot
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# create the dataset
def get_dataset(X, y, n_samples=100, random_state=None):
    """Draw a random subsample of feature rows together with their labels.

    The same row positions are taken from ``X`` and ``y`` so that every
    sampled feature row keeps its own label. Sampling the two objects
    independently would pair features with unrelated labels.

    Args:
        X: pandas DataFrame of features, one row per sample.
        y: pandas Series of labels, positionally aligned with ``X``.
        n_samples: number of rows to draw, without replacement.
        random_state: optional seed for a reproducible draw; ``None`` gives a
            different sample on every call.

    Returns:
        Tuple ``(X_sub, y_sub)`` with the sampled rows and matching labels.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length, or ``n_samples`` is
            larger than the number of available rows.
    """
    from numpy.random import default_rng

    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same number of rows (got %d and %d)" % (len(X), len(y))
        )
    # One shared set of row positions keeps features and labels paired, and
    # positional indexing stays correct even when the index has duplicates.
    positions = default_rng(random_state).choice(len(X), size=n_samples, replace=False)
    return X.iloc[positions], y.iloc[positions]

# get a list of models to evaluate
def get_models():
    """Return the list of classifier instances to evaluate.

    Only ``LogisticRegression`` (default hyper-parameters) is active. The
    other estimators imported by this notebook are listed below as disabled
    candidates; add an instance to the returned list to include it.
    """
    # Disabled candidates:
    #   RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier,
    #   KNeighborsClassifier, DecisionTreeClassifier, LinearSVC, SVC,
    #   GaussianNB, AdaBoostClassifier, BaggingClassifier,
    #   RandomForestClassifier, ExtraTreesClassifier,
    #   GaussianProcessClassifier, GradientBoostingClassifier,
    #   LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
    return [LogisticRegression()]

def evaluate_model_LOOCV(X, y, model):
    """Score ``model`` on ``(X, y)`` with leave-one-out cross-validation.

    Prints a one-line summary and returns ``(mean, min, max)`` of the
    per-split accuracy scores, each reduced to four decimal places.
    """
    # n_jobs=-1 lets scikit-learn run the splits in parallel.
    split_scores = cross_val_score(
        model, X, y, scoring='accuracy', cv=LeaveOneOut(), n_jobs=-1
    )
    summary = (mean(split_scores), split_scores.min(), split_scores.max())
    print('LOOCV accuracy=%.3f (%.3f,%.3f)' % summary)
    # Format-then-parse keeps exactly four decimals in the returned floats.
    return tuple(float("{:.4f}".format(value)) for value in summary)

# evaluate the model using a given test condition
def evaluate_model_CV(X, y, num_folds, model):
    """Score ``model`` on ``(X, y)`` with shuffled stratified k-fold CV.

    ``num_folds`` is the number of splits; the splitter is seeded with
    ``random_state=1``. Returns ``(mean, min, max)`` of the per-fold accuracy
    scores, each reduced to four decimal places.
    """
    splitter = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1)
    # n_jobs=-1 lets scikit-learn run the folds in parallel.
    fold_scores = cross_val_score(
        model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1
    )
    summary = (mean(fold_scores), fold_scores.min(), fold_scores.max())
    # Format-then-parse keeps exactly four decimals in the returned floats.
    return tuple(float("{:.4f}".format(value)) for value in summary)

def getBestModelCV(Xoriginal, yoriginal, n_samples_list=None, num_folds=range(2, 11)):
    """Compare LOOCV accuracy with k-fold CV accuracy for every candidate model.

    For each requested sample size a random subsample is drawn with
    ``get_dataset``. Every model from ``get_models`` is then scored once with
    leave-one-out CV and once per fold count in ``num_folds`` with stratified
    k-fold CV. Progress is printed as the evaluation runs.

    Args:
        Xoriginal: full feature table (pandas DataFrame).
        yoriginal: labels for ``Xoriginal`` (pandas Series).
        n_samples_list: iterable of subsample sizes, one outer iteration per
            entry. Defaults to ``[50]``. Callers can pass fractions of the
            data, e.g. ``[int(len(Xoriginal) * f) for f in (0.1, 0.2)]``.
        num_folds: iterable of k values for k-fold CV. Defaults to 2..10.

    Returns:
        A list with one dict per sample size, holding the keys
        ``iterationNumber`` (1-based), ``noOfSamples``, ``meanLOOCVAllModels``,
        ``mean10FoldCVAllModels`` and ``mlModelResultList``. Each entry of
        ``mlModelResultList`` holds ``modelName``, ``meanLOOCV``,
        ``mean10FoldCV`` and ``foldWiseResult`` (one dict per k). Despite the
        key names, the "10Fold" values are the mean over ALL fold counts in
        ``num_folds``. Means are NaN when no model produced valid scores.
    """
    def _four_decimals(value):
        # Same four-decimal reduction used by the evaluate_model_* helpers.
        return float("{:.4f}".format(value))

    def _mean_or_nan(values):
        # numpy's mean of an empty list is NaN plus a RuntimeWarning; return
        # the NaN directly so an all-skipped run stays quiet.
        return mean(values) if len(values) else float("nan")

    if n_samples_list is None:
        n_samples_list = [50]

    # get the list of models to consider
    models = get_models()
    out = []

    # enumerate supplies the 1-based iteration number (no manual counter, and
    # no shadowing of the builtin `iter`).
    for iteration, n_samples in enumerate(n_samples_list, start=1):
        all_model_out = []
        ideal_results_all_models, cv_results_all_models = [], []

        # get the dataset
        X, y = get_dataset(Xoriginal, yoriginal, n_samples)
        print("(X,y) shape", X.shape, y.shape)

        for model in models:
            model_name = type(model).__name__
            print("Evaluating Model:", model_name)

            # Reference ("ideal") estimate from leave-one-out CV.
            ideal_mean, _, _ = evaluate_model_LOOCV(X, y, model)

            # k-fold estimates, one per requested fold count.
            fold_means = []
            fold_records = []
            for k in num_folds:
                cv_mean, cv_min, cv_max = evaluate_model_CV(X, y, k, model)
                fold_means.append(cv_mean)
                print('> fold=%d, accuracy=%.3f (%.3f,%.3f)' % (k, cv_mean, cv_min, cv_max))
                fold_records.append({'fold': k, 'meanAccuracy': cv_mean,
                                     'minAccuracy': cv_min, 'maxAccuracy': cv_max})

            cv_overall = _mean_or_nan(fold_means)
            # Skip models whose scores are invalid so they do not poison the
            # cross-model averages.
            if isnan(cv_overall) or isnan(ideal_mean):
                continue

            ideal_results_all_models.append(ideal_mean)
            cv_results_all_models.append(cv_overall)
            all_model_out.append({'modelName': model_name,
                                  'meanLOOCV': ideal_mean,
                                  'mean10FoldCV': _four_decimals(cv_overall),
                                  "foldWiseResult": fold_records})

        out.append({'iterationNumber': iteration,
                    'noOfSamples': n_samples,
                    'meanLOOCVAllModels': _four_decimals(_mean_or_nan(ideal_results_all_models)),
                    'mean10FoldCVAllModels': _four_decimals(_mean_or_nan(cv_results_all_models)),
                    'mlModelResultList': all_model_out})

    return out

In [27]:
# Use the feature table and labels built by the data-preparation cell
# (Xoriginal / yoriginal) so this cell also runs on a freshly restarted kernel.
out = getBestModelCV(Xoriginal, yoriginal)
#print(out)

(X,y) shape (50, 4891) (50,)
Evaluating Model: LogisticRegression
LOOCV accuracy=0.800 (0.000,1.000)




> fold=2, accuracy=0.800 (0.800,0.800)




> fold=3, accuracy=0.800 (0.765,0.824)




> fold=4, accuracy=0.801 (0.769,0.833)




> fold=5, accuracy=0.800 (0.800,0.800)




> fold=6, accuracy=0.801 (0.750,0.875)




> fold=7, accuracy=0.801 (0.714,0.857)




> fold=8, accuracy=0.804 (0.714,0.833)




> fold=9, accuracy=0.800 (0.667,0.833)




> fold=10, accuracy=0.800 (0.800,0.800)


AttributeError: 'LogisticRegression' object has no attribute 'best_params_'