In [None]:
import os
import warnings
from glob import glob

import pandas as pd
import regex as re
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

In [None]:
# Glob patterns for the two transcript groups under ../train/transcription.
# NOTE(review): _parse_data builds its own patterns from its data_dir argument
# ('../data/train' when called through parse_train_data), so these module-level
# values are not read by the loading code visible here, and they point at a
# different root directory. Confirm whether they are still needed.
prob_ad_dir = '../train/transcription/cd/*'
controls_dir = '../train/transcription/cc/*'

In [None]:
def _clean_utterance(s):
    """Strip transcript markup from one utterance and return plain text."""
    s = re.sub(r'\x15\d*_\d*\x15', '', s)  # remove time block (\x15<start>_<end>\x15)
    # Non-greedy: drop each [...] annotation on its own. A greedy '.*' would
    # also delete every spoken word between the first '[' and the last ']'.
    s = re.sub(r'\[.*?\]', '', s)
    # Continuation lines are joined with a space so the last word of one line
    # is not fused with the first word of the next.
    s = re.sub(r'[\t\n]+', ' ', s)
    s = re.sub(r'<|>', '', s)  # remove angle brackets, keep the words inside
    return s.strip()


def extract_data(file_name):
    """Parse one ``.cha`` transcript into raw and cleaned utterance lists.

    An utterance starts on a line beginning with ``*PAR:`` or ``*INV`` and
    absorbs the following lines until a line starting with ``%``, ``*`` or
    ``@`` is met, a new speaker line starts, or the file ends.

    Args:
        file_name: Path to the transcript file.

    Returns:
        dict with the keys
        ``id`` (file base name up to ``.cha``),
        ``speech`` (raw utterances, speaker prefix included),
        ``clean_speech`` (cleaned utterances of both speakers),
        ``clean_par_speech`` (cleaned ``*PAR:`` utterances only),
        ``joined_all_speech`` / ``joined_all_par_speech`` (the two cleaned
        lists joined with single spaces).
        An empty file yields empty lists and empty strings.
    """
    # basename handles whatever separator glob returns on the current platform.
    par = {'id': os.path.basename(file_name).split('.cha')[0]}

    # Pass 1: gather raw utterances, including their continuation lines.
    speech = []
    curr_speech = ''
    with open(file_name) as f:  # context manager closes the handle
        for line in f:
            if line.startswith('*PAR:') or line.startswith('*INV'):
                # A speaker line directly after another utterance: keep the
                # previous utterance instead of overwriting it.
                if curr_speech:
                    speech.append(curr_speech)
                curr_speech = line
            elif curr_speech and not line.startswith(('%', '*', '@')):
                curr_speech += line
            elif curr_speech:
                # Dependent tier ('%'), other speaker ('*') or header ('@',
                # e.g. the closing '@End') ends the open utterance.
                speech.append(curr_speech)
                curr_speech = ''
    # The file may end while an utterance is still open.
    if curr_speech:
        speech.append(curr_speech)

    # Pass 2: strip speaker prefixes and markup.
    clean_par_speech = []
    clean_all_speech = []
    is_par = False
    for s in speech:
        if s.startswith('*PAR:'):
            is_par = True
        elif s.startswith('*INV:'):
            is_par = False
            s = re.sub(r'\*INV:\t', '', s)  # remove prefix
        if is_par:
            s = re.sub(r'\*PAR:\t', '', s)  # remove prefix
        cleaned = _clean_utterance(s)
        if is_par:
            clean_par_speech.append(cleaned)
        clean_all_speech.append(cleaned)

    par['speech'] = speech
    par['clean_speech'] = clean_all_speech
    par['clean_par_speech'] = clean_par_speech
    par['joined_all_speech'] = ' '.join(clean_all_speech)
    par['joined_all_par_speech'] = ' '.join(clean_par_speech)

    return par

In [None]:
def parse_train_data():
    """Load the transcripts stored under '../data/train' as one labelled DataFrame."""
    train_root = '../data/train'
    return _parse_data(train_root)

def _parse_data(data_dir):
    prob_ad_dir = f'{data_dir}/transcription/cd/*'
    controls_dir = f'{data_dir}/transcription/cc/*'
    
    prob_ad = [extract_data(fn) for fn in glob(prob_ad_dir)]
    controls = [extract_data(fn) for fn in glob(controls_dir)]
    controls_df = pd.DataFrame(controls)
    prob_ad_df = pd.DataFrame(prob_ad)
    controls_df['ad'] = 0
    prob_ad_df['ad'] = 1
    df = pd.concat([controls_df, prob_ad_df]).sample(frac=1).reset_index(drop=True)
    return df

In [None]:
# Labelled training table: one row per transcript file, rows in shuffled order.
train_df = parse_train_data()

In [None]:
# Module-level seed; randomforst_models reads it by name for its train/test split.
random_state = 115

In [None]:
def randomforst_models(text: pd.Series, labels: list, shuffle=True):
    """Grid-search a TF-IDF + random-forest pipeline and return the best fit.

    The data is split 80/20 into train and test parts; the grid search
    (10-fold CV, 6 parallel jobs) only ever sees the training part.
    Both the split and the forest are seeded from the module-level
    ``random_state`` so repeated runs on the same rows give the same model.

    Args:
        text: One document (string) per sample.
        labels: Class label per sample, aligned with ``text``.
        shuffle: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(pipeline, test_features, test_labels, train_features,
        train_labels)`` where ``pipeline`` is the best parameter combination
        fitted on the whole training part.
    """
    ## AD Classification Pred

    # Search space: 'vec__*' keys configure the TfidfVectorizer step,
    # 'clf__*' keys configure the RandomForestClassifier step.
    param_space = {
        'vec__max_features': [100, 500, 1000, 2000, 10000],
        'vec__stop_words': ['english', None],
        'vec__analyzer': ['word', 'char'],
        'vec__max_df': [0.5, 0.75, 1.0],
        'vec__sublinear_tf': [True, False],
        'clf__n_estimators': [10],
        'clf__max_depth': [5, 10, 15],
        'clf__min_samples_split': [2, 5],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__bootstrap': [True, False],
    }

    clf_pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        # Seeded: an unseeded forest makes both the CV ranking and the final
        # model change from run to run even though the split is fixed.
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])
    train_features, test_features, train_labels, test_labels = train_test_split(
        text, labels, random_state=random_state, test_size=0.2, shuffle=shuffle)

    search = GridSearchCV(clf_pipe, param_space, cv=10, n_jobs=6)
    search.fit(train_features, train_labels)
    print(search.best_params_)

    # GridSearchCV refits the winning configuration on the full training part
    # by default, so a second manual set_params + fit is not needed.
    best_pipe = search.best_estimator_

    return best_pipe, test_features, test_labels, train_features, train_labels

In [None]:
# Tune and fit the pipeline on participant-only speech. The splits are returned
# alongside the model; train_features / train_labels are reused by the K-fold cell.
clf_ran_par,test_features,test_labels,train_features,train_labels = randomforst_models(train_df.joined_all_par_speech, train_df.ad)

In [None]:
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# define Bootstrap Sampling method
def bootstrap_sampling(text: pd.Series, labels: list, shuffle=True, n_iterations=5):
    """Collect hold-out accuracies over repeated random train/test splits.

    Despite the name, no sampling with replacement takes place: every
    iteration draws a fresh 80/20 split with ``train_test_split``, tunes the
    TF-IDF + random-forest pipeline on the training part (10-fold grid
    search, 6 parallel jobs) and scores the best model on the held-out part.

    Splits and forests are seeded from the module-level ``random_state``
    (offset by the iteration number for the split), matching
    ``randomforst_models``, so the returned accuracies are reproducible.

    Args:
        text: One document (string) per sample.
        labels: Class label per sample, aligned with ``text``.
        shuffle: Forwarded to ``train_test_split``. With ``shuffle=False``
            every iteration uses the same split.
        n_iterations: Number of splits to evaluate.

    Returns:
        List with one accuracy value per iteration (empty when
        ``n_iterations`` is 0).
    """
    ## AD Classification Pred

    # Search space: 'vec__*' keys configure the TfidfVectorizer step,
    # 'clf__*' keys configure the RandomForestClassifier step.
    param_space = {
        'vec__max_features': [100, 500, 1000, 2000, 10000],
        'vec__stop_words': ['english', None],
        'vec__analyzer': ['word', 'char'],
        'vec__max_df': [0.5, 0.75, 1.0],
        'vec__sublinear_tf': [True, False],
        'clf__n_estimators': [10],
        'clf__max_depth': [5, 10, 15],
        'clf__min_samples_split': [2, 5],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__bootstrap': [True, False],
    }

    # Template pipeline; GridSearchCV clones it for every candidate.
    clf_pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

    stats = []
    for i in range(n_iterations):
        # A different but reproducible split per iteration.
        train_features, test_features, train_labels, test_labels = train_test_split(
            text, labels, test_size=0.2, shuffle=shuffle, random_state=random_state + i)

        search = GridSearchCV(clf_pipe, param_space, cv=10, n_jobs=6)
        search.fit(train_features, train_labels)
        print(search.best_params_)

        # best_estimator_ is already refit on the whole training part.
        predictions = search.best_estimator_.predict(test_features)

        # Accuracy on the held-out part of this split
        stats.append(accuracy_score(test_labels, predictions))
    return stats

# call Bootstrap Sampling to get the performance evaluation results
# One hold-out accuracy per iteration of bootstrap_sampling, then mean and spread.
bootstrap_stats = bootstrap_sampling(train_df.joined_all_par_speech, train_df.ad)
mean_accuracy = np.mean(bootstrap_stats)
std_accuracy = np.std(bootstrap_stats)

print("Bootstrap Sampling method:")
print(f"Average accuracy:{mean_accuracy:.3f}")
print(f"Standard deviation of accuracy:{std_accuracy:.3f}")

# Two-sided interval: alpha is the share cut from each tail.
confidence_interval = 0.95 
alpha = (1 - confidence_interval) / 2

# Sort the accuracies so the interval bounds can be read off by position.
sorted_bootstrap_stats = sorted(bootstrap_stats)

# Positions are floor(n * alpha) and floor(n * (1 - alpha)).
# With the default n_iterations=5 these are 0 and 4, i.e. the smallest and the
# largest observed accuracy. An empty bootstrap_stats raises IndexError below.
lower_percentile_idx = int(len(sorted_bootstrap_stats) * alpha)
upper_percentile_idx = int(len(sorted_bootstrap_stats) * (1 - alpha))

lower_bound = sorted_bootstrap_stats[lower_percentile_idx]
upper_bound = sorted_bootstrap_stats[upper_percentile_idx]

print(f"{confidence_interval * 100:.1f}% Confidence Interval:")
print(f"Lower Bound: {lower_bound:.3f}")
print(f"Upper Bound: {upper_bound:.3f}")

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# define K-Fold Cross Validation method
def kfold_cross_validation(clf, features, labels, n_splits=10):
    """Return the per-fold scores of ``clf`` under shuffled K-fold CV.

    The folds come from ``KFold`` with shuffling enabled and a fixed seed of
    42; scoring is whatever ``cross_val_score`` applies by default.
    """
    fold_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    return cross_val_score(clf, features, labels, cv=fold_splitter)

# call K-Fold Cross Validation method to get the performance evaluation results
# Score the tuned pipeline with shuffled 10-fold CV on the training split only.
# NOTE(review): clf_ran_par's hyper-parameters were chosen by grid search on these
# same train_features, so the folds are not unseen with respect to tuning.
kfold_scores = kfold_cross_validation(clf_ran_par, train_features, train_labels)
mean_accuracy_kfold = np.mean(kfold_scores)
std_accuracy_kfold = np.std(kfold_scores)
print("K-Fold Cross Validation method:")
print(f"Average accuracy:{mean_accuracy_kfold:.3f}")
print(f"Standard deviation of accuracy:{std_accuracy_kfold:.3f}")

In [None]:

# LOSOCV 
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score
import numpy as np

# define Leave One Subject Out Cross Validation method
def leave_one_subject_out_cross_validation(clf, features, labels, subject_ids):
    """Return one score per group, holding out each ``subject_ids`` value in turn.

    ``subject_ids`` is passed to ``cross_val_score`` as ``groups`` and the
    splitting is done by ``LeaveOneGroupOut``.
    """
    return cross_val_score(
        clf,
        features,
        labels,
        groups=subject_ids,
        cv=LeaveOneGroupOut(),
    )

# call Leave One Subject Out Cross Validation method to get the performance evaluation results
# Score the tuned pipeline holding out one `id` group at a time. This runs over the
# full train_df, not only the training split returned by randomforst_models.
# NOTE(review): extract_data derives `id` from the file name, so each group is
# likely a single transcript and each fold's accuracy would then be 0 or 1 — confirm.
loso_cv_scores = leave_one_subject_out_cross_validation(clf_ran_par, train_df.joined_all_par_speech, train_df.ad, train_df.id)
mean_accuracy_loso_cv = np.mean(loso_cv_scores)
std_accuracy_loso_cv = np.std(loso_cv_scores)
print("Leave One Subject Out Cross Validation method:")
print(f"Average accuracy:{mean_accuracy_loso_cv:.3f}")
print(f"Standard deviation of accuracy:{std_accuracy_loso_cv:.3f}")