In [20]:
# script created by LLM using symbolic composition rules to generate compatible data
import random
import csv

# List of keywords in the C language
keywords = ['auto', 'double', 'int', 'struct', 'break', 'else', 'long', 'switch', 'case', 'enum', 'register', 'typedef',
            'char', 'extern', 'return', 'union', 'const', 'float', 'short', 'unsigned', 'continue', 'for', 'signed',
            'void', 'default', 'goto', 'sizeof', 'volatile', 'do', 'if', 'static', 'while']

# List of data types for variables
data_types = ['char', 'int', 'float', 'double', 'void']

# List of labels for comments
comment_labels = ['Useful', 'Not Useful']

# Function to generate a random valid identifier
def generate_identifier():
    first_char = random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_')
    rest_chars = ''.join(random.choices('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_', k=random.randint(0, 10)))
    return first_char + rest_chars

# Function to generate a random valid line of code
def generate_line_of_code():
    keyword_or_data_type = random.choice(keywords + data_types)
    identifier = generate_identifier()
    value = random.choice(['', f' = {random.randint(0, 100)}'])
    return f'{keyword_or_data_type} {identifier}{value};'

# Function to generate a random comment
def generate_comment():
    level_of_detail = random.choice(['', ' // ' + ' '.join(generate_identifier() for _ in range(random.randint(1, 5)))])
    return random.choice(['', '/* ' + generate_identifier() + ' */']) + level_of_detail

# Function to generate a useful comment for a given line of code
def generate_useful_comment(line_of_code):
    purpose_keywords = ['Declaration', 'Initialization', 'Calculation', 'Function', 'Definition', 'Usage', 'Explanation']
    variable_keywords = ['Variable', 'Value', 'Data', 'Result', 'Parameter']

    purpose = random.choice(purpose_keywords)
    variable = random.choice(variable_keywords)

    return f'// {purpose} of {variable} in the line of code:\n// {line_of_code}'

# Function to generate a random label for a comment
def generate_comment_label():
    return random.choice(comment_labels)

# Generate 5000 lines of code, comments, and labels
data = []
for _ in range(5000):
    line_of_code = generate_line_of_code()
    comment = generate_comment()
    label = generate_comment_label()

    # Ensure the comment is useful if labeled as Useful
    if label == 'Useful':
        comment = generate_useful_comment(line_of_code)

    data.append((line_of_code, comment, label))

# Function to write data to a CSV file
def write_to_csv(file_path, data):
    with open(file_path, mode='w', newline='') as csv_file:
        fieldnames = ['Line of Code', 'Comment', 'Class']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow({'Line of Code': row[0], 'Comment': row[1], 'Class': row[2]})

# Specify the file path
csv_file_path = 'test.csv'

# Write data to the CSV file
write_to_csv(csv_file_path, data)

print(f'Data has been generated and saved to {csv_file_path}')


Data has been generated and saved to test.csv


In [21]:
import pandas as pd

data = pd.read_csv('FIRE2023_IRSE_training_Code_Comment_Seed_Data.csv')
data.head(5)

Unnamed: 0,Comments,Surrounding Code Context,Class
0,/*test 529*/,-10. int res = 0;\n-9. CURL *curl = NULL;\...,Not Useful
1,/*test 525*/,"-2. fprintf(stderr, ""Usage: lib529 [url] [...",Not Useful
2,/*done*/,"-10. multi_add_handle(m, curl);\n-9. for(;...",Not Useful
3,/*test 529*/,-10. int res = 0;\n-9. CURL *curl = NULL;\...,Not Useful
4,/*test 525*/,"-2. fprintf(stderr, ""Usage: lib529 [url] [...",Not Useful


In [22]:
# experiment 2: Try self-training with ChatGPT generated data
chatgpt_data = pd.read_csv('chatgpt_generated_data.csv')
chatgpt_data.head(5)

Unnamed: 0,Line of Code,Comment,Class
0,int randomNumber = rand() % 100 + 1;,"""Generate a random number between 1 and 100.""",Useful
1,int length = strlen(inputString);,"""Get the length of the input string.""",Useful
2,for (int i = 0; i < 10; i++) {,"""Loop from 0 to 9.""",Useful
3,"printf(%d "", i);""","""Print the current value of i.""",Useful
4,"char name[20] = ""John Smith"";","""Declare and initialize a character array to ...",Useful


In [23]:
# experiment 3: Try self-training with ChatGPT symbolically generated data
symbolic_data = pd.read_csv('llm_symbolic_generated_data.csv')
symbolic_data.head(5)

Unnamed: 0,Line of Code,Comment,Class
0,do rf;,,Not Useful
1,double eVqWsc4JTj;,/* jPMSXf9g7zS */ // A VQL2 YTCzrq,Not Useful
2,break jeckm = 82;,// gIyPYWK0G6 sH7k8dX8n nb8F,Not Useful
3,static q3GV0wdM = 73;,,Not Useful
4,float hmGmMaom5j;,// Function of Parameter in the line of code:\...,Useful


In [24]:
# clean dataset by removing extra quotation marks that ChatGPT adds when it generates content
chatgpt_data["Line of Code"] = chatgpt_data["Line of Code"].str.lstrip('"')
chatgpt_data["Line of Code"] = chatgpt_data["Line of Code"].str.rstrip('"')
# rename columns to match original data columns
chatgpt_data = chatgpt_data.rename(columns={'Comment': 'Comments', 'Line of Code': 'Surrounding Code Context', 'Class': 'Class'})
# change column order to match original dataset
chatgpt_data = chatgpt_data[['Comments', 'Surrounding Code Context', 'Class']]
chatgpt_data['Comments'] = chatgpt_data['Comments'].map(lambda x: x.strip().lstrip('"').rstrip('"'))
chatgpt_data['Surrounding Code Context'] = chatgpt_data['Surrounding Code Context'].map(lambda x: x.strip())
chatgpt_data['Class'] = chatgpt_data['Class'].map(lambda x: x.strip())
chatgpt_data.head(5)

# do the same for the symbolic generated data
symbolic_data["Line of Code"] = symbolic_data["Line of Code"].str.lstrip('"')
symbolic_data["Line of Code"] = symbolic_data["Line of Code"].str.rstrip('"')
# rename columns to match original data columns
symbolic_data = symbolic_data.rename(columns={'Comment': 'Comments', 'Line of Code': 'Surrounding Code Context', 'Class': 'Class'})
# change column order to match original dataset
symbolic_data = symbolic_data[['Comments', 'Surrounding Code Context', 'Class']]
symbolic_data['Comments'] = symbolic_data['Comments'].map(lambda x: str(x).strip().lstrip('"').rstrip('"'))
symbolic_data['Surrounding Code Context'] = symbolic_data['Surrounding Code Context'].map(lambda x: x.strip())
symbolic_data['Class'] = symbolic_data['Class'].map(lambda x: x.strip())
symbolic_data.head(5)

Unnamed: 0,Comments,Surrounding Code Context,Class
0,,do rf;,Not Useful
1,/* jPMSXf9g7zS */ // A VQL2 YTCzrq,double eVqWsc4JTj;,Not Useful
2,// gIyPYWK0G6 sH7k8dX8n nb8F,break jeckm = 82;,Not Useful
3,,static q3GV0wdM = 73;,Not Useful
4,// Function of Parameter in the line of code:\...,float hmGmMaom5j;,Useful


In [25]:
# now merge new dataset with the old
full_data = data.append(chatgpt_data)
full_data = full_data.append(symbolic_data)
print(full_data.shape)
full_data.head(5)

(16872, 3)


  full_data = data.append(chatgpt_data)
  full_data = full_data.append(symbolic_data)


Unnamed: 0,Comments,Surrounding Code Context,Class
0,/*test 529*/,-10. int res = 0;\n-9. CURL *curl = NULL;\...,Not Useful
1,/*test 525*/,"-2. fprintf(stderr, ""Usage: lib529 [url] [...",Not Useful
2,/*done*/,"-10. multi_add_handle(m, curl);\n-9. for(;...",Not Useful
3,/*test 529*/,-10. int res = 0;\n-9. CURL *curl = NULL;\...,Not Useful
4,/*test 525*/,"-2. fprintf(stderr, ""Usage: lib529 [url] [...",Not Useful


In [26]:
# now re-apply data transformation on entire dataset and prepare for training
full_data.loc[:,["Class"]] = pd.get_dummies(full_data.Class,drop_first=True).values

full_data["Input"] = full_data["Comments"] + " " + full_data["Surrounding Code Context"]

  full_data.loc[:,["Class"]] = pd.get_dummies(full_data.Class,drop_first=True).values


In [7]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [27]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")

# Encode our code into the vector space
code_emb = model.encode(full_data["Input"].astype(str).values.tolist(), convert_to_tensor=True)

In [9]:
# use SMOTE to balance dataset
# and generate enough "Not Useful" comment data
!pip install imbalanced-learn



In [28]:
from collections import Counter
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
Y = full_data['Class']
Y=Y.astype('int')
X = code_emb.tolist()
counter = Counter(Y)
print(counter)
X, y = oversample.fit_resample(X, Y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

Counter({1: 10047, 0: 6825})
Counter({0: 10047, 1: 10047})


In [33]:
# classification report on CV with VotingClassifier
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, make_scorer

# prepare candidate classifiers
rf_model = RandomForestClassifier(random_state = 42)
nn_model = MLPClassifier(hidden_layer_sizes=(20,10),random_state=42)
svc_model = LinearSVC(random_state = 42)

# define model
model =  VotingClassifier(estimators=[('rf', rf_model), ('nn', nn_model), ('svc', svc_model)], voting='hard')

classification_report_list = []
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
for train_index, test_index in cv.split(X, y):
    x_train, x_test = np.array(X)[train_index], np.array(X)[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print("*** K-Fold Split Results ***")
    classification_report_results = classification_report(y_test, y_pred, output_dict=True)
    print(classification_report_results)
    classification_report_list.append(classification_report_results)



*** K-Fold Split Results ***
{'0': {'precision': 0.9331941544885177, 'recall': 0.8895522388059701, 'f1-score': 0.9108507386653082, 'support': 1005}, '1': {'precision': 0.8944866920152091, 'recall': 0.936318407960199, 'f1-score': 0.9149246475449684, 'support': 1005}, 'accuracy': 0.9129353233830846, 'macro avg': {'precision': 0.9138404232518634, 'recall': 0.9129353233830846, 'f1-score': 0.9128876931051383, 'support': 2010}, 'weighted avg': {'precision': 0.9138404232518634, 'recall': 0.9129353233830846, 'f1-score': 0.9128876931051384, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.911, 'recall': 0.9064676616915422, 'f1-score': 0.9087281795511223, 'support': 1005}, '1': {'precision': 0.906930693069307, 'recall': 0.9114427860696518, 'f1-score': 0.909181141439206, 'support': 1005}, 'accuracy': 0.908955223880597, 'macro avg': {'precision': 0.9089653465346534, 'recall': 0.908955223880597, 'f1-score': 0.9089546604951642, 'support': 2010}, 'weighted avg': {'precision': 0.9089653465346536, 'recall': 0.908955223880597, 'f1-score': 0.9089546604951642, 'support': 2010}}
*** K-Fold Split Results ***
{'0': {'precision': 0.9107142857142857, 'recall': 0.9134328358208955, 'f1-score': 0.9120715350223547, 'support': 1005}, '1': {'precision': 0.9131736526946108, 'recall': 0.9104477611940298, 'f1-score': 0.9118086696562033, 'support': 1005}, 'accuracy': 0.9119402985074627, 'macro avg': {'precision': 0.9119439692044482, 'recall': 0.9119402985074627, 'f1-score': 0.911940102339279, 'support': 2010}, 'weighted avg': {'precision': 0.911



*** K-Fold Split Results ***
{'0': {'precision': 0.9060077519379846, 'recall': 0.9303482587064676, 'f1-score': 0.9180166912125675, 'support': 1005}, '1': {'precision': 0.9284253578732107, 'recall': 0.9034825870646767, 'f1-score': 0.9157841654059506, 'support': 1005}, 'accuracy': 0.9169154228855722, 'macro avg': {'precision': 0.9172165549055976, 'recall': 0.9169154228855722, 'f1-score': 0.916900428309259, 'support': 2010}, 'weighted avg': {'precision': 0.9172165549055976, 'recall': 0.9169154228855722, 'f1-score': 0.9169004283092591, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9241025641025641, 'recall': 0.8965174129353234, 'f1-score': 0.9101010101010101, 'support': 1005}, '1': {'precision': 0.8994197292069632, 'recall': 0.9262948207171314, 'f1-score': 0.9126594700686946, 'support': 1004}, 'accuracy': 0.9113987058237929, 'macro avg': {'precision': 0.9117611466547637, 'recall': 0.9114061168262274, 'f1-score': 0.9113802400848523, 'support': 2009}, 'weighted avg': {'precision': 0.9117672897196953, 'recall': 0.9113987058237929, 'f1-score': 0.9113796033352337, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9192734611503531, 'recall': 0.9064676616915422, 'f1-score': 0.9128256513026051, 'support': 1005}, '1': {'precision': 0.9076620825147348, 'recall': 0.9203187250996016, 'f1-score': 0.9139465875370919, 'support': 1004}, 'accuracy': 0.9133897461423593, 'macro avg': {'precision': 0.9134677718325439, 'recall': 0.9133931933955719, 'f1-score': 0.9133861194198485, 'support': 2009}, 'weighted avg': {'precision': 0.9134706616729211, 'recall': 0.9133897461423593, 'f1-score': 0.9133858404411938, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9203539823008849, 'recall': 0.9313432835820895, 'f1-score': 0.9258160237388724, 'support': 1005}, '1': {'precision': 0.9304435483870968, 'recall': 0.9193227091633466, 'f1-score': 0.9248496993987975, 'support': 1004}, 'accuracy': 0.925335988053758, 'macro avg': {'precision': 0.9253987653439908, 'recall': 0.925332996372718, 'f1-score': 0.925332861568835, 'support': 2009}, 'weighted avg': {'precision': 0.9253962542523815, 'recall': 0.925335988053758, 'f1-score': 0.925333102067675, 'support': 2009}}
*** K-Fold Split Results ***
{'0': {'precision': 0.9132602193419741, 'recall': 0.9123505976095617, 'f1-score': 0.9128051818634778, 'support': 1004}, '1': {'precision': 0.9125248508946322, 'recall': 0.9134328358208955, 'f1-score': 0.9129786176031824, 'support': 1005}, 'accuracy': 0.9128919860627178, 'macro avg': {'precision': 0.9128925351183032, 'recall': 0.9128917167152286, 'f1-score': 0.9128918997333302, 'support': 2009}, 'weighted avg': {'pre



*** K-Fold Split Results ***
{'0': {'precision': 0.9156626506024096, 'recall': 0.9083665338645418, 'f1-score': 0.912, 'support': 1004}, '1': {'precision': 0.9091806515301086, 'recall': 0.9164179104477612, 'f1-score': 0.912784935579782, 'support': 1005}, 'accuracy': 0.9123942259830762, 'macro avg': {'precision': 0.912421651066259, 'recall': 0.9123922221561516, 'f1-score': 0.912392467789891, 'support': 2009}, 'weighted avg': {'precision': 0.9124200378260718, 'recall': 0.9123942259830762, 'f1-score': 0.9123926631446894, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9219712525667351, 'recall': 0.8944223107569721, 'f1-score': 0.9079878665318504, 'support': 1004}, '1': {'precision': 0.8975845410628019, 'recall': 0.9243781094527364, 'f1-score': 0.9107843137254902, 'support': 1005}, 'accuracy': 0.9094076655052264, 'macro avg': {'precision': 0.9097778968147685, 'recall': 0.9094002101048542, 'f1-score': 0.9093860901286703, 'support': 2009}, 'weighted avg': {'precision': 0.9097718274490383, 'recall': 0.9094076655052264, 'f1-score': 0.9093867861085592, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9221435793731041, 'recall': 0.9074626865671642, 'f1-score': 0.9147442326980944, 'support': 1005}, '1': {'precision': 0.9089128305582762, 'recall': 0.9233830845771144, 'f1-score': 0.9160908193484699, 'support': 1005}, 'accuracy': 0.9154228855721394, 'macro avg': {'precision': 0.9155282049656901, 'recall': 0.9154228855721394, 'f1-score': 0.9154175260232822, 'support': 2010}, 'weighted avg': {'precision': 0.9155282049656901, 'recall': 0.9154228855721394, 'f1-score': 0.9154175260232822, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9285714285714286, 'recall': 0.8925373134328358, 'f1-score': 0.9101978691019788, 'support': 1005}, '1': {'precision': 0.896551724137931, 'recall': 0.9313432835820895, 'f1-score': 0.9136163982430454, 'support': 1005}, 'accuracy': 0.9119402985074627, 'macro avg': {'precision': 0.9125615763546798, 'recall': 0.9119402985074627, 'f1-score': 0.9119071336725121, 'support': 2010}, 'weighted avg': {'precision': 0.9125615763546798, 'recall': 0.9119402985074627, 'f1-score': 0.9119071336725121, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9189463019250254, 'recall': 0.9024875621890547, 'f1-score': 0.9106425702811245, 'support': 1005}, '1': {'precision': 0.9042033235581622, 'recall': 0.9203980099502488, 'f1-score': 0.9122287968441815, 'support': 1005}, 'accuracy': 0.9114427860696518, 'macro avg': {'precision': 0.9115748127415938, 'recall': 0.9114427860696517, 'f1-score': 0.9114356835626529, 'support': 2010}, 'weighted avg': {'precision': 0.9115748127415938, 'recall': 0.9114427860696518, 'f1-score': 0.9114356835626529, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9067713444553483, 'recall': 0.9194029850746268, 'f1-score': 0.9130434782608694, 'support': 1005}, '1': {'precision': 0.9182643794147326, 'recall': 0.9054726368159204, 'f1-score': 0.9118236472945891, 'support': 1005}, 'accuracy': 0.9124378109452737, 'macro avg': {'precision': 0.9125178619350405, 'recall': 0.9124378109452735, 'f1-score': 0.9124335627777292, 'support': 2010}, 'weighted avg': {'precision': 0.9125178619350405, 'recall': 0.9124378109452737, 'f1-score': 0.9124335627777294, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9226856561546287, 'recall': 0.9024875621890547, 'f1-score': 0.9124748490945674, 'support': 1005}, '1': {'precision': 0.9044834307992202, 'recall': 0.9243027888446215, 'f1-score': 0.9142857142857141, 'support': 1004}, 'accuracy': 0.9133897461423593, 'macro avg': {'precision': 0.9135845434769245, 'recall': 0.9133951755168381, 'f1-score': 0.9133802816901408, 'support': 2009}, 'weighted avg': {'precision': 0.9135890736474958, 'recall': 0.9133897461423593, 'f1-score': 0.91337983100194, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.917753259779338, 'recall': 0.9104477611940298, 'f1-score': 0.914085914085914, 'support': 1005}, '1': {'precision': 0.9110671936758893, 'recall': 0.9183266932270916, 'f1-score': 0.9146825396825397, 'support': 1004}, 'accuracy': 0.9143852663016426, 'macro avg': {'precision': 0.9144102267276136, 'recall': 0.9143872272105606, 'f1-score': 0.9143842268842268, 'support': 2009}, 'weighted avg': {'precision': 0.9144118907560117, 'recall': 0.9143852663016426, 'f1-score': 0.9143840783960245, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9318413021363174, 'recall': 0.9114427860696518, 'f1-score': 0.9215291750503017, 'support': 1005}, '1': {'precision': 0.9132553606237817, 'recall': 0.9332669322709163, 'f1-score': 0.9231527093596058, 'support': 1004}, 'accuracy': 0.9223494275759084, 'macro avg': {'precision': 0.9225483313800495, 'recall': 0.922354859170284, 'f1-score': 0.9223409422049538, 'support': 2009}, 'weighted avg': {'precision': 0.9225529570499134, 'recall': 0.9223494275759084, 'f1-score': 0.9223405381396703, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9203980099502488, 'recall': 0.9213147410358565, 'f1-score': 0.9208561473369836, 'support': 1004}, '1': {'precision': 0.9213147410358565, 'recall': 0.9203980099502488, 'f1-score': 0.9208561473369836, 'support': 1005}, 'accuracy': 0.9208561473369836, 'macro avg': {'precision': 0.9208563754930527, 'recall': 0.9208563754930527, 'f1-score': 0.9208561473369836, 'support': 2009}, 'weighted avg': {'precision': 0.9208566036491218, 'recall': 0.9208561473369836, 'f1-score': 0.9208561473369836, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9150197628458498, 'recall': 0.9223107569721115, 'f1-score': 0.9186507936507937, 'support': 1004}, '1': {'precision': 0.921765295887663, 'recall': 0.9144278606965174, 'f1-score': 0.9180819180819181, 'support': 1005}, 'accuracy': 0.9183673469387755, 'macro avg': {'precision': 0.9183925293667564, 'recall': 0.9183693088343144, 'f1-score': 0.918366355866356, 'support': 2009}, 'weighted avg': {'precision': 0.9183942081952885, 'recall': 0.9183673469387755, 'f1-score': 0.9183662142845817, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.916241062308478, 'recall': 0.8934262948207171, 'f1-score': 0.9046898638426626, 'support': 1004}, '1': {'precision': 0.8961165048543689, 'recall': 0.918407960199005, 'f1-score': 0.9071253071253071, 'support': 1005}, 'accuracy': 0.9059233449477352, 'macro avg': {'precision': 0.9061787835814235, 'recall': 0.905917127509861, 'f1-score': 0.9059075854839849, 'support': 2009}, 'weighted avg': {'precision': 0.9061737749807629, 'recall': 0.9059233449477352, 'f1-score': 0.905908191617206, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9167502507522568, 'recall': 0.909452736318408, 'f1-score': 0.9130869130869131, 'support': 1005}, '1': {'precision': 0.910167818361303, 'recall': 0.9174129353233831, 'f1-score': 0.9137760158572844, 'support': 1005}, 'accuracy': 0.9134328358208955, 'macro avg': {'precision': 0.9134590345567799, 'recall': 0.9134328358208955, 'f1-score': 0.9134314644720988, 'support': 2010}, 'weighted avg': {'precision': 0.9134590345567798, 'recall': 0.9134328358208955, 'f1-score': 0.9134314644720988, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9122983870967742, 'recall': 0.900497512437811, 'f1-score': 0.9063595393089634, 'support': 1005}, '1': {'precision': 0.9017681728880157, 'recall': 0.9134328358208955, 'f1-score': 0.907563025210084, 'support': 1005}, 'accuracy': 0.9069651741293532, 'macro avg': {'precision': 0.907033279992395, 'recall': 0.9069651741293532, 'f1-score': 0.9069612822595237, 'support': 2010}, 'weighted avg': {'precision': 0.9070332799923949, 'recall': 0.9069651741293532, 'f1-score': 0.9069612822595238, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9241164241164241, 'recall': 0.8845771144278607, 'f1-score': 0.9039145907473309, 'support': 1005}, '1': {'precision': 0.8893129770992366, 'recall': 0.9273631840796019, 'f1-score': 0.9079396005845104, 'support': 1005}, 'accuracy': 0.9059701492537313, 'macro avg': {'precision': 0.9067147006078304, 'recall': 0.9059701492537313, 'f1-score': 0.9059270956659207, 'support': 2010}, 'weighted avg': {'precision': 0.9067147006078303, 'recall': 0.9059701492537313, 'f1-score': 0.9059270956659207, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9193548387096774, 'recall': 0.9074626865671642, 'f1-score': 0.9133700550826239, 'support': 1005}, '1': {'precision': 0.9086444007858546, 'recall': 0.9203980099502488, 'f1-score': 0.9144834404349975, 'support': 1005}, 'accuracy': 0.9139303482587064, 'macro avg': {'precision': 0.913999619747766, 'recall': 0.9139303482587064, 'f1-score': 0.9139267477588107, 'support': 2010}, 'weighted avg': {'precision': 0.913999619747766, 'recall': 0.9139303482587064, 'f1-score': 0.9139267477588107, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9182839632277835, 'recall': 0.8945273631840795, 'f1-score': 0.90625, 'support': 1005}, '1': {'precision': 0.8970873786407767, 'recall': 0.9203187250996016, 'f1-score': 0.9085545722713865, 'support': 1004}, 'accuracy': 0.90741662518666, 'macro avg': {'precision': 0.9076856709342801, 'recall': 0.9074230441418405, 'f1-score': 0.9074022861356932, 'support': 2009}, 'weighted avg': {'precision': 0.9076909463410963, 'recall': 0.90741662518666, 'f1-score': 0.9074017125736547, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9195630585898709, 'recall': 0.9213930348258706, 'f1-score': 0.9204771371769384, 'support': 1005}, '1': {'precision': 0.9211576846307385, 'recall': 0.9193227091633466, 'f1-score': 0.9202392821535393, 'support': 1004}, 'accuracy': 0.920358387257342, 'macro avg': {'precision': 0.9203603716103047, 'recall': 0.9203578719946086, 'f1-score': 0.9203582096652388, 'support': 2009}, 'weighted avg': {'precision': 0.9203599747397122, 'recall': 0.920358387257342, 'f1-score': 0.9203582688626065, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9223107569721115, 'recall': 0.9213930348258706, 'f1-score': 0.9218516674962668, 'support': 1005}, '1': {'precision': 0.9213930348258706, 'recall': 0.9223107569721115, 'f1-score': 0.9218516674962668, 'support': 1004}, 'accuracy': 0.9218516674962668, 'macro avg': {'precision': 0.9218518958989911, 'recall': 0.9218518958989911, 'f1-score': 0.9218516674962668, 'support': 2009}, 'weighted avg': {'precision': 0.9218521243017154, 'recall': 0.9218516674962668, 'f1-score': 0.9218516674962668, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9201596806387226, 'recall': 0.9183266932270916, 'f1-score': 0.9192422731804586, 'support': 1004}, '1': {'precision': 0.9185700099304865, 'recall': 0.9203980099502488, 'f1-score': 0.9194831013916502, 'support': 1005}, 'accuracy': 0.9193628670980587, 'macro avg': {'precision': 0.9193648452846046, 'recall': 0.9193623515886702, 'f1-score': 0.9193626872860544, 'support': 2009}, 'weighted avg': {'precision': 0.9193644496472954, 'recall': 0.9193628670980587, 'f1-score': 0.9193627472233892, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9152542372881356, 'recall': 0.9143426294820717, 'f1-score': 0.914798206278027, 'support': 1004}, '1': {'precision': 0.9145129224652088, 'recall': 0.9154228855721394, 'f1-score': 0.9149676777722527, 'support': 1005}, 'accuracy': 0.9148830263812843, 'macro avg': {'precision': 0.9148835798766721, 'recall': 0.9148827575271055, 'f1-score': 0.9148829420251399, 'support': 2009}, 'weighted avg': {'precision': 0.9148833953782096, 'recall': 0.9148830263812843, 'f1-score': 0.9148829842032121, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9261133603238867, 'recall': 0.9113545816733067, 'f1-score': 0.9186746987951808, 'support': 1004}, '1': {'precision': 0.9128305582761999, 'recall': 0.9273631840796019, 'f1-score': 0.9200394866732478, 'support': 1005}, 'accuracy': 0.9193628670980587, 'macro avg': {'precision': 0.9194719593000433, 'recall': 0.9193588828764543, 'f1-score': 0.9193570927342143, 'support': 2009}, 'weighted avg': {'precision': 0.9194686534757406, 'recall': 0.9193628670980587, 'f1-score': 0.9193574324026758, 'support': 2009}}


In [34]:
# calculate average metrics for submission
import numpy
final_dict = {}
final_dict['0-precision'] = []
final_dict['0-recall'] = []
final_dict['0-f1'] = []
final_dict['1-precision'] = []
final_dict['1-recall'] = []
final_dict['1-f1'] = []
final_dict['accuracy'] = []

for dictionary in classification_report_list:
    for key, inner_dict in dictionary.items():
        if key == '0':
            for inner_key in inner_dict.keys():
                if inner_key == "precision":
                    final_dict['0-precision'].append(inner_dict['precision'])
                if inner_key == "recall":
                    final_dict['0-recall'].append(inner_dict['recall'])
                if inner_key == "f1-score":
                    final_dict['0-f1'].append(inner_dict['f1-score'])
        if key == '1':
            for inner_key in inner_dict.keys():
                if inner_key == "precision":
                    final_dict['1-precision'].append(inner_dict['precision'])
                if inner_key == "recall":
                    final_dict['1-recall'].append(inner_dict['recall'])
                if inner_key == "f1-score":
                    final_dict['1-f1'].append(inner_dict['f1-score'])
        if key == "accuracy":
            final_dict['accuracy'].append(dictionary['accuracy'])


for key in final_dict.keys():
    final_dict[key] = numpy.average(final_dict[key])

print("******** VC FINAL DICT ********")
print(final_dict)

******** VC FINAL DICT ********
{'0-precision': 0.9190038995807038, '0-recall': 0.9085305543993182, '0-f1': 0.9136714284181721, '1-precision': 0.9097070513899418, '1-recall': 0.9198442383038328, '1-f1': 0.9146848038468979, 'accuracy': 0.9141871198181978}


In [29]:
# classification report on CV with VotingClassifier
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, make_scorer

# define model
model = MLPClassifier(hidden_layer_sizes=(20,10),random_state=42)

classification_report_list = []
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
for train_index, test_index in cv.split(X, y):
    x_train, x_test = np.array(X)[train_index], np.array(X)[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print("*** K-Fold Split Results ***")
    classification_report_results = classification_report(y_test, y_pred, output_dict=True)
    print(classification_report_results)
    classification_report_list.append(classification_report_results)



*** K-Fold Split Results ***
{'0': {'precision': 0.9164149043303121, 'recall': 0.9054726368159204, 'f1-score': 0.9109109109109109, 'support': 1005}, '1': {'precision': 0.9065880039331367, 'recall': 0.9174129353233831, 'f1-score': 0.9119683481701286, 'support': 1005}, 'accuracy': 0.9114427860696518, 'macro avg': {'precision': 0.9115014541317243, 'recall': 0.9114427860696517, 'f1-score': 0.9114396295405198, 'support': 2010}, 'weighted avg': {'precision': 0.9115014541317245, 'recall': 0.9114427860696518, 'f1-score': 0.9114396295405197, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.8903846153846153, 'recall': 0.9213930348258706, 'f1-score': 0.9056234718826406, 'support': 1005}, '1': {'precision': 0.9185567010309278, 'recall': 0.8865671641791045, 'f1-score': 0.9022784810126582, 'support': 1005}, 'accuracy': 0.9039800995024876, 'macro avg': {'precision': 0.9044706582077715, 'recall': 0.9039800995024876, 'f1-score': 0.9039509764476494, 'support': 2010}, 'weighted avg': {'precision': 0.9044706582077716, 'recall': 0.9039800995024876, 'f1-score': 0.9039509764476493, 'support': 2010}}
*** K-Fold Split Results ***
{'0': {'precision': 0.9132149901380671, 'recall': 0.9213930348258706, 'f1-score': 0.9172857850420999, 'support': 1005}, '1': {'precision': 0.9206827309236948, 'recall': 0.9124378109452737, 'f1-score': 0.9165417291354323, 'support': 1005}, 'accuracy': 0.9169154228855722, 'macro avg': {'precision': 0.9169488605308809, 'recall': 0.9169154228855722, 'f1-score': 0.9169137570887661, 'support': 2010}, 'weighted avg': 



*** K-Fold Split Results ***
{'0': {'precision': 0.9024856596558317, 'recall': 0.9393034825870646, 'f1-score': 0.9205265724037054, 'support': 1005}, '1': {'precision': 0.9367219917012448, 'recall': 0.8985074626865671, 'f1-score': 0.9172168613509396, 'support': 1005}, 'accuracy': 0.9189054726368159, 'macro avg': {'precision': 0.9196038256785383, 'recall': 0.9189054726368159, 'f1-score': 0.9188717168773225, 'support': 2010}, 'weighted avg': {'precision': 0.9196038256785383, 'recall': 0.9189054726368159, 'f1-score': 0.9188717168773225, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9150849150849151, 'recall': 0.9114427860696518, 'f1-score': 0.9132602193419741, 'support': 1005}, '1': {'precision': 0.9117063492063492, 'recall': 0.9153386454183267, 'f1-score': 0.9135188866799204, 'support': 1004}, 'accuracy': 0.9133897461423593, 'macro avg': {'precision': 0.9133956321456321, 'recall': 0.9133907157439892, 'f1-score': 0.9133895530109473, 'support': 2009}, 'weighted avg': {'precision': 0.9133964730032426, 'recall': 0.9133897461423593, 'f1-score': 0.91338948863381, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9158415841584159, 'recall': 0.9203980099502488, 'f1-score': 0.9181141439205955, 'support': 1005}, '1': {'precision': 0.91991991991992, 'recall': 0.9153386454183267, 'f1-score': 0.9176235646530205, 'support': 1004}, 'accuracy': 0.9178695868591339, 'macro avg': {'precision': 0.9178807520391679, 'recall': 0.9178683276842877, 'f1-score': 0.9178688542868081, 'support': 2009}, 'weighted avg': {'precision': 0.9178797370228012, 'recall': 0.9178695868591339, 'f1-score': 0.9178689763821957, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9110251450676983, 'recall': 0.9373134328358209, 'f1-score': 0.9239823442864149, 'support': 1005}, '1': {'precision': 0.9353846153846154, 'recall': 0.9083665338645418, 'f1-score': 0.921677614957049, 'support': 1004}, 'accuracy': 0.9228471876555501, 'macro avg': {'precision': 0.9232048802261568, 'recall': 0.9228399833501814, 'f1-score': 0.922829979621732, 'support': 2009}, 'weighted avg': {'precision': 0.9231988176402143, 'recall': 0.9228471876555501, 'f1-score': 0.9228305532228592, 'support': 2009}}
*** K-Fold Split Results ***
{'0': {'precision': 0.9007707129094412, 'recall': 0.9312749003984063, 'f1-score': 0.9157688540646425, 'support': 1004}, '1': {'precision': 0.9289392378990731, 'recall': 0.8975124378109453, 'f1-score': 0.9129554655870445, 'support': 1005}, 'accuracy': 0.9143852663016426, 'macro avg': {'precision': 0.9148549754042572, 'recall': 0.9143936691046758, 'f1-score': 0.9143621598258436, 'support': 2009}, 'weighted avg': {'



*** K-Fold Split Results ***
{'0': {'precision': 0.8990291262135922, 'recall': 0.9223107569721115, 'f1-score': 0.9105211406096362, 'support': 1004}, '1': {'precision': 0.9203268641470889, 'recall': 0.8965174129353234, 'f1-score': 0.9082661290322581, 'support': 1005}, 'accuracy': 0.9094076655052264, 'macro avg': {'precision': 0.9096779951803405, 'recall': 0.9094140849537174, 'f1-score': 0.9093936348209471, 'support': 2009}, 'weighted avg': {'precision': 0.9096832957622055, 'recall': 0.9094076655052264, 'f1-score': 0.9093930735935759, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9115577889447236, 'recall': 0.9033864541832669, 'f1-score': 0.9074537268634316, 'support': 1004}, '1': {'precision': 0.9043392504930966, 'recall': 0.9124378109452737, 'f1-score': 0.9083704804358593, 'support': 1005}, 'accuracy': 0.9079143852663016, 'macro avg': {'precision': 0.9079485197189101, 'recall': 0.9079121325642703, 'f1-score': 0.9079121036496455, 'support': 2009}, 'weighted avg': {'precision': 0.9079467231687728, 'recall': 0.9079143852663016, 'f1-score': 0.907912331811311, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9083743842364532, 'recall': 0.9174129353233831, 'f1-score': 0.9128712871287128, 'support': 1005}, '1': {'precision': 0.9165829145728643, 'recall': 0.9074626865671642, 'f1-score': 0.912, 'support': 1005}, 'accuracy': 0.9124378109452737, 'macro avg': {'precision': 0.9124786494046588, 'recall': 0.9124378109452737, 'f1-score': 0.9124356435643564, 'support': 2010}, 'weighted avg': {'precision': 0.9124786494046588, 'recall': 0.9124378109452737, 'f1-score': 0.9124356435643565, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9119119119119119, 'recall': 0.9064676616915422, 'f1-score': 0.9091816367265468, 'support': 1005}, '1': {'precision': 0.9070227497527201, 'recall': 0.9124378109452737, 'f1-score': 0.9097222222222223, 'support': 1005}, 'accuracy': 0.909452736318408, 'macro avg': {'precision': 0.909467330832316, 'recall': 0.909452736318408, 'f1-score': 0.9094519294743846, 'support': 2010}, 'weighted avg': {'precision': 0.909467330832316, 'recall': 0.909452736318408, 'f1-score': 0.9094519294743845, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9109792284866469, 'recall': 0.9164179104477612, 'f1-score': 0.9136904761904763, 'support': 1005}, '1': {'precision': 0.9159159159159159, 'recall': 0.9104477611940298, 'f1-score': 0.9131736526946108, 'support': 1005}, 'accuracy': 0.9134328358208955, 'macro avg': {'precision': 0.9134475722012814, 'recall': 0.9134328358208955, 'f1-score': 0.9134320644425435, 'support': 2010}, 'weighted avg': {'precision': 0.9134475722012814, 'recall': 0.9134328358208955, 'f1-score': 0.9134320644425435, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.8980769230769231, 'recall': 0.9293532338308458, 'f1-score': 0.9134474327628362, 'support': 1005}, '1': {'precision': 0.9268041237113402, 'recall': 0.8945273631840795, 'f1-score': 0.910379746835443, 'support': 1005}, 'accuracy': 0.9119402985074627, 'macro avg': {'precision': 0.9124405233941317, 'recall': 0.9119402985074627, 'f1-score': 0.9119135897991396, 'support': 2010}, 'weighted avg': {'precision': 0.9124405233941317, 'recall': 0.9119402985074627, 'f1-score': 0.9119135897991395, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9104330708661418, 'recall': 0.9203980099502488, 'f1-score': 0.9153884215734786, 'support': 1005}, '1': {'precision': 0.919436052366566, 'recall': 0.9093625498007968, 'f1-score': 0.914371557336004, 'support': 1004}, 'accuracy': 0.9148830263812843, 'macro avg': {'precision': 0.9149345616163539, 'recall': 0.9148802798755228, 'f1-score': 0.9148799894547412, 'support': 2009}, 'weighted avg': {'precision': 0.9149323209539596, 'recall': 0.9148830263812843, 'f1-score': 0.9148802425319531, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9067961165048544, 'recall': 0.9293532338308458, 'f1-score': 0.9179361179361178, 'support': 1005}, '1': {'precision': 0.9274770173646578, 'recall': 0.9043824701195219, 'f1-score': 0.9157841654059505, 'support': 1004}, 'accuracy': 0.9168740666998507, 'macro avg': {'precision': 0.9171365669347561, 'recall': 0.9168678519751838, 'f1-score': 0.9168601416710342, 'support': 2009}, 'weighted avg': {'precision': 0.9171314198713266, 'recall': 0.9168740666998507, 'f1-score': 0.9168606772490655, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9185700099304865, 'recall': 0.9203980099502488, 'f1-score': 0.9194831013916502, 'support': 1005}, '1': {'precision': 0.9201596806387226, 'recall': 0.9183266932270916, 'f1-score': 0.9192422731804586, 'support': 1004}, 'accuracy': 0.9193628670980587, 'macro avg': {'precision': 0.9193648452846046, 'recall': 0.9193623515886702, 'f1-score': 0.9193626872860544, 'support': 2009}, 'weighted avg': {'precision': 0.9193644496472954, 'recall': 0.9193628670980587, 'f1-score': 0.9193627472233892, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9109390125847048, 'recall': 0.9372509960159362, 'f1-score': 0.9239077074128621, 'support': 1004}, '1': {'precision': 0.9354508196721312, 'recall': 0.9084577114427861, 'f1-score': 0.9217566885411409, 'support': 1005}, 'accuracy': 0.9228471876555501, 'macro avg': {'precision': 0.923194916128418, 'recall': 0.9228543537293612, 'f1-score': 0.9228321979770016, 'support': 2009}, 'weighted avg': {'precision': 0.923201016627942, 'recall': 0.9228471876555501, 'f1-score': 0.9228316626313392, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9133858267716536, 'recall': 0.9243027888446215, 'f1-score': 0.9188118811881189, 'support': 1004}, '1': {'precision': 0.9234642497482377, 'recall': 0.9124378109452737, 'f1-score': 0.9179179179179179, 'support': 1005}, 'accuracy': 0.9183673469387755, 'macro avg': {'precision': 0.9184250382599457, 'recall': 0.9183702998949476, 'f1-score': 0.9183648995530184, 'support': 2009}, 'weighted avg': {'precision': 0.9184275465782574, 'recall': 0.9183673469387755, 'f1-score': 0.9183646770634041, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9175879396984925, 'recall': 0.9093625498007968, 'f1-score': 0.9134567283641822, 'support': 1004}, '1': {'precision': 0.9102564102564102, 'recall': 0.918407960199005, 'f1-score': 0.9143140168400199, 'support': 1005}, 'accuracy': 0.913887506222001, 'macro avg': {'precision': 0.9139221749774513, 'recall': 0.9138852549999008, 'f1-score': 0.913885372602101, 'support': 2009}, 'weighted avg': {'precision': 0.9139203503061119, 'recall': 0.913887506222001, 'f1-score': 0.9138855859640911, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9067713444553483, 'recall': 0.9194029850746268, 'f1-score': 0.9130434782608694, 'support': 1005}, '1': {'precision': 0.9182643794147326, 'recall': 0.9054726368159204, 'f1-score': 0.9118236472945891, 'support': 1005}, 'accuracy': 0.9124378109452737, 'macro avg': {'precision': 0.9125178619350405, 'recall': 0.9124378109452735, 'f1-score': 0.9124335627777292, 'support': 2010}, 'weighted avg': {'precision': 0.9125178619350405, 'recall': 0.9124378109452737, 'f1-score': 0.9124335627777294, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9073170731707317, 'recall': 0.9253731343283582, 'f1-score': 0.9162561576354681, 'support': 1005}, '1': {'precision': 0.9238578680203046, 'recall': 0.9054726368159204, 'f1-score': 0.9145728643216081, 'support': 1005}, 'accuracy': 0.9154228855721394, 'macro avg': {'precision': 0.9155874705955181, 'recall': 0.9154228855721394, 'f1-score': 0.9154145109785381, 'support': 2010}, 'weighted avg': {'precision': 0.9155874705955181, 'recall': 0.9154228855721394, 'f1-score': 0.915414510978538, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.916076845298281, 'recall': 0.9014925373134328, 'f1-score': 0.9087261785356068, 'support': 1005}, '1': {'precision': 0.9030362389813908, 'recall': 0.9174129353233831, 'f1-score': 0.9101678183613031, 'support': 1005}, 'accuracy': 0.909452736318408, 'macro avg': {'precision': 0.9095565421398359, 'recall': 0.909452736318408, 'f1-score': 0.909446998448455, 'support': 2010}, 'weighted avg': {'precision': 0.9095565421398359, 'recall': 0.909452736318408, 'f1-score': 0.9094469984484549, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9086395233366436, 'recall': 0.9104477611940298, 'f1-score': 0.9095427435387673, 'support': 1005}, '1': {'precision': 0.9102691924227319, 'recall': 0.9084577114427861, 'f1-score': 0.9093625498007969, 'support': 1005}, 'accuracy': 0.909452736318408, 'macro avg': {'precision': 0.9094543578796876, 'recall': 0.909452736318408, 'f1-score': 0.9094526466697821, 'support': 2010}, 'weighted avg': {'precision': 0.9094543578796876, 'recall': 0.909452736318408, 'f1-score': 0.9094526466697822, 'support': 2010}}




*** K-Fold Split Results ***
{'0': {'precision': 0.920892494929006, 'recall': 0.9034825870646767, 'f1-score': 0.9121044701155198, 'support': 1005}, '1': {'precision': 0.9051808406647116, 'recall': 0.9223107569721115, 'f1-score': 0.9136655155402071, 'support': 1004}, 'accuracy': 0.9128919860627178, 'macro avg': {'precision': 0.9130366677968589, 'recall': 0.9128966720183941, 'f1-score': 0.9128849928278635, 'support': 2009}, 'weighted avg': {'precision': 0.9130405781139977, 'recall': 0.9128919860627178, 'f1-score': 0.912884604314816, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9151873767258383, 'recall': 0.9233830845771144, 'f1-score': 0.9192669638434869, 'support': 1005}, '1': {'precision': 0.9226130653266331, 'recall': 0.9143426294820717, 'f1-score': 0.9184592296148073, 'support': 1004}, 'accuracy': 0.9188651070184172, 'macro avg': {'precision': 0.9189002210262357, 'recall': 0.918862857029593, 'f1-score': 0.9188630967291471, 'support': 2009}, 'weighted avg': {'precision': 0.918898372920561, 'recall': 0.9188651070184172, 'f1-score': 0.9188632977580741, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9051724137931034, 'recall': 0.9402985074626866, 'f1-score': 0.9224011713030746, 'support': 1005}, '1': {'precision': 0.9378238341968912, 'recall': 0.901394422310757, 'f1-score': 0.9192483494159471, 'support': 1004}, 'accuracy': 0.9208561473369836, 'macro avg': {'precision': 0.9214981239949973, 'recall': 0.9208464648867218, 'f1-score': 0.9208247603595109, 'support': 2009}, 'weighted avg': {'precision': 0.921489997708187, 'recall': 0.9208561473369836, 'f1-score': 0.9208255450339478, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.8980769230769231, 'recall': 0.9302788844621513, 'f1-score': 0.913894324853229, 'support': 1004}, '1': {'precision': 0.9277605779153767, 'recall': 0.8945273631840795, 'f1-score': 0.9108409321175277, 'support': 1005}, 'accuracy': 0.9123942259830762, 'macro avg': {'precision': 0.9129187504961499, 'recall': 0.9124031238231154, 'f1-score': 0.9123676284853783, 'support': 2009}, 'weighted avg': {'precision': 0.9129261381653482, 'recall': 0.9123942259830762, 'f1-score': 0.9123668685568727, 'support': 2009}}




*** K-Fold Split Results ***
{'0': {'precision': 0.9083011583011583, 'recall': 0.9372509960159362, 'f1-score': 0.9225490196078432, 'support': 1004}, '1': {'precision': 0.935251798561151, 'recall': 0.9054726368159204, 'f1-score': 0.9201213346814964, 'support': 1005}, 'accuracy': 0.9213539074166252, 'macro avg': {'precision': 0.9217764784311546, 'recall': 0.9213618164159283, 'f1-score': 0.9213351771446698, 'support': 2009}, 'weighted avg': {'precision': 0.9217831859075759, 'recall': 0.9213539074166252, 'f1-score': 0.9213345729423487, 'support': 2009}}
*** K-Fold Split Results ***
{'0': {'precision': 0.9171656686626747, 'recall': 0.9153386454183267, 'f1-score': 0.9162512462612165, 'support': 1004}, '1': {'precision': 0.9155908639523337, 'recall': 0.9174129353233831, 'f1-score': 0.9165009940357853, 'support': 1005}, 'accuracy': 0.9163763066202091, 'macro avg': {'precision': 0.9163782663075042, 'recall': 0.9163757903708549, 'f1-score': 0.9163761201485009, 'support': 2009}, 'weighted avg': {



In [30]:
# calculate average metrics for submission
import numpy
final_dict = {}
final_dict['0-precision'] = []
final_dict['0-recall'] = []
final_dict['0-f1'] = []
final_dict['1-precision'] = []
final_dict['1-recall'] = []
final_dict['1-f1'] = []
final_dict['accuracy'] = []

for dictionary in classification_report_list:
    for key, inner_dict in dictionary.items():
        if key == '0':
            for inner_key in inner_dict.keys():
                if inner_key == "precision":
                    final_dict['0-precision'].append(inner_dict['precision'])
                if inner_key == "recall":
                    final_dict['0-recall'].append(inner_dict['recall'])
                if inner_key == "f1-score":
                    final_dict['0-f1'].append(inner_dict['f1-score'])
        if key == '1':
            for inner_key in inner_dict.keys():
                if inner_key == "precision":
                    final_dict['1-precision'].append(inner_dict['precision'])
                if inner_key == "recall":
                    final_dict['1-recall'].append(inner_dict['recall'])
                if inner_key == "f1-score":
                    final_dict['1-f1'].append(inner_dict['f1-score'])
        if key == "accuracy":
            final_dict['accuracy'].append(dictionary['accuracy'])


for key in final_dict.keys():
    final_dict[key] = numpy.average(final_dict[key])

print("******** NN FINAL DICT ********")
print(final_dict)

******** NN FINAL DICT ********
{'0-precision': 0.9095488229235198, '0-recall': 0.9210384994020602, '0-f1': 0.9151885904652037, '1-precision': 0.9201794752698322, '1-recall': 0.9082988113879474, '1-f1': 0.9141281012390716, 'accuracy': 0.9146682383668185}


In [31]:
# classification report on CV with VotingClassifier
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, make_scorer

# define model
model = RandomForestClassifier(random_state = 42)

classification_report_list = []
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
for train_index, test_index in cv.split(X, y):
    x_train, x_test = np.array(X)[train_index], np.array(X)[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print("*** K-Fold Split Results ***")
    classification_report_results = classification_report(y_test, y_pred, output_dict=True)
    print(classification_report_results)
    classification_report_list.append(classification_report_results)

*** K-Fold Split Results ***
{'0': {'precision': 0.9104638619201726, 'recall': 0.8398009950248756, 'f1-score': 0.8737060041407867, 'support': 1005}, '1': {'precision': 0.8513388734995383, 'recall': 0.9174129353233831, 'f1-score': 0.8831417624521072, 'support': 1005}, 'accuracy': 0.8786069651741294, 'macro avg': {'precision': 0.8809013677098554, 'recall': 0.8786069651741293, 'f1-score': 0.8784238832964469, 'support': 2010}, 'weighted avg': {'precision': 0.8809013677098555, 'recall': 0.8786069651741294, 'f1-score': 0.8784238832964469, 'support': 2010}}
*** K-Fold Split Results ***
{'0': {'precision': 0.9114194236926361, 'recall': 0.8497512437810946, 'f1-score': 0.8795056642636457, 'support': 1005}, '1': {'precision': 0.8592730661696178, 'recall': 0.9174129353233831, 'f1-score': 0.8873917228103946, 'support': 1005}, 'accuracy': 0.8835820895522388, 'macro avg': {'precision': 0.8853462449311269, 'recall': 0.8835820895522388, 'f1-score': 0.8834486935370202, 'support': 2010}, 'weighted avg': 

In [32]:
# calculate average metrics for submission
import numpy
final_dict = {}
final_dict['0-precision'] = []
final_dict['0-recall'] = []
final_dict['0-f1'] = []
final_dict['1-precision'] = []
final_dict['1-recall'] = []
final_dict['1-f1'] = []
final_dict['accuracy'] = []

for dictionary in classification_report_list:
    for key, inner_dict in dictionary.items():
        if key == '0':
            for inner_key in inner_dict.keys():
                if inner_key == "precision":
                    final_dict['0-precision'].append(inner_dict['precision'])
                if inner_key == "recall":
                    final_dict['0-recall'].append(inner_dict['recall'])
                if inner_key == "f1-score":
                    final_dict['0-f1'].append(inner_dict['f1-score'])
        if key == '1':
            for inner_key in inner_dict.keys():
                if inner_key == "precision":
                    final_dict['1-precision'].append(inner_dict['precision'])
                if inner_key == "recall":
                    final_dict['1-recall'].append(inner_dict['recall'])
                if inner_key == "f1-score":
                    final_dict['1-f1'].append(inner_dict['f1-score'])
        if key == "accuracy":
            final_dict['accuracy'].append(dictionary['accuracy'])


for key in final_dict.keys():
    final_dict[key] = numpy.average(final_dict[key])

print("******** RF FINAL DICT ********")
print(final_dict)

******** RF FINAL DICT ********
{'0-precision': 0.9035907867310111, '0-recall': 0.8663618824866374, '0-f1': 0.8844828509334206, '1-precision': 0.8718672346884717, '1-recall': 0.9074679391885195, '1-f1': 0.8892238602355227, 'accuracy': 0.8869151092059199}
