In [388]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [389]:
points_file = open('StudentGrades.txt', 'r')
lines = points_file.readlines()
lines = [line.rstrip() for line in lines]
lines = [line for line in lines if line]
lines

['GRADES - MIDTERM 1',
 '------------------',
 'S1 - 78',
 'S2 - 82',
 'S3 - 77',
 'S4 - 75',
 'S5 - 67',
 'S6 - 71',
 'S7 - 64',
 'S8 - 92',
 'S9 - 80',
 'S10 - 89',
 'GRADES - MIDTERM 2',
 '------------------',
 'S1 - 82',
 'S2 - 85',
 'S3 - 90',
 'S4 - 77',
 'S5 - 77',
 'S6 - 64',
 'S7 - 33',
 'S8 - 88',
 'S9 - 39',
 'S10 - 64',
 'GRADES - FINAL (OUT OF 200)',
 '---------------------------',
 'S1 - 182',
 'S2 - 180',
 'S3 - 188',
 'S4 - 149',
 'S5 - 157',
 'S6 - 175',
 'S7 - 110',
 'S8 - 184',
 'S9 - 126',
 'S10 - 116']

In [3]:
midterm_points = {}
midterm2_points = {}
final_points = {}

for i, line in enumerate(lines):
    if "MIDTERM 1" in line:
        for midterm_line in lines[i+2:i+12]:
            student = midterm_line
            if 'S10' in student:
                student = midterm_line[:3]
            else:
                student = midterm_line[:2]
            grade = midterm_line[-2:]
            midterm_points[student] = int(grade)
            
    if 'MIDTERM 2' in line:
        for midterm2_line in lines[i+2:i+12]:
            student = midterm2_line
            if 'S10' in student:
                student = midterm2_line[:3]
            else:
                student = midterm2_line[:2]
            grade = midterm2_line[-2:]
            midterm2_points[student] = int(grade)
            
    if 'FINAL' in line:
        for final_line in lines[i+2:i+12]:
            student = final_line
            if 'S10' in student:
                student = final_line[:3]
            else:
                student = final_line[:2]
            grade = final_line[-3:]
            final_points[student] = int(grade)/2

final_points

{'S1': 91.0,
 'S2': 90.0,
 'S3': 94.0,
 'S4': 74.5,
 'S5': 78.5,
 'S6': 87.5,
 'S7': 55.0,
 'S8': 92.0,
 'S9': 63.0,
 'S10': 58.0}

In [4]:
from scipy.signal import butter, filtfilt

def lowpass_butter_filter(data, cutoff, sampling_frequency, order=4):
    nyq = 0.5 * sampling_frequency
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    y = filtfilt(b, a, data, axis=0)

    return pd.DataFrame(y, columns=[data.columns[0]])

<h3>Koristili smo niskopropusni butter filter kao i u članku da izdvojimo tonic component signala kojeg mjerimo.</h3>

In [5]:
cutoff_freq = pd.DataFrame(
    {
        'Midterm 1': [0.002, 0.002, 0.0002, 0.002, 0.001, 0.002, 0.001, 0.0002, 0.002, 0.002],
        'Midterm 2': [0.002, 0.001, 0.0002, 0.002, 0.001, 0.002, 0.0002, 0.002, 0.002, 0.002],
        'Final': [0.002, 0.002, 0.001, 0.002, 0.001, 0.002, 0.0002, 0.002, 0.002, 0.002],
    }
)
cutoff_freq

Unnamed: 0,Midterm 1,Midterm 2,Final
0,0.002,0.002,0.002
1,0.002,0.001,0.002
2,0.0002,0.0002,0.001
3,0.002,0.002,0.002
4,0.001,0.001,0.001
5,0.002,0.002,0.002
6,0.001,0.0002,0.0002
7,0.0002,0.002,0.002
8,0.002,0.002,0.002
9,0.002,0.002,0.002


<h3>Za cutoff frekvencije poslužili smo se tablicom iz članka te ih pohranili kao dictionary.</h3>

In [227]:
time_reads = [5,15,30]
tests = ['Midterm 1', 'Midterm 2', 'Final']
eda_lengths = {}
hr_lengths = {}
temp_lengths = {}

for test in tests:
    eda_len = []
    hr_len = []
    temp_len = []
    for i in range(1,11):
        eda_df = pd.read_csv('Data/S' + str(i) + '/' + str(test) + '/EDA.csv')
        eda_len.append(len(eda_df))
        hr_df = pd.read_csv('Data/S' + str(i) + '/' + str(test) + '/HR.csv')
        hr_len.append(len(hr_df))
        temp_df = pd.read_csv('Data/S' + str(i) + '/' + str(test) + '/TEMP.csv')
        temp_len.append(len(temp_df))
    print('EDA_' + str(test) + ' : ' + str(eda_len))
    print('HR_' + str(test) + ' : ' + str(hr_len))
    print('IBI_' + str(test) + ' : ' + str(temp_len))
    print('----------------------------------------------------------------------------------------------------')
    eda_lengths[test] = max(eda_len)
    hr_lengths[test] = max(hr_len)
    temp_lengths[test] = max(temp_len) 
        
eda_lengths

EDA_Midterm 1 : [44713, 47935, 48853, 46777, 48007, 44611, 49537, 43123, 50725, 46747]
HR_Midterm 1 : [11170, 11976, 12204, 11685, 11993, 11145, 12375, 10772, 12673, 11679]
IBI_Midterm 1 : [44713, 47937, 48849, 46777, 48001, 44609, 49529, 43121, 50721, 46745]
----------------------------------------------------------------------------------------------------
EDA_Midterm 2 : [44545, 55537, 40981, 53569, 48067, 56761, 43255, 39847, 49729, 51967]
HR_Midterm 2 : [11128, 13876, 10236, 13384, 12008, 14182, 10805, 9953, 12423, 12983]
IBI_Midterm 2 : [44545, 55537, 40977, 53569, 48065, 56761, 43257, 39841, 49721, 51969]
----------------------------------------------------------------------------------------------------
EDA_Final : [93583, 101347, 103285, 63667, 60997, 95683, 78595, 71587, 56827, 92293]
HR_Final : [23388, 25328, 25813, 15908, 15241, 23913, 19640, 17888, 14198, 23065]
IBI_Final : [93585, 101345, 103289, 63633, 60969, 95657, 78593, 71585, 56825, 92289]
---------------------------

{'Midterm 1': 50725, 'Midterm 2': 56761, 'Final': 103285}

<h3>Ovdje smo koristili 3 indikatora(EDA, HR, TEMP) za koja smo odlučili izvući maksimum veličinu datoteka za svaki student na svakom ispitu.</h3>

In [237]:
def features_of_indicator(indicator, test, segm_block, i):
    indicator_df = pd.read_csv('Data/S' + str(i) + '/' + str(test) + '/' + str(indicator))
    indicator_df.columns=['Value']
    #print(indicator_df)
    indicator_sample_rate = int(indicator_df.at[0, 'Value'])
    #print(indicator_sample_rate)
    indicator_df = indicator_df.iloc[indicator_sample_rate*2*60*5:]
    
    cutoff = cutoff_freq.at[i-1, test]
    df = lowpass_butter_filter(indicator_df, cutoff, 4.000)
    
    if indicator=='EDA.csv':
        empty_df = pd.DataFrame('aubergine', index=range(df.shape[0], eda_lengths[test]), columns=['Value'])
    if indicator=='HR.csv':
        empty_df = pd.DataFrame('aubergine', index=range(df.shape[0], hr_lengths[test]), columns=['Value'])
    if indicator=='TEMP.csv':
        empty_df = pd.DataFrame('aubergine', index=range(df.shape[0], temp_lengths[test]), columns=['Value'])
        
    #print('Duljina praznog: ' + str(len(empty_df)))
    df = pd.concat([df, empty_df])
    # print(df)
    list_of_means = []
    list_of_var = []
    
    range1 = segm_block*indicator_sample_rate
    index_row=0
    while True:
        data = df.iloc[range(index_row,index_row+range1)]
        # print(data)
        # print(data['Value'].dtype)
        if 'aubergine' in str(data['Value']):
            #print('Evo meeeeeeeeeeeeeeee')
            list_of_means.append(np.nan)
            list_of_var.append(np.nan)
        else:            
            #print(np.mean(data['Value']))
            list_of_means.append(np.mean(data['Value']))
            list_of_var.append(np.var(data['Value']))
        
        index_row = index_row + range1  
        if index_row + range1 > df.shape[0]:
            # print('Lista meanova: ' + str(list_of_means))
            # print('Lista varova: ' + str(list_of_var))
            break  
    
    ro_list = []
    for j, mi in enumerate(list_of_means):
        if j == 0 or j==len(list_of_means) - 1:
            continue
        else:
            if np.any(list_of_means[j-1])==False and np.any(list_of_means[j+1])==False:
                #print('Evo me')
                ro_list.append(np.nan)
                continue
            else:
                ro_list.append(mi/(list_of_means[j-1] + list_of_means[j+1]))
    #print('Lista ro-ova: ' + str(ro_list))
    
    last_data = indicator_df[indicator_df.shape[0] - range1:indicator_df.shape[0]]
    first_data = indicator_df[:range1]
    
    diff = np.array(last_data) - np.array(first_data)
    #print(str(diff))
    mi_diff = np.mean (diff)
    sigma_diff = np.var (diff)
    
    feature_list = list_of_means + list_of_var + ro_list
    feature_list.append(mi_diff)
    feature_list.append(sigma_diff)
    
    #print(feature_list)   
    
    return feature_list

<h3>Ova funkcija nam je dobro došla da pokušamo izvući listu featurea za svaki indikator koji smo koristili.</h3>

In [346]:
#Konačni feature list extractain iz odrezanog EDA
def feature_engineer_from_test(test, time_read):
    all_students = []
    for i in range (1, 11):      
        #ucitavanje, uzimanje y stupca, micanje retka sa frekvencijom
        segm_block = time_read*60*2

        #indicators = ['EDA.csv', 'HR.csv', 'TEMP.csv']
        indicators = ['EDA.csv']
        features = []
        for ind in indicators:
            features += features_of_indicator(ind, test, segm_block, i)

        all_students.append(features)
    return pd.DataFrame(all_students)

<h3>Ovom funkcijom smo koristili konačni dataset na kojem kasnije treniramo modele.</h3>

In [347]:
midterm1_5min = feature_engineer_from_test('Midterm 1', 5)
midterm2_5min = feature_engineer_from_test('Midterm 2', 5)
final_5min = feature_engineer_from_test('Final', 5)

In [348]:
midterm1_5min = midterm1_5min[midterm1_5min.columns[midterm1_5min.isnull().mean() < 0.5]]
midterm2_5min = midterm2_5min[midterm2_5min.columns[midterm2_5min.isnull().mean() < 0.5]]
final_5min = final_5min[final_5min.columns[final_5min.isnull().mean() < 0.5]]

<h3>Ovdje smo izbacili sve stupce koje imaju više od 50% nepostojećih vrijednosti.</h3>

In [349]:
midterm1_5min

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,61,62
0,0.023558,0.024013,0.024253,0.020122,0.350664,0.599025,1.426195,1.832469,0.59183,0.150289,...,0.211066,0.339103,0.663466,0.496044,0.488026,0.598112,0.556161,,0.002851828,2.622926e-06
1,0.011017,0.011696,0.012597,0.011563,-0.002815,0.806185,0.369805,0.248863,0.231293,0.292199,...,0.501149,0.769172,0.361509,0.400242,0.423582,0.252625,2.860007,0.033626,0.002057328,2.196158e-06
2,-0.035269,0.05456,0.204552,0.381441,0.527286,0.587605,0.544867,0.42937,0.299409,0.207294,...,0.435735,0.434535,0.469434,0.504416,0.521269,0.521193,0.509957,0.496633,0.001191318,1.786869e-06
3,6.5e-05,0.000899,0.003426,1.362053,2.168277,1.289558,0.482646,0.401186,0.209574,0.138801,...,0.437893,0.453566,0.481797,0.521145,0.466347,0.500743,1.022696,0.003908,0.0,0.0
4,0.000213,0.001649,-0.017824,0.207071,0.620607,0.239847,0.271538,0.124322,0.15433,0.192809,...,0.60641,0.428813,0.550139,0.439089,0.48474,0.780323,0.495411,-0.064857,0.0,0.0
5,3e-06,1.3e-05,0.000132,0.002403,0.286172,1.409713,1.277045,1.338024,1.3812,1.14945,...,0.481332,0.487067,0.541821,0.506979,0.335201,0.663305,0.791324,,-5.3375e-07,6.834489e-10
6,2.2e-05,-0.000275,0.001705,-0.00852,0.138719,0.539415,0.468113,0.257041,0.244467,0.298659,...,0.642723,0.390334,0.584781,0.548509,0.357762,0.809513,0.504685,-0.060641,7.046042e-05,4.624621e-06
7,-0.012832,-0.002033,0.028221,0.08195,0.158949,0.256792,0.374482,0.51093,0.657904,0.795957,...,0.511511,0.515443,0.514133,0.508219,0.500536,0.495274,,,0.003603751,2.14623e-06
8,0.014018,0.013829,0.014098,0.070113,0.252524,0.192597,0.202117,0.252668,0.290571,0.312583,...,0.507965,0.527011,0.472908,0.489873,0.517412,0.656935,0.507179,0.086979,0.0004133204,2.032273e-06
9,0.019049,0.019262,0.019729,0.018544,0.126378,0.365495,0.319947,0.283292,0.334114,0.370694,...,0.531852,0.49762,0.493826,0.506593,0.500296,0.659052,0.470294,0.120673,0.003154166,0.0001089506


In [350]:
def impute_scale_and_generate_PCA(df):
    #df = df[df.columns[df.isnull().mean]]
    for column in df.columns:
        df[column].fillna(np.mean(df[column]), inplace=True)
        df[column] = StandardScaler().fit_transform(df[[column]])
    #print(df)
    #df.to_excel('Imputed_DF.xlsx')    
    i = 2
    while True:
        pca = PCA(n_components=i).fit(df)
        if np.sum(pca.explained_variance_ratio_) >= 0.9:
            if i < 10:
                i=i+1
            else:
                return pca.transform(df), pca.explained_variance_ratio_
        else:
            i+=1

<h3>Ovom funkcijom pokušavam nadopuniti vrijednosti koje nedostaju, skalirati značajke i reducirati dimenzionalnost koriteći se PCA.</h3>

In [351]:
pca_M1, m1_variance = impute_scale_and_generate_PCA(midterm1_5min)
pca_M2, m2_variance = impute_scale_and_generate_PCA(midterm2_5min)
pca_F, F_variance = impute_scale_and_generate_PCA(final_5min)

In [352]:
df_M1 = pd.DataFrame(pca_M1, columns=['Component_' + str(i) for i, value in enumerate(pca_M1[0])])
df_M1

Unnamed: 0,Component_0,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,Component_9
0,-0.77503,-1.96559,-4.395038,5.292988,-3.513005,-0.500861,-1.026051,-0.030864,-0.004069,-8.178174e-17
1,-0.312785,7.76382,3.01259,1.730476,-0.675348,-1.616771,-0.823751,0.122827,-0.02978,-1.807807e-15
2,-4.220553,-4.283262,5.196817,3.065204,2.653692,-0.686572,0.246978,0.223713,0.044902,-1.371131e-16
3,-2.571081,2.501809,-3.474955,0.812811,4.048913,3.64209,0.316428,0.292664,-0.185734,1.321295e-15
4,-1.806319,-1.070597,-1.623628,-3.610754,1.413449,-1.945126,-1.933273,-2.009601,-0.89481,4.453896e-17
5,10.315835,-0.949037,0.133663,1.313888,2.15541,-0.433566,0.652216,-0.502643,-0.039307,4.979614e-17
6,-0.054073,-0.807881,-2.075608,-3.056722,0.606929,-2.270026,0.013987,2.976629,0.389362,7.270487e-16
7,1.941033,-1.382929,3.160474,-2.37375,-2.88873,3.501653,-1.996814,0.615006,-0.091275,6.282622e-17
8,-1.344107,0.095299,-0.313411,-1.83049,-1.026958,0.148068,1.188448,-1.483706,2.115815,3.025679e-16
9,-1.17292,0.098369,0.379096,-1.343651,-2.774352,0.161112,3.361831,-0.204025,-1.305104,3.896084e-16


In [353]:
df_M2 = pd.DataFrame(pca_M2, columns=['Component_' + str(i) for i, value in enumerate(pca_M2[0])])
df_F = pd.DataFrame(pca_F, columns=['Component_' + str(i) for i, value in enumerate(pca_F[0])])
df_F

Unnamed: 0,Component_0,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,Component_9
0,-5.978851,-4.978782,1.201851,-1.237065,-1.578874,0.209219,1.210325,-1.473791,-1.995383,7.70019e-16
1,-4.451987,-3.138685,-0.372273,0.345233,0.210737,-1.990393,-2.284554,2.889016,-0.445683,9.747268e-16
2,0.943059,3.490255,-4.504805,7.35589,-4.429999,-0.117029,0.151446,-0.24178,-0.139138,7.510723e-16
3,2.463786,4.399248,-5.70903,-6.86771,-1.143671,-2.825399,0.376668,-0.167291,-0.036334,-7.62228e-16
4,3.967524,1.072311,0.37505,-2.485272,-1.087034,6.353345,-1.845226,0.084318,-0.205025,6.310062e-16
5,-2.500734,1.801576,-3.69427,2.25192,7.7982,0.86234,0.115446,-0.657014,-0.136092,-9.066645e-16
6,-2.621934,7.689498,8.610175,0.1816,0.140605,-1.488083,-0.320277,-0.332551,0.030818,-1.298032e-16
7,12.815709,-4.701917,2.712705,1.340882,1.279914,-2.367984,-0.308267,-0.550134,-0.132836,-1.487975e-15
8,0.610507,-1.154051,1.394811,-0.296756,0.005152,1.709464,3.748149,1.920325,0.788226,5.401911e-16
9,-5.247079,-4.479454,-0.014213,-0.58872,-1.19503,-0.345479,-0.84371,-1.471098,2.271446,7.872982e-16


In [354]:
#LABEL ENGINEERING
def label_engineering(points_data):
    y = []
    for el in points_data:
        if(points_data[el] >= 80):
            y.append(1)
        else:
            y.append(0)
    #print(pd.DataFrame(y))
    return pd.DataFrame(y)

midterm_y = label_engineering(midterm_points)
midterm2_y = label_engineering(midterm2_points)
final_y = label_engineering(final_points)

Promijenjene su oznake za studente koji su ostvarii 80+ posto se postavlja 1, a za one koji su manje je 0.

In [386]:
def new_MLmodel(dataframe, y):
    kf = KFold(n_splits=10)
    print(kf.split(dataframe))
    score = []
    for i, (train_index, test_index) in enumerate(kf.split(dataframe)):
        # print(f"Fold {i}:")
        # print(f"  Train: index={train_index}")
        # print(f"  Test:  index={test_index}")
        
        x_train = dataframe.loc[train_index]
        x_test = dataframe.loc[test_index]
        y_train = y.loc[train_index]
        y_test = y.loc[test_index]
        
        # params = {'n_neighbors':list(range(1,9,2)), 'weights':('distance','uniform')}
        # knc = KNeighborsClassifier()
        # clf = GridSearchCV(knc, param_grid=params,cv=2,n_jobs=-1)
        # clf.fit(x_train, y_train)
        
        # neigh = KNeighborsClassifier(n_neighbors=clf.best_params_['n_neighbors'], 
        #                              weights=clf.best_params_['weights']).fit(np.array(x_train), np.array(y_train))
    
        #neigh = KNeighborsClassifier(n_neighbors=3).fit(np.array(x_train), np.array(y_train))
        
        #TOČNOST: 60, 30, 50
        
        
        # params = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'C':np.linspace(1, 100, 50)}
        # knc = SVC()
        # clf = GridSearchCV(knc, param_grid=params,cv=2,n_jobs=-1)
        # clf.fit(x_train, y_train)   
        
        # neigh = SVC(kernel=clf.best_params_['kernel'], C=clf.best_params_['C'], degree=3).fit(np.array(x_train), np.array(y_train))
        # TOČNOST: 70, 20, 20
        
        params = {'max_depth':list(range(1,7)), 'criterion':('gini','entropy', 'log_loss')}
        knc = DecisionTreeClassifier()
        clf = GridSearchCV(knc, param_grid=params,cv=4,n_jobs=-1)
        clf.fit(x_train, y_train)
        print('Best params: max_depth: ' + str(clf.best_params_['max_depth'])
               + ', criterion: ' + str(clf.best_params_['criterion']))
        
        neigh = DecisionTreeClassifier(max_depth=clf.best_params_['max_depth'], 
                                        criterion=clf.best_params_['criterion']).fit(np.array(x_train), np.array(y_train))
    
        # score.append(neigh.score(x_test, y_test))
        #TOČNOST: 80, 40, 60
        
        
        
    return (score.count(1.0)/len(score)) * 100

Koristili smo se 10Foldom da napravimo 10 različitih konfiguracija skupa za učenje i skupa za ispitivanje, tako da ima 9 primjera u skupu za učenje i samo 1 u skupu za treniranje.
Pokušali smo implementirati razne algoritme strojnog učenja da vidimo oćemo li dobiti efektivnije rezultate nego što su u članku.

In [387]:
print(f"Točnost za Midterm 1 (5 min): {new_MLmodel(df_M1, midterm_y)} %")
print(f"Točnost za Midterm 1 (5 min): {new_MLmodel(midterm1_5min, midterm_y)} %")
print(f"Točnost za Midterm 2 (5 min): {new_MLmodel(df_M2, midterm2_y)} %")
print(f"Točnost za Midterm 2 (5 min): {new_MLmodel(midterm2_5min, midterm2_y)} %")
print(f"Točnost za Final (5 min): {new_MLmodel(df_F, final_y)} %")
print(f"Točnost za Final (5 min): {new_MLmodel(final_5min, final_y)} %")

<generator object _BaseKFold.split at 0x00000255A5C02120>


24 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 1.         0.83333333 1.         1.         0.8333333

Točnost za Midterm 1 (5 min): 80.0 %
<generator object _BaseKFold.split at 0x00000255A531F660>


24 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.66666667 0.54166667 0.54166667 0.66666667 0.6666666

Točnost za Midterm 1 (5 min): 60.0 %
<generator object _BaseKFold.split at 0x00000255A531F510>


24 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.66666667 0.66666667 0.66666667 0.66666667 0.75     

Točnost za Midterm 2 (5 min): 40.0 %
<generator object _BaseKFold.split at 0x00000255A531F510>


24 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.41666667 0.54166667 0.54166667 0.54166667 0.5416666

Točnost za Midterm 2 (5 min): 60.0 %
<generator object _BaseKFold.split at 0x00000255A1290820>


24 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.66666667 0.79166667 0.66666667 0.79166667 0.6666666

Točnost za Final (5 min): 60.0 %
<generator object _BaseKFold.split at 0x00000255A531F660>


24 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.5        0.41666667 0.79166667 0.25       0.25     

Točnost za Final (5 min): 30.0 %


24 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\eleko\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.08333333 0.08333333 0.08333333 0.08333333 0.0833333

<h3>Konačno, poušaj je bio neuspješan. Najbolji se pokazao Decision Tree Classifier koji je ostvario 80% točnosti na Midterm 1, 40% na Midterm 2 i 60% na Final. Iako smo ostvarili bolje rezultate za Midterm 1, na Midtermu 2 su vrlo loši. Čak i za ostale algoritme, na Midtermu smo ostvarili lošije rezultate. Final je najkaotičniji, budući da rezultati nekad mogu biti solidni a nekad loši s obzirom na algoritam.
Mislimo da je stvar u tome što su signali EDA veoma različiti za svakog studenta na Midterm 2, puno slučajniji i nepredvidiviji nego što su za druge ispite, te se zbog toga tu mogu vidjeti lošiji rezultati. Kako se vidi, ljudi koji su radili na datasetu u članku nisu imali toliko problema sa Midterm 2, tako da to uzimamo kao neuspjeh. </h3>