# Automatic Lab Evaluator

## Assessment based on student-provided results

Version History:

Version 0.1 - Jerónimo Arenas García, Dec. 2016

In [1]:
import numpy as np
import pandas as pd
import os
from os.path import isfile, join
import scipy.io as sio
import scipy
import zipfile as zp
import shutil
import difflib

## 1. Read datafiles for all students

Student datafiles can be in any of the following formats:

   * `'.zip'`: When uncompressed, the zip may contain one or several MATLAB files. All MATLAB files are read and incorporated into a pandas DataFrame where each student is a column, and each index entry is a variable available for the exam solution
   * `'.mat'`: All data variables for the students are given in a single MATLAB file

In [7]:
def readdatafiles(datafiles_path, splitsymbol):
    """Read the MATLAB variables of every student found in a folder.

    Every regular file in `datafiles_path` whose name ends in '.zip' or
    '.mat' contributes one column to the result. The column label is the
    part of the file name that precedes the first `splitsymbol`; it is
    printed while the file is processed.

    Input parameters:
    datafiles_path: Folder holding one file per student
    splitsymbol: Separator between the student identifier and the rest
                 of the file name

    Output: A DataFrame with one column per student and one row per
    variable name found in any file. Rows follow the order in which the
    variable names were first seen; a variable that a student does not
    have is left as NaN. Files without any variable add no column.
    """
    import io  # only needed here, to hand zip members to loadmat in memory

    def _load_variables(source):
        # loadmat adds bookkeeping keys such as '__header__'; keep user data only
        contents = sio.loadmat(source)
        return {name: value for name, value in contents.items()
                if not name.startswith('_')}

    columns = {}
    var_order = []

    datafiles = [f for f in os.listdir(datafiles_path) if isfile(join(datafiles_path, f))]
    for dtfile in datafiles:
        lowered = dtfile.lower()
        if not lowered.endswith(('.zip', '.mat')):
            continue

        NIA = dtfile.split(splitsymbol)[0]
        print(NIA)
        full_path = join(datafiles_path, dtfile)

        # A dict keeps one value per variable name even when several
        # MAT-files inside the same zip define the same variable
        variables = {}
        if lowered.endswith('.zip'):
            # Members are read in memory, so no temporary folder has to be
            # created (and later deleted) in the working directory
            with zp.ZipFile(full_path) as zpobj:
                for fileinzip in zpobj.namelist():
                    if not fileinzip.lower().endswith('.mat'):
                        continue
                    # Zips made on macOS carry '__MACOSX/._name.mat' entries
                    # that are not MAT-files
                    if (fileinzip.startswith('__MACOSX/')
                            or os.path.basename(fileinzip).startswith('._')):
                        continue
                    variables.update(_load_variables(io.BytesIO(zpobj.read(fileinzip))))
        else:
            variables.update(_load_variables(full_path))

        if variables:
            columns[NIA] = pd.Series(list(variables.values()), index=list(variables))
            for name in variables:
                if name not in var_order:
                    var_order.append(name)

    # Building the frame in one step keeps the union of all variable names.
    # Assigning column by column would align every student to the variables
    # of the first one and silently drop the rest.
    return pd.DataFrame(columns, index=var_order)
        

In [8]:
###########################################
# EXAM DEPENDENT VARIABLE
datafiles_path = './datafiles_Gbil/'
##########################################

# Data files are named '<NIA>.<extension>', hence the '.' separator
student_data = readdatafiles(datafiles_path, splitsymbol='.')

# Rows are variables, columns are students
n_variables, n_students = student_data.shape
print('Number of students in dataframe:', n_students)
print('Number of variables read:', n_variables)

print('Displaying data for first three students ... ')
first_three = student_data.columns[:3]
student_data[first_three]

100304972
100329922
100316478
100346898
100346250
100346708
100318675
100346579
100315121
100346595
100381208
Number of students in dataframe: 11
Number of variables read: 6
Displaying data for first three students ... 


Unnamed: 0,100304972,100329922,100316478
xTrain,"[[-6.15140536314, 2.39404062859, 1.23776108947...","[[-0.240276613092, -4.75160788411, -7.83453779...","[[0.60833122942, -1.90656845955, -2.6839553355..."
sTrain,"[[-0.133460836074], [0.926768792694], [0.39554...","[[-2.48879961347], [-0.0845507384467], [1.1063...","[[0.0382367974118], [-1.21628754435], [0.67094..."
xTest,"[[-1.88825185119, 0.751538247372, 3.7655315104...","[[-3.10613054662, 5.46232254359, -2.3689241897...","[[-0.748405211044, 2.01940410882, -3.507158243..."
Xtrain,"[[0.524111415921, 1.26504542985, 1.62115573805...","[[2.55454966472, -1.23192909859, 1.89475732147...","[[-1.2303566627, -1.49775473748, 0.46892128352..."
ytrain,"[[1], [0], [1], [1], [0], [1], [0], [0], [1], ...","[[1], [0], [0], [1], [0], [0], [1], [1], [1], ...","[[0], [0], [0], [0], [1], [0], [0], [1], [0], ..."
Xtest,"[[-0.087790282035, -1.53679230702, 1.745597762...","[[1.98606639599, 2.96860392974, 1.14929368573,...","[[1.07287779331, 0.921767482585, 2.89140408872..."


## 2. Read answers provided by students


### 2.1. Read student results into a pandas DataFrame

In [12]:
###########################################
# EXAM DEPENDENT VARIABLE
results_path = './entregas_Gbil/'
#Requested variable names in the wording of the exam
truenames = ['vTrain', 'xnTrain', 'xnTest', 'we', 'w', 'EAP', 'm0', 'etaNPx1', 'PDx1', 'tm', 'tv', 'ytest']
###########################################

# Deliveries are named '<student name>_<...>', hence the '_' separator
student_results = readdatafiles(results_path, splitsymbol='_')

# Requested variables go first (and are created as empty rows if nobody
# delivered them); any other delivered variable follows
unexpected = [el for el in student_results.index.tolist() if el not in truenames]
newindex = truenames + unexpected
student_results = student_results.reindex(newindex)

n_variables, n_students = student_results.shape
print('Number of students in dataframe:', n_students)
print('Number of variables read:', n_variables)

print('Displaying data for first three students ... ')
first_three = student_results.columns[:3]
student_results[first_three]

LUIS ANTHONY SANTIVAÑEZ CALDAS
MATTHEW MOORCROFT
EDUARDO HERRERA ARRUTI
BEATRIZ GARCÍA SANZ
FELIPE BARBOSA MARTIN
DYLAN LEO PETERS
DIEGO PENROZ VALENZUELA
Number of students in dataframe: 7
Number of variables read: 19
Displaying data for first three students ... 


Unnamed: 0,LUIS ANTHONY SANTIVAÑEZ CALDAS,MATTHEW MOORCROFT,EDUARDO HERRERA ARRUTI
vTrain,"[[2.84267180525, 3.29710073682, 3.55345020725,...",[[4.86143219825]],[[6.19564562001]]
xnTrain,"[[0.389333222624, 0.518440394855, -0.439157914...","[[0.268863058695, 0.624357411398, 0.3029224977...","[[-1.70851177173, -2.36542590894, -3.525578104..."
xnTest,"[[0.489966246279, 0.580523495113, -0.087611225...","[[0.000181359421271, -1.15992919585, -0.470657...","[[-1.71696152202, -1.46488873093, -4.482509852..."
we,"[[0.675936758708], [0.276316396782], [0.347156...","[[-0.0811007827235], [0.451596275303], [0.0320...","[[-0.0931365877392], [0.0529499569442], [0.298..."
w,"[[-0.20381506644], [0.675936758708], [0.276316...",,
EAP,,,
m0,[[1.35747384376]],,"[[1.2110683868, 1.15174695558, 1.12668119413, ..."
etaNPx1,,,
PDx1,,,
tm,,,


### 2.2. Common Mistakes on variable names

In view of all variable names provided by all students, we may decide to allow alternative names for variables without any penalty

In [13]:
print('Number of students in dataframe:', student_results.shape[1])

print('\nDisplaying number of missing data per variable name. \n'
      'Those with a large number are a potential common mistakes\n'
      'for a variable name')

# Per variable (row): how many students did not deliver it
missing_per_variable = student_results.isnull().sum(axis=1)
missing_per_variable

Number of students in dataframe: 7

Displaying number of missing data per variable name. 
Those with a large number are a potential common mistakes
for a variable name


vTrain     0
xnTrain    0
xnTest     0
we         0
w          4
EAP        7
m0         1
etaNPx1    7
PDx1       7
tm         7
tv         7
ytest      7
Xtrain     4
ytrain     4
Xtest      4
xTrain     4
sTrain     4
xTest      4
m1         5
dtype: int64

In [24]:
###########################################
# EXAM DEPENDENT VARIABLE

#Dictionary with accepted mistakes in the following format
#  Expected variable name : Accepted mistake
Mistakes = {}
##########################################

# Where the expected name is missing, take the value delivered under the
# accepted alternative name
for expected, accepted in Mistakes.items():
    student_results.loc[expected] = student_results.loc[expected].fillna(student_results.loc[accepted])

# Only the variables requested in the exam are kept for grading
not_requested = [el for el in student_results.index.tolist() if el not in truenames]
student_results.drop(not_requested, inplace=True)

student_results[student_results.columns[:3]]

Unnamed: 0,LUIS ANTHONY SANTIVAÑEZ CALDAS,MATTHEW MOORCROFT,EDUARDO HERRERA ARRUTI
vTrain,"[[2.84267180525, 3.29710073682, 3.55345020725,...",[[4.86143219825]],[[6.19564562001]]
xnTrain,"[[0.389333222624, 0.518440394855, -0.439157914...","[[0.268863058695, 0.624357411398, 0.3029224977...","[[-1.70851177173, -2.36542590894, -3.525578104..."
xnTest,"[[0.489966246279, 0.580523495113, -0.087611225...","[[0.000181359421271, -1.15992919585, -0.470657...","[[-1.71696152202, -1.46488873093, -4.482509852..."
we,"[[0.675936758708], [0.276316396782], [0.347156...","[[-0.0811007827235], [0.451596275303], [0.0320...","[[-0.0931365877392], [0.0529499569442], [0.298..."
w,"[[-0.20381506644], [0.675936758708], [0.276316...",,
EAP,,,
m0,[[1.35747384376]],,"[[1.2110683868, 1.15174695558, 1.12668119413, ..."
etaNPx1,,,
PDx1,,,
tm,,,


### 2.3. Name to NIA dictionary

Finally, since datafiles are created by NIA and results are available per student name, we need to create a dictionary connecting them

In [18]:
###########################################
# EXAM DEPENDENT VARIABLE
excel_file = 'lista_clase_65.xlsx'
###########################################

student_NIA_names = pd.read_excel(excel_file)

# Class list as plain text. Names are deliberately NOT encoded to UTF-8
# bytes: under Python 3, str() of a bytes object yields its "b'...'" repr
# with escaped accents, which distorts the similarity ratios below.
class_NIAs = student_NIA_names['NIU'].values.tolist()
class_names = [
    (str(first) + ' ' + str(last)).lower()
    for first, last in zip(student_NIA_names['First name'].values.tolist(),
                           student_NIA_names['Surname'].values.tolist())
]

NIA_name = {}

for el in student_results.columns.tolist():

    # Similarity between the delivered name and every name in the class list
    sim_list = [difflib.SequenceMatcher(a=el.lower(), b=std_name).ratio()
                for std_name in class_names]

    # The class-list entry that looks most like the delivered name wins
    # (the first one, in case of ties)
    max_idx = sim_list.index(max(sim_list))
    matched_NIA = class_NIAs[max_idx]

    if matched_NIA in NIA_name:
        # Two deliveries resolved to the same student: the later one replaces
        # the earlier one, so make the collision visible
        print('Warning:', el, 'and', NIA_name[matched_NIA],
              'were both matched to NIA', matched_NIA)

    NIA_name[matched_NIA] = el

#Create name to NIA dictionary
name_NIA = {NIA_name[el]: el for el in NIA_name}


In [8]:
#print name_NIA
#print NIA_name

At this point we have:

   * student_data: dataframe with data given to the students. Each index is a variable, and each column a NIA
   * student_results: dataframe with student results. Each index is a variable, and each column a name
   * NIA_name: NIA to name dictionary
   * name_NIA: name to NIA dictionary

## 3. Exam solution

In this section we implement the solution to the exam. This is a function that takes the variables generated for a given student and the answers provided by the student, and generates a structure with all possible answers, possibly with a penalty term associated to each answer.

In [20]:
# Show which student name was matched to each NIA
print(NIA_name)

{100346595: 'LUIS ANTHONY SANTIVAÑEZ CALDAS', 100346250: 'MATTHEW MOORCROFT', 100346579: 'EDUARDO HERRERA ARRUTI', 100318675: 'BEATRIZ GARCÍA SANZ', 100315121: 'FELIPE BARBOSA MARTIN', 100381208: 'DYLAN LEO PETERS', 100329922: 'DIEGO PENROZ VALENZUELA'}


In [29]:
# Spot check for one student: the data handed out (column keyed by NIA)
# followed by the answers delivered (column keyed by student name)
print(student_data['100346250'])
print(student_results['MATTHEW MOORCROFT'])

xTrain    [[-0.695192411948, 1.25069358417, 1.0595791902...
sTrain    [[1.41965774357], [0.975348309444], [-0.656562...
xTest     [[-1.18513781942, -1.79553351843, -0.491849948...
Xtrain    [[0.59984956259, 0.542801149987, 3.54022122173...
ytrain    [[1], [0], [1], [0], [1], [1], [0], [1], [0], ...
Xtest     [[0.856360891895, 1.68968709553, -1.1590288644...
Name: 100346250, dtype: object
vTrain                                     [[4.86143219825]]
xnTrain    [[0.268863058695, 0.624357411398, 0.3029224977...
xnTest     [[0.000181359421271, -1.15992919585, -0.470657...
we         [[-0.0811007827235], [0.451596275303], [0.0320...
w                                                        NaN
EAP                                                      NaN
m0                                                       NaN
etaNPx1                                                  NaN
PDx1                                                     NaN
tm                                                       NaN

In [25]:
def SolveLabXX(data, st_solution):
    """Solver for the practical
    Input parameters:
    data: A series with the data given to the student. The entries
          'xTrain', 'sTrain', 'xTest', 'Xtrain', 'ytrain' and 'Xtest'
          are read
    st_solution: The solution provided by the student. The entries
          'xnTrain', 'm0' and 'etaNPx1' are read, so that later sections
          can also be graded against the student's own earlier results

    Output: A dataseries where each element is a list of tuples
    with the format [(solution1, factor1), (solution2, factor2)]

    Factors are multiplicative factors to account for possible
    penalties. A factor 1 should be given to a solution that should
    not be penalized.

    The series is indexed by 'vTrain', 'xnTrain', 'xnTest', 'we', 'w',
    'EAP', 'm0', 'etaNPx1', 'PDx1', 'tm', 'tv' and 'ytest', in that order.
    """
    # Imported explicitly so the function does not depend on scipy.special
    # having been loaded as a side effect of another import
    from scipy.special import erf
    from scipy.stats import norm

    ds_values = []
    ds_indexes = []

    x_train = data['xTrain']
    s_train = data['sTrain']
    ntr = x_train.shape[0]
    bias = np.ones((ntr, 1))
    powers = [1, 2, 3, 4, 5]

    ## Sec. 2.1
    # Both the biased (ddof=0) and unbiased (ddof=1) variance are accepted
    vTrain = [(np.var(x_train, axis=0, ddof=0), 1),
              (np.var(x_train, axis=0, ddof=1), 1)]

    ds_values.append(vTrain)
    ds_indexes.append('vTrain')

    ## Sec. 2.2
    # One normalized version per accepted variance; the test data is
    # normalized with the statistics of the training data
    mean = np.mean(x_train, axis=0)
    xnTrain = [((x_train - mean) / np.sqrt(var), 1) for var, _ in vTrain]
    xnTest = [((data['xTest'] - mean) / np.sqrt(var), 1) for var, _ in vTrain]

    ds_values.append(xnTrain)
    ds_values.append(xnTest)
    ds_indexes.append('xnTrain')
    ds_indexes.append('xnTest')

    ## Sec. 2.3 Three possibilities are accepted: those obtained with either
    # of the normalized versions of the input data, or the one obtained with
    # the input data matrix used by the student (their xnTrain field)
    #
    # The version without bias term is accepted with a 50% penalty

    # Use own data if not nan, and has the right dimensions
    own_xn = st_solution['xnTrain']
    use_own_xn = (not np.all(np.isnan(own_xn))
                  and np.array_equal(own_xn.shape, xnTrain[0][0].shape))

    inputs = [xnTrain[0][0], xnTrain[1][0]]
    if use_own_xn:
        # When present, the student's own matrix is always the third input;
        # Sec. 2.5 relies on that position (we[2], w[2])
        inputs.append(own_xn)

    we = [(np.linalg.lstsq(np.hstack((bias, xn)), s_train)[0], 1) for xn in inputs]
    # Versions with 50% penalty
    we += [(np.linalg.lstsq(xn, s_train)[0], .5) for xn in inputs]

    ds_values.append(we)
    ds_indexes.append('we')

    #2.4 Same scheme as 2.3, with the inputs raised to the powers 1..5
    w = [(np.linalg.lstsq(np.hstack((bias, xn**powers)), s_train)[0], 1) for xn in inputs]
    # Versions with 50% penalty
    w += [(np.linalg.lstsq(xn**powers, s_train)[0], .5) for xn in inputs]

    ds_values.append(w)
    ds_indexes.append('w')

    #2.5. Solution for this section is based on student solution.
    EAP = []
    if use_own_xn:
        models = (
            # EAP for model of Section 2.3
            (np.hstack((bias, own_xn)), we[2][0]),
            # EAP for model of Section 2.4
            (np.hstack((bias, own_xn**powers)), w[2][0]),
        )
        for design, coefs in models:
            abs_err = np.abs(design.dot(coefs) - s_train)
            EAP.append((np.mean(abs_err), 1))
            #If the sum instead of the average is calculated: x0.7
            EAP.append((np.sum(abs_err), .7))

    ds_values.append(EAP)
    ds_indexes.append('EAP')

    #3.1.
    X_train = data['Xtrain']
    ind0 = np.where(data['ytrain']==0)[0]
    ind1 = np.where(data['ytrain']==1)[0]

    m0 = [(np.mean(X_train[ind0]), 1)]
    #50% penalty for those using only the first column
    m0.append((np.mean(X_train[ind0, 0]), .5))
    ds_values.append(m0)
    ds_indexes.append('m0')

    #3.2.
    # NOTE(review): 0.9 and 2**.5 presumably come from the exam wording
    # (false-alarm level and standard deviation) - confirm
    eta_offset = norm.ppf(0.9) * (2**.5)

    etaNPx1 = [(m0[0][0] + eta_offset, 1)]
    #We admit also the value computed using the m0 provided by the student
    if not np.all(np.isnan(st_solution['m0'])):
        if np.array_equal(st_solution['m0'][0].flatten().shape, m0[0][0].flatten().shape):
            etaNPx1.append((st_solution['m0'][0] + eta_offset, 1))

    ds_values.append(etaNPx1)
    ds_indexes.append('etaNPx1')

    #3.3.
    qfunc = lambda x: 0.5-0.5*erf(x/np.sqrt(2))
    m1a = np.mean(X_train[ind1])
    m1b = np.mean(X_train[ind1, 0])

    PDx1 = [(qfunc((etaNPx1[0][0]-m1a)/(2**.5)), 1)]
    #We use the threshold provided by the student and the average of m1, using either
    # all the data, or just the first column
    if not np.all(np.isnan(st_solution['etaNPx1'])):
        if np.array_equal(st_solution['etaNPx1'].flatten().shape, (1,)):
            PDx1.append((qfunc((st_solution['etaNPx1'][0]-m1a)/(2**.5)), 1))
            PDx1.append((qfunc((st_solution['etaNPx1'][0]-m1b)/(2**.5)), 1))

    ds_values.append(PDx1)
    ds_indexes.append('PDx1')

    #3.4
    tm = []
    tv = []

    #Theoretical results
    ndim = X_train.shape[1]
    tm.append(((1+np.arange(ndim)) * np.mean(X_train[ind1]), 1))
    tv.append(((1+np.arange(ndim)) * 2, 1))

    #Results computed from data: statistics of the sum of the first
    # 1, 2, ..., ndim columns over the class-1 samples
    partial_sums = [np.sum(X_train[ind1, :nvars+1], axis=1) for nvars in range(ndim)]
    tm.append((np.array([np.mean(acc) for acc in partial_sums]), 1))
    tv.append((np.array([np.var(acc, ddof=0) for acc in partial_sums]), 1))
    tv.append((np.array([np.var(acc, ddof=1) for acc in partial_sums]), 1))

    ds_values.append(tm)
    ds_indexes.append('tm')
    ds_values.append(tv)
    ds_indexes.append('tv')

    #3.5
    # The labels are predicted for the TEST patterns: the sum of the first
    # three columns of Xtest is compared with the midpoint between the class
    # means of that sum, which are estimated from the training data
    test_score = np.sum(data['Xtest'][:, :3], axis=1)
    ytest = []

    # Threshold from the theoretical means
    tm0 = 3 * np.mean(X_train[ind0])
    th = ((tm[0][0][2] + tm0)/2)
    ytest.append(((test_score>th).astype('int'),1))

    # Threshold from the means computed from data
    tm0 = np.mean(np.sum(X_train[ind0, :3], axis=1))
    th = ((tm[1][0][2] + tm0)/2)
    ytest.append(((test_score>th).astype('int'),1))

    ds_values.append(ytest)
    ds_indexes.append('ytest')

    return pd.Series(ds_values, ds_indexes)
    

## 4. Evaluation of all students

In [26]:
def _is_missing(answer):
    """Tell whether an answer is absent, i.e. made only of NaN values."""
    try:
        return bool(np.all(np.isnan(answer)))
    except TypeError:
        # Text or object arrays cannot be NaN-tested: something was delivered
        return False


def _answers_match(candidate, answer, tol):
    """Tell whether an answer agrees with one accepted solution.

    Both are flattened; they agree when they hold the same number of
    elements and their mean absolute difference is below `tol`.
    """
    cand = np.asarray(candidate).flatten()
    ans = np.asarray(answer).flatten()
    if cand.shape != ans.shape:
        return False
    try:
        return bool(np.mean(np.abs(cand - ans)) < tol)
    except (TypeError, ValueError):
        # Non-numeric answers (e.g. text) cannot be compared: not a match
        return False


def automatic_evaluator(student_results, solution, weights, tolerance):
    """Grade the answers of one student against the accepted solutions.

    Input parameters:
    student_results: The answers of the student, indexed by variable name
    solution: For each variable name, a list of (accepted value, factor)
              tuples
    weights: Maximum score of each variable, in the order of solution.keys()
    tolerance: Largest mean absolute error admitted for each variable, in
               the same order

    Output: A series with one entry per variable holding the list
    [delivered, correct, factor, weight, score], plus a 'FinalScore' entry
    with the sum of all scores. When several accepted values match, the
    largest factor is used. An answer that cannot be compared numerically
    counts as delivered but incorrect.

    If the number of weights or tolerances differs from the number of
    variables, a message is printed and an empty list is returned.
    """
    if len(solution.keys())==len(weights) and len(solution.keys())==len(tolerance):
        val = []
        idx = []
        for wgh,el,tol in zip(weights,solution.keys(),tolerance):
            answer = student_results[el]
            if _is_missing(answer):
                var_summary = [0, 0, 0, wgh, 0]
            else:
                #Check all possible solutions against the one provided by the student
                factors = [entry[1] for entry in solution[el]
                           if _answers_match(entry[0], answer, tol)]
                if factors:
                    max_factor = max(factors)
                    var_summary = [1, 1, max_factor, wgh, max_factor*wgh]
                else:
                    var_summary = [1, 0, 0, wgh, 0]
            #Keep values corresponding to current variable
            val.append(var_summary)
            idx.append(el)
        final_score = sum([item[-1] for item in val])
        val.append(final_score)
        idx.append('FinalScore')
        return pd.Series(val,index=idx)
    else:
        print('The number of weights and variables to evaluate differ. Please, check')
        return []

In [28]:
###########################################
# EXAM DEPENDENT VARIABLE
excel_output = 'Notas_65_Lab12.xlsx'
weights = [1, .5, .5, 1, 1, 1, 1, 1, 1, .5, .5, 1]
tolerance = [1e-2] * 12
###########################################

df = pd.DataFrame()

# One column of marks per student: the data is looked up by NIA, the
# delivered answers by student name
for NIA, student_name in NIA_name.items():
    answers = student_results[student_name]
    solution = SolveLabXX(student_data[str(NIA)], answers)
    df[student_name] = automatic_evaluator(answers, solution, weights, tolerance)

# The sheet is written with one row per student
report = df.T
report.to_excel(excel_output, columns=report.columns)