# Automatic Lab Evaluator

## Assessment based on student-provided results

Version History:

Version 0.1 - Jerónimo Arenas García, Dec. 2016

In [107]:
import numpy as np
import pandas as pd
import os
from os.path import isfile, join
import scipy.io as sio
import scipy
import zipfile as zp
import shutil
import difflib

## 1. Read datafiles for all students

Student datafiles can be in any of the following formats:

   * `'.zip'`: When uncompressed, the zip may contain one or several MATLAB files. All MATLAB files are read and incorporated into a pandas DataFrame where each student is a column, and each index is a variable available for the exam solution
   * `'.mat'`: All data variables for the student are given in a single MATLAB file

In [108]:
def _read_mat_variables(mat_path):
    """Load a MATLAB file and return its variables as (names, values).

    Keys starting with an underscore are skipped (presumably loadmat's
    bookkeeping entries rather than student variables).
    """
    data = sio.loadmat(mat_path)
    names = [key for key in data.keys() if not key.startswith('_')]
    return names, [data[key] for key in names]


def readdatafiles(datafiles_path, splitsymbol):
    """Read every student file found in a folder into a single DataFrame.

    Input parameters:
    datafiles_path: Folder to scan. Only regular files directly inside it
                    are considered (subfolders are ignored).
    splitsymbol: Separator used to extract the student identifier from the
                 file name; the identifier is the text before its first
                 occurrence.

    Files whose name ends in 'zip' are opened and every member ending in
    'mat' is loaded; files whose name ends in 'mat' are loaded directly.
    Any other file is ignored.

    Output: A pandas DataFrame with one column per student file (named
    after the identifier) and one row per variable name. Files that
    provide no variables do not create a column.
    """
    # Function-scope import so the block does not depend on the import cell
    import tempfile

    df = pd.DataFrame()

    # os.listdir returns entries in arbitrary order; sort so that the
    # column order of the result is reproducible across machines
    datafiles = sorted(f for f in os.listdir(datafiles_path)
                       if isfile(join(datafiles_path, f)))

    for dtfile in datafiles:
        NIA = dtfile.split(splitsymbol)[0]
        idx = []
        val = []

        if dtfile.endswith('zip'):
            # All MATLAB files are extracted to a private temporary folder
            # (never a pre-existing './tmp', which must not be deleted)
            temporary_dir = tempfile.mkdtemp()
            try:
                with zp.ZipFile(join(datafiles_path, dtfile)) as zpobj:
                    for fileinzip in zpobj.namelist():
                        if fileinzip.endswith('mat'):
                            # extract() returns the path actually written,
                            # which may differ from the raw member name
                            extracted = zpobj.extract(fileinzip, temporary_dir)
                            names, values = _read_mat_variables(extracted)
                            idx.extend(names)
                            val.extend(values)
            finally:
                # Always remove the temporary folder, even if loading failed
                shutil.rmtree(temporary_dir, ignore_errors=True)

        elif dtfile.endswith('mat'):
            print(NIA)
            idx, val = _read_mat_variables(join(datafiles_path, dtfile))

        else:
            continue

        # Only students that provided at least one variable get a column
        if idx:
            df[NIA] = pd.Series(val, index=idx)

    return df
        

In [109]:
###########################################
# EXAM DEPENDENT VARIABLE
# Folder with the data files handed out to the students
datafiles_path = './datafiles_GSCGT/'
##########################################

# The student identifier is the part of each file name before the first '.'
student_data = readdatafiles(datafiles_path, splitsymbol='.')

# Rows are variables, columns are students
n_variables, n_students = student_data.shape
print('Number of students in dataframe: ' + str(n_students))
print('Number of variables read: ' + str(n_variables))

print('Displaying data for first three students ... ')
first_three = student_data.columns[:3]
student_data[first_three]

100071426
100072667
100073324
100081450
100277493
100277830
100282449
100282452
100282739
100282920
100282957
100283774
100283839
100284246
100291206
100291362
100291482
100291551
100291909
100292117
100292248
100292807
100293005
100293069
100293422
100294122
100303492
100303510
100304350
100304352
100304387
100304703
100305285
100305296
100305449
100305467
100305494
100305662
100305941
100305996
100306582
100307094
100307156
100314975
100315022
100315119
100315175
100315221
100315226
100315432
100315702
100315961
100316764
100316810
100317102
100317477
100317556
100317575
100317691
100317711
100317893
100318055
100318329
100318673
100323875
100329940
100329964
100329986
100330048
100330055
100330076
100330128
100330166
100330173
100330255
100330256
100330306
100330328
100330383
100330408
100330428
100330444
100330595
100330642
100330647
100330729
100333500
100346684
Number of students in dataframe: 88
Number of variables read: 5
Displaying data for first three students ... 


Unnamed: 0,100071426,100072667,100073324
varN,[[0.037]],[[0.0242]],[[0.0314]]
varS,[[0.0227272727273]],[[0.02]],[[0.0333333333333]]
M,[[22]],[[25]],[[15]]
u,"[[1.23508960507], [0.18961120133], [1.18570044...","[[-0.0715909200974], [-0.222215208027], [-1.52...","[[0.89248240396], [0.0132496956861], [0.036566..."
x,"[[0.138602137163], [-0.214882807689], [-0.1692...","[[-0.260551353858], [0.0196293767197], [-0.053...","[[0.0404643284628], [0.250183245138], [-0.1051..."


## 2. Read answers provided by students


### 2.1. Read student results into a pandas DataFrame

In [110]:
###########################################
# EXAM DEPENDENT VARIABLE
# Folder with the files delivered by the students
results_path = './entregas_GSCGT/'
#Requested variable names in the wording of the exam
truenames = ['sMSE', 'vMSE', 'pxN', 'y', 'sMSEd10', 's25']
###########################################

# The student identifier is the part of each file name before the first '_'
student_results = readdatafiles(results_path, splitsymbol='_')

# Put the requested variables first, followed by any other name delivered
extra_names = [name for name in student_results.index.tolist()
               if name not in truenames]
newindex = truenames + extra_names
student_results = student_results.reindex(newindex)

# Rows are variables, columns are students
n_variables, n_students = student_results.shape
print('Number of students in dataframe: ' + str(n_students))
print('Number of variables read: ' + str(n_variables))

print('Displaying data for first three students ... ')
first_three = student_results.columns[:3]
student_results[first_three]

ADRIAN CRUZ SACEDO
ALBA ALCOLEA LOPEZ
ALBERTO CASADO MEDINA
ALEJANDRO RAMIRO MUÑOZ
ALEJANDRO RODRIGUEZ ORTIZ
ALVARO DE LOS REYES GUIO
ALVARO VARELA CACHARRO
ANA FERNANDEZ SANTOS
BORJA RODRIGUEZ MARTINEZ
CARLOS MARTIN-ROMO BARRILERO
CRISTINA BAOS PIQUERAS
DANIEL AGUILERA MARTINEZ
DANIEL BELLIDO CASTILLO
DANIEL DAVID DIAZ CASTILLO
DANIEL MADRID POVEDA
DAVID GOMEZ SANCHO
ESTEFANIA FUENTES FERNANDEZ
GABRIEL DIAZ DEL CAMPO CLEMENTE
GABRIEL GARCIA SANCHEZ
GABRIEL RETANA DIAZ
GALAN YUDIE HARDJONO
IGNACIO DE DIEGO PEREZ-CAMESELLE
IGNACIO FERNANDEZ LIZAUR
IRATXE GONZALEZ FERNANDEZ
IVAN LOPEZ PACHECO
JAVIER BAUTISTA RAMOS
JAVIER BENAVIDES VAZQUEZ
JAVIER BLASCO CASADO
JAVIER GARCIA MUÑOZ
JAVIER RUIZ RUIZ
JORGE SAUCEDO PAVON
JORGE TORRES MARTINEZ DE BUJO
JOSE RODRIGO FUENTES RAMIREZ
JOSE ROMERO SOLIS
LAURA PERAL ARANA
MADRU JESAJA KORTZ
MARIA PEÑA GUTIERREZ
MARIO ALONSO SANCHEZ
MARIO MUÑOZ PRIETO
MIGUEL ORLANDO GARCIA PALACIOS
NURIA ABANADES MUÑOZ
NURIA PORTAL CARRASQUILLA
NURIA RODRIGUEZ JIM

Unnamed: 0,ADRIAN BAILON PEREZ,ADRIAN CRUZ SACEDO,ALBA ALCOLEA LOPEZ
sMSE,"[[0.0309912994484], [0.00197022300655], [0.032...","[[-0.0231755204919], [-0.0268868344051], [-0.0...","[[-0.0915508512004], [-0.0388615610752], [-0.0..."
vMSE,"[[7.42220973283e-05, 1.40836152365e-07, -3.241...","[[9.66597185773e-05, -2.90517427876e-06, -4.17...","[[7.26256644298e-05, -1.77426777256e-06, 1.279..."
pxN,[[0]],[[-0.0156388915157]],[[0.3]]
y,"[[-0.159077255545], [-0.0875577214942], [0.060...","[[-0.119345539632], [-0.0900499782051], [-0.07...","[[-0.022078584765], [0.0328973899684], [-0.000..."
sMSEd10,"[[0.054360550989], [0.0256246781021], [-0.0090...","[[-0.000572818068456], [-0.00057281806846], [-...","[[-0.0576287773723], [0.0408321786972], [0.043..."
s25,"[[0.00663646355502], [0.00611309577677], [0.01...",[[0.0402595380701]],[[-0.00921961261803]]


### 2.2. Common Mistakes on variable names

In view of all variable names provided by all students, we may decide to allow alternative names for variables without any penalty

In [111]:
# Report how many students did not deliver each variable name
n_students = student_results.shape[1]
print('Number of students in dataframe: ' + str(n_students))

print('\nDisplaying number of missing data per variable name. \nThose with a large number are a potential common mistakes\nfor a variable name')

# Count missing entries along each row (one row per variable name)
missing_entries = student_results.isnull()
missing_entries.sum(axis=1)

Number of students in dataframe: 57

Displaying number of missing data per variable name. 
Those with a large number are a potential common mistakes
for a variable name


sMSE        1
vMSE        2
pxN        18
y          27
sMSEd10    14
s25        11
dtype: int64

In [112]:
###########################################
# EXAM DEPENDENT VARIABLE

#Dictionary with accepted mistakes in the following format
#  Expected variable name : Accepted mistake
Mistakes = {}
##########################################

# Where the expected name is missing, take the value delivered under the
# accepted alternative name
for expected, accepted in Mistakes.items():
    student_results.loc[expected] = student_results.loc[expected].fillna(
        student_results.loc[accepted])

# Discard every row that is not one of the requested variable names
unrequested = [name for name in student_results.index.tolist()
               if name not in truenames]
student_results.drop(unrequested, inplace=True)

first_three = student_results.columns[:3]
student_results[first_three]

Unnamed: 0,ADRIAN BAILON PEREZ,ADRIAN CRUZ SACEDO,ALBA ALCOLEA LOPEZ
sMSE,"[[0.0309912994484], [0.00197022300655], [0.032...","[[-0.0231755204919], [-0.0268868344051], [-0.0...","[[-0.0915508512004], [-0.0388615610752], [-0.0..."
vMSE,"[[7.42220973283e-05, 1.40836152365e-07, -3.241...","[[9.66597185773e-05, -2.90517427876e-06, -4.17...","[[7.26256644298e-05, -1.77426777256e-06, 1.279..."
pxN,[[0]],[[-0.0156388915157]],[[0.3]]
y,"[[-0.159077255545], [-0.0875577214942], [0.060...","[[-0.119345539632], [-0.0900499782051], [-0.07...","[[-0.022078584765], [0.0328973899684], [-0.000..."
sMSEd10,"[[0.054360550989], [0.0256246781021], [-0.0090...","[[-0.000572818068456], [-0.00057281806846], [-...","[[-0.0576287773723], [0.0408321786972], [0.043..."
s25,"[[0.00663646355502], [0.00611309577677], [0.01...",[[0.0402595380701]],[[-0.00921961261803]]


### 2.3. Name to NIA dictionary

Finally, since datafiles are created by NIA and results are available per student name, we need to create a dictionary connecting them

In [113]:
###########################################
# EXAM DEPENDENT VARIABLE
excel_file = 'lista_clase_61.xlsx'
language = 'spanish'
###########################################

student_NIA_names = pd.read_excel(excel_file)

#UTF-8 encoding of everything
# (presumably so roster names are comparable with the names taken from
#  the delivered file names -- confirm when changing Python version)
for fld in student_NIA_names.keys():
    if fld != 'NIU':
        student_NIA_names[fld] = student_NIA_names[fld].str.encode('utf8')

# Column names of the class list depend on the language of the export
if language == 'english':
    first_name_col, surname_col = 'First name', 'Surname'
else:
    first_name_col, surname_col = 'Nombre', 'Apellido(s)'

# Build the roster once, outside the matching loop: converting the columns
# to lists for every (student, roster entry) pair is needlessly quadratic
roster_NIAs = student_NIA_names['NIU'].values.tolist()
roster_names = [
    (first + ' ' + last).lower()
    for first, last in zip(student_NIA_names[first_name_col].values.tolist(),
                           student_NIA_names[surname_col].values.tolist())
]

NIA_name = {}

for el in student_results.columns.tolist():

    # Similarity between the delivered name and every roster name
    delivered_name = el.lower()
    sim_list = [difflib.SequenceMatcher(a=delivered_name, b=std_name).ratio()
                for std_name in roster_names]

    # The first roster entry with the highest similarity is taken as the match
    max_sim = max(sim_list)
    max_idx = sim_list.index(max_sim)
    best_NIA = roster_NIAs[max_idx]

    # Two deliveries matching the same NIU means one of them would be lost
    # without notice; make the collision visible to the grader
    if best_NIA in NIA_name:
        print('WARNING: NIU ' + str(best_NIA) + ' matched by both "'
              + str(NIA_name[best_NIA]) + '" and "' + str(el)
              + '"; keeping "' + str(el) + '"')

    NIA_name[best_NIA] = el

#Create name to NIA dictionary
name_NIA = {NIA_name[el]: el for el in NIA_name}


In [114]:
#print name_NIA
#print NIA_name
#for el in NIA_name.keys():
#    print str(el) + ' : ' + NIA_name[el]


At this point we have:

   * student_data: dataframe with data given to the students. Each index is a variable, and each column a NIA
   * student_results: dataframe with student results. Each index is a variable, and each column a name
   * NIA_name: NIA to name dictionary
   * name_NIA: name to NIA dictionary

## 3. Exam solution

In this section we implement the solution to the exam. This is a function that takes the variables generated for a given student and the answers provided by the student, and generates a structure with all possible answers, possibly with a penalty term associated with each answer.

In [115]:
#print NIA_name

In [116]:
# Spot check: show the data given to one student and what that student delivered
student = 'JAVIER BENAVIDES VAZQUEZ'
student_nia = str(name_NIA[student])
print(student_data[student_nia])
print(student_results[student])


varN                                           [[0.0262]]
varS                                  [[0.0178571428571]]
M                                                  [[28]]
u       [[-1.26419670938], [0.542418071809], [0.240003...
x       [[-0.214362172896], [0.178885947669], [-0.2702...
Name: 100315119, dtype: object
sMSE       [[0.0309912994484], [0.00197022300655], [0.032...
vMSE       [[7.42220973283e-05, 1.40836152365e-07, -3.241...
pxN                                       [[0.765462326801]]
y          [[-0.214362172896], [0.177655965939], [-0.3292...
sMSEd10    [[0.00606879566238], [-0.0750495644057], [0.08...
s25        [[0.00812775834139], [0.00782324708796], [0.01...
Name: JAVIER BENAVIDES VAZQUEZ, dtype: object


In [117]:
def SolveLabXX(data, st_solution):
    """Solver for the practical

    Input parameters:
    data: A series (or mapping) with the data given to the student. The
          entries read are 'u', 'x', 'M', 'varN' and 'varS'. 'M', 'varN'
          and 'varS' must hold a single value (a scalar or a one-element
          array such as the 1x1 arrays produced by loadmat).
    st_solution: The solution provided by the student. It is not used by
                 this solver; it is accepted so that all solvers share
                 the same signature.

    Output: A dataseries where each element is a list of tuples
    with the format [(solution1, factor1), (solution2, factor2)]

    Factors are multiplicative factors to account for possible
    penalties. A factor 1 should be given to a solution that should
    not be penalized.

    The series is indexed by 'sMSE', 'vMSE', 'pxN', 'y', 'sMSEd10' and
    's25', in that order.
    """
    # Function-scope imports: the file only does `import scipy`, which does
    # not guarantee that the linalg and stats subpackages are loaded
    from scipy.linalg import toeplitz
    from scipy.stats import norm

    # Turn the one-element arrays into plain Python scalars once. Using the
    # arrays themselves as shapes or slice bounds is not portable across
    # NumPy versions, and an unsigned integer M breaks expressions like -M+1
    M = int(np.asarray(data['M']).item())
    varN = float(np.asarray(data['varN']).item())
    varS = float(np.asarray(data['varS']).item())

    x = np.asarray(data['x'])
    u_flat = np.asarray(data['u']).ravel()

    ds_values = []
    ds_indexes = []

    ## Sec. 2.1
    sMSE = []

    # U is the M x N Toeplitz matrix with first column [u[0], 0, ..., 0]
    # and first row u (passed already flattened, not relying on toeplitz
    # to ravel a column vector)
    c = np.concatenate((u_flat[:1], np.zeros((M - 1,))))
    U = toeplitz(c, u_flat)
    P = np.linalg.inv(U.dot(U.T) + (varN / varS) * np.eye(M))

    sMSE.append((P.dot(U).dot(x), 1))

    ds_values.append(sMSE)
    ds_indexes.append('sMSE')

    ## Sec. 2.2
    vMSE = []

    vMSE.append((varN * P, 1))

    ds_values.append(vMSE)
    ds_indexes.append('vMSE')

    ## Sec. 2.3
    pxN = []

    # u_star = [0, u[N-1], u[N-2], ..., u[N-M+1]] as a column vector.
    # The start index is written from the front so that M == 1 yields an
    # empty tail (u_flat[-0:] would be the whole vector)
    tail = u_flat[len(u_flat) - (M - 1):][::-1]
    u_star = np.concatenate(([0], tail)).reshape((M, 1))
    vx = varN + varN * u_star.T.dot(P).dot(u_star)
    mx = u_star.T.dot(sMSE[0][0])

    # Gaussian density with mean mx and variance vx, evaluated at 0.3
    pxN.append((norm.pdf(.3, loc=mx, scale=np.sqrt(vx)), 1))

    ds_values.append(pxN)
    ds_indexes.append('pxN')

    ## Sec. 2.4
    y = []
    # Keep one out of every ten samples of x
    y.append((x[::10], 1))

    ds_values.append(y)
    ds_indexes.append('y')

    sMSEd10 = []
    # Same computation as Sec. 2.1 restricted to the retained samples
    U10 = U[:, ::10]
    P10 = np.linalg.inv(U10.dot(U10.T) + (varN / varS) * np.eye(M))

    sMSEd10.append((P10.dot(U10).dot(y[0][0]), 1))

    ds_values.append(sMSEd10)
    ds_indexes.append('sMSEd10')

    ## Sec. 2.5
    s25 = []

    s = np.zeros((M,))
    mu = .005
    x_flat = x.ravel()

    # 25 iterations of the update s <- s + mu * (x[k] - u_k . s) * u_k,
    # where u_k is the k-th column of U
    for k in range(25):
        u_k = U[:, k]
        x_pred = u_k.dot(s)
        s = s + mu * (x_flat[k] - x_pred) * u_k

    s25.append((s, 1))

    ds_values.append(s25)
    ds_indexes.append('s25')

    return pd.Series(ds_values, ds_indexes)
    

In [118]:
# Compare what the selected student delivered with the computed solution
delivered = student_results[student]
print(delivered)
given_data = student_data[str(name_NIA[student])]
print(SolveLabXX(given_data, delivered))

sMSE       [[0.0309912994484], [0.00197022300655], [0.032...
vMSE       [[7.42220973283e-05, 1.40836152365e-07, -3.241...
pxN                                       [[0.765462326801]]
y          [[-0.214362172896], [0.177655965939], [-0.3292...
sMSEd10    [[0.00606879566238], [-0.0750495644057], [0.08...
s25        [[0.00812775834139], [0.00782324708796], [0.01...
Name: JAVIER BENAVIDES VAZQUEZ, dtype: object
sMSE       [([[ 0.0309913], [ 0.00197022], [ 0.03209901],...
vMSE       [([[  7.42220973e-05   1.40836152e-07  -3.2416...
pxN                                   [([[ 0.76546233]], 1)]
y          [([[-0.21436217], [ 0.17765597], [-0.32920055]...
sMSEd10    [([[ 0.03338479], [ 0.04687121], [-0.00978931]...
s25        [([0.00663646355502, 0.00611309577677, 0.01464...
dtype: object




## 4. Evaluation of all students

In [119]:
def _numeric_vector(value):
    """Return value as a flat numeric array, or None if it is not numeric.

    Non-numeric deliveries (strings, object arrays, ...) cannot be compared
    against a numeric solution, and np.isnan raises TypeError on them.
    """
    arr = np.asarray(value)
    if arr.dtype.kind in 'biufc':
        return arr.flatten()
    return None


def automatic_evaluator(student_results, solution, weights, tolerance):
    """Grade the answers of one student against the accepted solutions.

    Input parameters:
    student_results: Series (or mapping) with the value delivered by the
                     student for each variable; a missing variable is NaN.
    solution: Series (or mapping) where each element is a list of tuples
              (accepted_value, factor), one entry per variable to grade.
    weights: Maximum score of each variable, in the order of solution.keys()
    tolerance: Maximum mean absolute error allowed for each variable, in
               the order of solution.keys()

    Output: A Series with one entry per variable, each of them a list
    [delivered, correct, factor, weight, score], plus a 'FinalScore' entry
    with the sum of the scores. An answer is correct when it has as many
    elements as an accepted value and its mean absolute difference to it is
    below the tolerance; if several accepted values match, the largest
    factor is used. A delivered value that is not numeric is marked as
    delivered but incorrect.

    If the number of weights or tolerances differs from the number of
    variables, a message is printed and an empty list is returned.
    """
    variables = list(solution.keys())
    if len(variables) != len(weights) or len(variables) != len(tolerance):
        print('The number of weights and variables to evaluate differ. Please, check')
        return []

    val = []
    idx = []
    for wgh, el, tol in zip(weights, variables, tolerance):
        answer = _numeric_vector(student_results[el])

        if answer is not None and np.all(np.isnan(answer)):
            # Variable not delivered (nothing but NaN)
            var_summary = [0, 0, 0, wgh, 0]
        else:
            #Check all possible solutions against the one provided by the student
            factors = []
            if answer is not None:
                for accepted, factor in solution[el]:
                    reference = np.asarray(accepted).flatten()
                    if (reference.shape == answer.shape
                            and np.mean(np.abs(reference - answer)) < tol):
                        factors.append(factor)

            if factors:
                max_factor = max(factors)
                var_summary = [1, 1, max_factor, wgh, max_factor*wgh]
            else:
                var_summary = [1, 0, 0, wgh, 0]

        #Keep values corresponding to current variable
        val.append(var_summary)
        idx.append(el)

    # The score of each variable is the last element of its summary
    final_score = sum(item[-1] for item in val)
    val.append(final_score)
    idx.append('FinalScore')
    return pd.Series(val, index=idx)

In [120]:
###########################################
# EXAM DEPENDENT VARIABLE
excel_output = 'Notas_61_Lab3.xlsx'
# One weight and one tolerance per requested variable
weights = [3, 2, 1, .5, .5, 3]
tolerance = [1e-2] * 6
###########################################

# One column per student, holding the per-variable summaries and the final score
df = pd.DataFrame()

for NIA, delivered_name in NIA_name.items():
    delivered = student_results[delivered_name]
    # The accepted solutions depend on the data given to this particular student
    solution = SolveLabXX(student_data[str(NIA)], delivered)
    column_label = delivered_name.decode('utf8')
    df[column_label] = automatic_evaluator(delivered, solution, weights, tolerance)

print(df)

# Write one row per student to the spreadsheet
scores_by_student = df.T
scores_by_student.to_excel(excel_output, columns=scores_by_student.columns)

           MIGUEL ORLANDO GARCIA PALACIOS CARLOS MARTIN-ROMO BARRILERO  \
sMSE                      [1, 1, 1, 3, 3]              [1, 0, 0, 3, 0]   
vMSE                      [1, 1, 1, 2, 2]              [1, 0, 0, 2, 0]   
pxN                       [1, 0, 0, 1, 0]              [0, 0, 0, 1, 0]   
y                       [0, 0, 0, 0.5, 0]            [0, 0, 0, 0.5, 0]   
sMSEd10                 [1, 0, 0, 0.5, 0]            [0, 0, 0, 0.5, 0]   
s25                       [1, 0, 0, 3, 0]              [0, 0, 0, 3, 0]   
FinalScore                              5                            0   

             JAVIER RUIZ RUIZ SANDRA MARIA GARCIA-SIÑERIZ HERRADOR  \
sMSE          [1, 0, 0, 3, 0]                       [1, 0, 0, 3, 0]   
vMSE          [1, 0, 0, 2, 0]                       [1, 0, 0, 2, 0]   
pxN           [0, 0, 0, 1, 0]                       [1, 0, 0, 1, 0]   
y           [0, 0, 0, 0.5, 0]                     [1, 0, 0, 0.5, 0]   
sMSEd10     [0, 0, 0, 0.5, 0]                     [1

