# Automatic Lab Evaluator

## Assessment based on student-provided results

Version History:

Version 0.1 - Jerónimo Arenas García, Jesús Cid Sueiro, Vanessa Gómez Verdejo, Dec. 2016


In [None]:
import numpy as np
import pandas as pd
import os
from os.path import isfile, join
import scipy.io as sio
import scipy
import zipfile as zp
import shutil
import difflib

## 1. Read datafiles for all students

Student datafiles can be in any of the following formats:

   * `'.zip'`: When uncompressed, the zip may contain one or several matlab files. All matlab files are read and incorporated into a pandas DataFrame where each column is a student and each index entry is a variable available for the exam solution
   * `'.mat'`: All data variables for the student are given in a single matlab file

In [None]:
def _append_mat_variables(mat_file, idx, val):
    """Load one matlab file and append its variables to idx/val in place.

    mat_file: path of the matlab file to read
    idx: list that receives the variable names
    val: list that receives the corresponding values, in the same order
    """
    data = sio.loadmat(mat_file)
    # Entries whose name starts with '_' are skipped (loadmat stores file
    # metadata such as '__header__' under such names)
    for var in data.keys():
        if not var.startswith('_'):
            idx.append(var)
            val.append(data[var])


def readdatafiles(datafiles_path, splitsymbol):
    """Read the matlab variables of every student found in a folder.

    Input parameters:
    datafiles_path: Folder to scan. Only regular files directly inside it
                    are considered; files ending in 'zip' or 'mat' are read,
                    anything else is ignored.
    splitsymbol: The student identifier is the part of the file name that
                 precedes the first occurrence of this string.

    Output: A pandas DataFrame with one column per student and one row per
    variable name. The rows are the union of the names found for all the
    students; a student who lacks a variable gets a missing value (NaN)
    in that row. An empty DataFrame is returned if nothing could be read.
    """
    # Only this function needs tempfile, so it is imported locally
    import tempfile

    # Student identifier -> Series of that student's variables. The frame is
    # built once at the end so that names used only by some students are
    # kept: assigning column by column would align every student to the
    # names of the first one and silently discard the rest.
    student_series = {}

    datafiles = [f for f in os.listdir(datafiles_path) if isfile(join(datafiles_path, f))]
    for dtfile in datafiles:
        idx = []
        val = []
        if dtfile.endswith('zip'):
            # All matlab files in the zip are read and merged for this student
            NIA = dtfile.split(splitsymbol)[0]
            # A private scratch folder: nothing that already exists on disk
            # can be overwritten or removed by the cleanup below
            temporary_dir = tempfile.mkdtemp()
            try:
                with zp.ZipFile(join(datafiles_path, dtfile)) as zpobj:
                    for fileinzip in zpobj.namelist():
                        if fileinzip.endswith('mat'):
                            # extract() returns the path actually written
                            extracted = zpobj.extract(fileinzip, temporary_dir)
                            _append_mat_variables(extracted, idx, val)
            finally:
                shutil.rmtree(temporary_dir)
        elif dtfile.endswith('mat'):
            NIA = dtfile.split(splitsymbol)[0]
            print(NIA)
            _append_mat_variables(join(datafiles_path, dtfile), idx, val)
        else:
            continue

        # Students for whom no variable was found get no column
        if idx:
            student_series[NIA] = pd.Series(val, index=idx)

    return pd.DataFrame(student_series)
        

In [None]:
# --- Exam-dependent setting ------------------------------------------------
# Folder that holds the datafiles
datafiles_path = './datafiles_Gbil/'
# ---------------------------------------------------------------------------

# Column labels are the file names up to the first '.'
student_data = readdatafiles(datafiles_path, splitsymbol='.')

print('Number of students in dataframe: ' + str(len(student_data.columns)))
print('Number of variables read: ' + str(len(student_data.index)))

print('Displaying data for first three students ... ')
student_data.loc[:, student_data.columns[:3]]

## 2. Read answers provided by students


### 2.1. Read student results into a pandas dataframe

In [None]:
# --- Exam-dependent settings -----------------------------------------------
# Folder that holds the result files
results_path = './entregas_GBil/'
# Variable names requested in the wording of the exam
truenames = ['sMSE', 'vx', 'sMSE0', 's25', 'SSE']
# ---------------------------------------------------------------------------

# Column labels are the file names up to the first '_'
student_results = readdatafiles(results_path, splitsymbol='_')

# Requested names go first, followed by every other name that was read
newindex = list(truenames)
for el in student_results.index.tolist():
    if el not in truenames:
        newindex.append(el)
student_results = student_results.reindex(newindex)

print('Number of students in dataframe: ' + str(len(student_results.columns)))
print('Number of variables read: ' + str(len(student_results.index)))

print('Displaying data for first three students ... ')
student_results.loc[:, student_results.columns[:3]]

### 2.2. Common Mistakes on variable names

In view of all variable names provided by all students, we may decide to allow alternative names for variables without any penalty

In [None]:
print('Number of students in dataframe: ' + str(len(student_results.columns)))

print('\nDisplaying number of missing data per variable name. \nThose with a large number are a potential common mistakes\nfor a variable name')

# For every variable name (row), count the students that have no value
student_results.isnull().sum(axis='columns')

In [None]:
# --- Exam-dependent setting ------------------------------------------------
# Accepted naming mistakes, written as
#   {expected variable name: accepted mistake}
Mistakes = {}
# ---------------------------------------------------------------------------

# Where the expected name is missing, use the value stored under the
# accepted wrong name
for el in Mistakes:
    fallback_row = student_results.loc[Mistakes[el]]
    student_results.loc[el] = student_results.loc[el].fillna(fallback_row)

# Keep only the rows whose name was requested in the exam
not_requested = []
for el in student_results.index.tolist():
    if el not in truenames:
        not_requested.append(el)
student_results.drop(not_requested, inplace=True)

student_results.loc[:, student_results.columns[:3]]

### 2.3. Name to NIA dictionary

Finally, since datafiles are created by NIA and results are available per student name, we need to create a dictionary connecting them

In [None]:
###########################################
# EXAM DEPENDENT VARIABLE
excel_file = 'lista_clase_65.xlsx'
language = 'english'
###########################################

student_NIA_names = pd.read_excel(excel_file)

#UTF-8 encoding of everything
for fld in student_NIA_names.keys():
    if fld != 'NIU':
        student_NIA_names[fld] = student_NIA_names[fld].str.encode('utf8')

# The class list uses different column headers depending on its language
if language == 'english':
    first_name_fld, surname_fld = 'First name', 'Surname'
else:
    first_name_fld, surname_fld = 'Nombre', 'Apellido(s)'

# Convert the columns to lists once, instead of once per comparison
NIU_list = student_NIA_names['NIU'].values.tolist()
first_names = student_NIA_names[first_name_fld].values.tolist()
surnames = student_NIA_names[surname_fld].values.tolist()
# Lower-cased 'first surname' string for every row of the class list
std_names = [(first + ' ' + last).lower() for first, last in zip(first_names, surnames)]

NIA_name = {}

for el in student_results.columns.tolist():

    # Similarity between the delivered name and every name in the class list
    el_lower = el.lower()
    sim_list = [difflib.SequenceMatcher(a=el_lower, b=std_name).ratio()
                for std_name in std_names]

    # The most similar entry wins (the first one in case of a tie)
    max_sim = max(sim_list)
    max_idx = sim_list.index(max_sim)
    NIA = NIU_list[max_idx]

    # Two delivered names matching the same NIU overwrite each other, and the
    # earlier one is lost from the dictionaries: make that visible
    if NIA in NIA_name:
        print('Warning: NIU ' + str(NIA) + ' was already matched to ' + NIA_name[NIA] + '; now matched to ' + el)

    NIA_name[NIA] = el

#Create name to NIA dictionary
name_NIA = {NIA_name[el]: el for el in NIA_name}


In [None]:
# Show the name-to-NIA dictionary; NIA_name holds the reverse mapping and can
# be printed instead, e.g. one 'NIA : name' pair per line
print(name_NIA)


At this point we have:

   * student_data: dataframe with data given to the students. Each index is a variable, and each column a NIA
   * student_results: dataframe with student results. Each index is a variable, and each column a name
   * NIA_name: NIA to name dictionary
   * name_NIA: name to NIA dictionary

## 3. Exam solution

In this section we implement the solution to the exam. This is a function that takes the variables generated for a given student and the answers provided by that student, and generates a structure with all possible answers, each possibly with an associated penalty term.

In [None]:
#print NIA_name

In [None]:
# Student whose data and answers are displayed. Any key of name_NIA is valid;
# by default take one of them, replace it to inspect somebody else
student = next(iter(name_NIA))
print(student_data[str(name_NIA[student])])
print(student_results[student])


In [None]:
def SolveLabXX(data, st_solution):
    """Solver for the practical
    Input parameters:
    data: A series with the data given to the student. The entries 'u', 'x',
          'M', 'varN' and 'varS' are read.
    st_solution: The solution provided by the student. Only the entry 's25'
          is read.

    Output: A dataseries, indexed by 'sMSE', 'vx', 'sMSE0', 's25' and 'SSE',
    where each element is a list of tuples
    with the format [(solution1, factor1), (solution2, factor2)]

    Factors are multiplicative factors to account for possible
    penalties. A factor 1 should be given to a solution that should
    not be penalized.
    """
    # scipy.linalg is a subpackage: import it explicitly instead of relying
    # on some other import having loaded it
    import scipy.linalg

    ds_values = []
    ds_indexes = []

    # M is used as an array size and as a slice bound, so it has to be a
    # plain integer (it may arrive wrapped in an array)
    M = int(np.asarray(data['M']).ravel()[0])
    u = data['u']
    noise_ratio = data['varN']/data['varS']

    ## Sec. 2.1
    sMSE = []

    # M x len(u) Toeplitz matrix: first column is u[0] followed by zeros,
    # first row is u. The row is flattened explicitly so that a column
    # vector is not taken for multidimensional input.
    c = np.concatenate((u[0], np.zeros((M-1,))))
    U = scipy.linalg.toeplitz(c, np.asarray(u).flatten())
    P = np.linalg.inv((U.dot(U.T) + noise_ratio*np.eye(M)))

    sMSE.append((P.dot(U).dot(data['x']),1))

    ds_values.append(sMSE)
    ds_indexes.append('sMSE')

    ## Sec. 2.2
    vx = []
    # A zero followed by the last M-1 samples of u in reversed order
    u_star = np.concatenate(([0], np.flipud(u[(-1*M+1)::]).flatten())).reshape((M,1))
    vx.append((data['varN'] + data['varN']* u_star.T.dot(P).dot(u_star), 1))

    ds_values.append(vx)
    ds_indexes.append('vx')

    ## Sec. 2.3
    sMSE0 = []
    # Same computation as Sec. 2.1 without row 2 of U; a zero is then put
    # back at position 2 of the result
    U2 = np.delete(U, 2, axis=0)
    P2 = np.linalg.inv((U2.dot(U2.T) + noise_ratio*np.eye(M-1)))
    sMSE2 = P2.dot(U2).dot(data['x'])
    sMSE0.append((np.insert(sMSE2, 2, [0], axis=0),1))

    ds_values.append(sMSE0)
    ds_indexes.append('sMSE0')

    ## Sec. 2.4
    s25 = []

    s = np.zeros((M,))
    mu = .005

    # 25 updates, each using one column of U and the matching sample of x
    for k in range(25):
        u_k = U[:,k]
        x_pred = u_k.dot(s)
        x_k = data['x'][k]
        s = s + mu * (x_k - x_pred) * u_k

    s25.append((s,1))

    ds_values.append(s25)
    ds_indexes.append('s25')

    ## Sec. 2.5

    SSE = []
    SSE.append((np.sum((data['x'].flatten() - U.T.dot(s))**2),1))
    # Allow student solution: the value obtained from the student's own s25
    # is accepted too, provided it was delivered with the expected size
    if not np.all(np.isnan(st_solution['s25'])):
        st_s = st_solution['s25'].flatten()
        if np.array_equal(s.shape, st_s.shape):
            SSE.append((np.sum((data['x'].flatten() - U.T.dot(st_s))**2),1))

    ds_values.append(SSE)
    ds_indexes.append('SSE')

    return pd.Series(ds_values, ds_indexes)
    

In [None]:
# Answers delivered by the selected student, followed by the accepted
# solutions computed from that student's own data
print(student_results[student])
print(SolveLabXX(student_data[str(name_NIA[student])], student_results[student]))

## 4. Evaluation of all students

In [None]:
def _is_missing(value):
    """Return True when value contains nothing but NaN."""
    try:
        return bool(np.all(np.isnan(value)))
    except TypeError:
        # np.isnan rejects non-numeric content (e.g. text): something was
        # delivered, even though it cannot be compared with a solution
        return False


def _is_close(expected, delivered, tol):
    """Return True when both values have the same number of elements and
    their mean absolute difference is below tol."""
    expected = np.asarray(expected).flatten()
    delivered = np.asarray(delivered).flatten()
    if expected.shape != delivered.shape:
        return False
    try:
        return bool(np.mean(np.abs(expected - delivered)) < tol)
    except TypeError:
        # Non-numeric delivery: it cannot match any solution
        return False


def automatic_evaluator(student_results, solution, weights, tolerance):
    """Grade the answers of one student against the accepted solutions.

    Input parameters:
    student_results: The answers of the student, indexed by variable name.
    solution: For every variable name, a list of tuples (value, factor) with
              the accepted values and their multiplicative factors.
    weights: Score of each variable, in the order of solution.keys()
    tolerance: Largest accepted mean absolute error for each variable, in
               the order of solution.keys()

    Output: A dataseries with one entry per variable, holding the list
    [delivered, correct, factor, weight, score], plus a 'FinalScore' entry
    with the sum of the scores. When several accepted values match, the
    largest factor is used. If the lengths of weights or tolerance differ
    from the number of variables, a message is printed and an empty list is
    returned instead.
    """
    variables = list(solution.keys())
    if len(variables) != len(weights) or len(variables) != len(tolerance):
        print('The number of weights and variables to evaluate differ. Please, check')
        return []

    val = []
    idx = []
    for wgh, el, tol in zip(weights, variables, tolerance):
        answer = student_results[el]
        if _is_missing(answer):
            # Variable not delivered
            var_summary = [0, 0, 0, wgh, 0]
        else:
            # Factors of all the accepted values that the answer matches
            factors = [entry[1] for entry in solution[el]
                       if _is_close(entry[0], answer, tol)]
            if factors:
                max_factor = max(factors)
                var_summary = [1, 1, max_factor, wgh, max_factor*wgh]
            else:
                var_summary = [1, 0, 0, wgh, 0]
        #Keep values corresponding to current variable
        val.append(var_summary)
        idx.append(el)

    # The score of each variable is the last element of its summary
    final_score = sum([item[-1] for item in val])
    val.append(final_score)
    idx.append('FinalScore')
    return pd.Series(val, index=idx)

In [None]:
# --- Exam-dependent settings -----------------------------------------------
# Excel file where the scores are written
excel_output = 'Notas_65_Lab3.xlsx'
# One weight and one tolerance per evaluated variable
weights = [3, 2, 1, 3, 1]
tolerance = [1e-2, 1e-2, 1e-2, 1e-2, 1e-2]
# ---------------------------------------------------------------------------

# One column per student, filled in by the loop below
df = pd.DataFrame()

for NIA in NIA_name:
    print(NIA_name[NIA])
    # Accepted solutions are computed from the data handed to this student
    solution = SolveLabXX(student_data[str(NIA)], student_results[NIA_name[NIA]])
    df[NIA_name[NIA].decode('utf8')] = automatic_evaluator(
        student_results[NIA_name[NIA]], solution, weights, tolerance)

print(df)

# Written transposed: one row per student, one column per entry of df.index
df.T.to_excel(excel_output, columns=df.index)

In [None]:
# Display the NIA to name dictionary
NIA_name
