In [22]:
# Helpers
import pickle
import numpy as np
import time
from collections import defaultdict
import warnings

# DB 
import psycopg2
from django.conf import settings

# Learning
from sklearn.decomposition import NMF

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# Open a PostgreSQL connection with the credentials of Django's "default" database.
# Each psycopg2 keyword is paired with the Django settings key it is read from.
connection = psycopg2.connect(
    **{
        keyword: settings.DATABASES["default"][setting_key]
        for keyword, setting_key in (
            ("user", "USER"),
            ("password", "PASSWORD"),
            ("host", "HOST"),
            ("port", "PORT"),
            ("database", "NAME"),
        )
    }
)
# Autocommit: every executed statement is committed immediately.
connection.autocommit = True
cursor = connection.cursor()

In [24]:
# Directory holding the pickled dataset splits; files are read as "<folder>/run_<i>.pkl".
folder = "data/edm2020"
# Number of dataset splits to process (run_0 ... run_<N_RUNS - 1>).
N_RUNS = 5

### Organizing data

In [25]:
def add_to_matrix(data, tensor, X, attempt_agg='avg', filter_attempt=False):
    """Fill a (student, question, attempt) tensor and collapse it to a 2-D matrix.

    Parameters
    ----------
    data : iterable of (s_idx, q_idx, a_idx, outcome)
        One record per attempt. The first three values are used as integer
        indices into ``tensor`` and ``X``.
    tensor : numpy.ndarray
        3-D array receiving the outcomes. Modified in place: every cell
        without a recorded attempt is overwritten with NaN, so it is expected
        to be a float array.
    X : numpy.ndarray
        Array indexed like ``tensor``. Modified in place: set to 1 wherever
        an attempt was recorded.
    attempt_agg : str
        Aggregation applied over the attempt axis. Only ``'avg'`` is
        implemented.
    filter_attempt : int or False
        When truthy, keep at most this many records per (student, question)
        pair, in the order they appear in ``data``; later records are skipped.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``tensor.shape[:2]`` holding the mean outcome over the
        recorded attempts. Attempted cells are floored at 0.1 and cells with
        no attempt are 0.

    Raises
    ------
    ValueError
        If ``attempt_agg`` is not ``'avg'``. Raised before any input is
        touched.
    """
    if attempt_agg != 'avg':
        raise ValueError(
            "Unsupported attempt_agg %r; only 'avg' is implemented" % (attempt_agg,))

    attempts_count = defaultdict(int)
    for s_idx, q_idx, a_idx, outcome in data:
        # `>=` keeps exactly `filter_attempt` records per pair
        # (`>` would let one extra record through).
        if filter_attempt and attempts_count[(s_idx, q_idx)] >= filter_attempt:
            continue
        attempts_count[(s_idx, q_idx)] += 1

        tensor[s_idx, q_idx, a_idx] = outcome
        X[s_idx, q_idx, a_idx] = 1

    # Mark cells without a recorded attempt as missing so nanmean ignores them.
    tensor[X == 0] = np.nan

    # nanmean warns on all-NaN slices (never-attempted pairs); silence that here.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        matrix = np.nanmean(tensor, axis=2)
        # 0.1 means the student at least tried the question, as opposed to 0,
        # which means the question was never attempted.
        matrix[matrix < 0.1] = 0.1
    matrix[np.isnan(matrix)] = 0

    return matrix
    
def transform_data(data, n_attempts):
    """Build the train and test student-by-question performance matrices.

    ``data`` is a mapping with the keys ``'users_idx'`` and ``'questions_idx'``
    (their lengths give the matrix dimensions) and ``'train_set'`` /
    ``'test_set'`` (2-D arrays whose column 2 is the attempt index).
    ``n_attempts`` is forwarded to ``add_to_matrix`` as ``filter_attempt``.

    Returns the tuple ``(train_matrix, test_matrix)``.
    """
    n_students = len(data['users_idx'])
    n_questions = len(data['questions_idx'])

    def build_matrix(records):
        # The attempt axis must be long enough for the largest attempt index.
        depth = np.max(records[:, 2]) + 1
        outcomes = np.zeros((n_students, n_questions, depth))
        observed = np.zeros(outcomes.shape)
        return add_to_matrix(records, outcomes, observed, filter_attempt=n_attempts)

    train_matrix = build_matrix(data['train_set'])
    test_matrix = build_matrix(data['test_set'])
    return train_matrix, test_matrix

Run a grid search to explore the hyperparameters

In [26]:
# Hyperparameter grid explored by the search.
# Number of latent concepts (NMF components) to try.
n_concepts = range(2, 20)
# Attempt filters passed to transform_data; False disables the filter.
attempts = [False, 1, 3]
# Values tried for NMF's l1_ratio.
l1_ratio = [0, 0.3, 0.5, 0.7, 1]
# Regularisation strengths: 0.0-0.9 in steps of 0.1, followed by the integers 1-9.
alpha = [*np.arange(0, 1, 0.1), *range(1, 10)]

In [27]:
# Total number of NMF configurations: one per combination of
# dataset split, attempt filter, concept count, l1_ratio and alpha.
search = (
    N_RUNS
    * len(attempts)
    * len(n_concepts)
    * len(l1_ratio)
    * len(alpha)
)
search

25650

In [30]:
# Rough wall-clock estimate of the whole search, in days.
# NOTE(review): 0.4 is presumably the expected seconds per configuration — confirm where it was measured.
days = (0.4*search)/(60*60*24)
days

0.11875

In [32]:
def run(*args):
    """Fit NMF for one grid-search configuration and insert the result row.

    ``args`` must unpack to ``(dataset, data, att, student_performance,
    student_performance_test, del_questions, concept, alpha, l1_ratio)``.

    The estimated Q-matrix is compared with ``data['q_matrix']`` (minus the
    columns listed in ``del_questions``) through the difference of their
    question-by-question cosine-similarity matrices. One row is then inserted
    into the table ``EDM2020_2020_02_19`` using the module-level ``cursor``.

    ``alpha`` and ``l1_ratio`` are used for the fit but are not stored in the
    inserted row. Returns ``None``.
    """
    (dataset, data, att, student_performance, student_performance_test,
     del_questions, concept, alpha, l1_ratio) = args

    # Time the factorization only.
    started = time.time()
    sp_hat, sk_hat, q_matrix_hat, reconstruction_error = non_negative_matrix_factorization(
        student_performance, concept, alpha, l1_ratio, init=10, max_iter=1000)
    elapsed = time.time() - started

    # Drop the removed questions from the reference Q-matrix so that both
    # similarity matrices cover the same questions.
    q_matrix = np.delete(data['q_matrix'], del_questions, axis=1)
    similarity_gap = (cosine_similarity(q_matrix.T)
                      - cosine_similarity(np.asarray(q_matrix_hat).T))
    squared_gap = np.power(similarity_gap, 2)
    q_matrix_error = np.sqrt(np.sum(squared_gap))
    q_matrix_rmse = np.sqrt(np.mean(squared_gap))

    # Key order defines the column order of the INSERT statement.
    row = dict(
        dataset="run_%d" % dataset,
        q_matrix=q_matrix,
        sp=student_performance,
        sp_test=student_performance_test,
        sp_hat=sp_hat,
        sk_hat=sk_hat,
        q_matrix_hat=q_matrix_hat,
        concepts=concept,
        attempts_train=int(att),
        method="nmf",
        q_matrix_error=q_matrix_error,
        q_matrix_rmse=q_matrix_rmse,
        reconstruction_error=reconstruction_error,
        train_error=0,
        test_error=0,
        seconds=elapsed,
    )

    # numpy arrays are converted to nested lists before being bound to the query.
    columns = list(row)
    values = tuple(
        value.tolist() if isinstance(value, np.ndarray) else value
        for value in row.values()
    )

    insert_query = ("INSERT INTO EDM2020_2020_02_19 "
                    + "(" + ", ".join(columns) + ") VALUES "
                    + "(" + ", ".join(["%s"] * len(columns)) + ")")
    cursor.execute(cursor.mogrify(insert_query, values))

In [33]:
# Queue one job (argument list for `run`) per grid-search configuration.
args = []

for restart in range(1):
    for dataset in range(N_RUNS):
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open("%s/run_%d.pkl" % (folder, dataset), "rb") as pklfile:
            data = pickle.load(pklfile)

        for att in attempts:
            # Train and test student-by-question matrices for this attempt filter.
            student_performance, student_performance_test = transform_data(data, att)

            # add_to_matrix floors attempted cells at 0.1, so a column that sums
            # to zero is a question nobody attempted in the training data.
            attempts_per_question = student_performance.sum(axis=0)
            del_questions = [
                idx for idx, total in enumerate(attempts_per_question) if total == 0
            ]
            # NOTE(review): only the training matrix loses these columns;
            # student_performance_test keeps them — confirm downstream code expects that.
            student_performance = np.delete(student_performance, del_questions, axis=1)

            # Values shared by every job of this (dataset, attempt filter) pair.
            job_prefix = [dataset, data, att, student_performance,
                          student_performance_test, del_questions]
            for ratio in l1_ratio:
                for strength in alpha:
                    for concept in n_concepts:
                        args.append(job_prefix + [concept, strength, ratio])

In [34]:
# Sanity check: the number of queued jobs should equal `search` computed earlier.
len(args)

25650

In [19]:
# Spot-check a single queued job.
# The regularisation values are unpacked into `sample_*` names on purpose:
# unpacking into `alpha` / `l1_ratio` would overwrite the module-level grids
# with scalars and break a re-run of the cell that builds `args`.
(dataset, data, att, student_performance, student_performance_test,
     del_questions, concept, sample_alpha, sample_l1_ratio) = args[12]
concept

14

In [35]:
# Index into `args` at which the run loop starts; raise it to resume an interrupted search.
next_id = 0

In [None]:
# Open a fresh connection for the long-running search, using the credentials
# of Django's "default" database.
db_settings = settings.DATABASES["default"]
connection = psycopg2.connect(user=db_settings["USER"],
                              password=db_settings["PASSWORD"],
                              host=db_settings["HOST"],
                              port=db_settings["PORT"],
                              database=db_settings["NAME"])
# Autocommit: every inserted row is committed immediately.
connection.autocommit = True
cursor = connection.cursor()

# NOTE: run() calls non_negative_matrix_factorization, which is defined in a
# later cell of this notebook; execute that cell before this one.
for item in args[next_id:]:
    run(*item)
    # Track progress so that re-running this cell after an interruption resumes
    # at the first unfinished job instead of re-inserting rows already committed.
    next_id += 1

In [36]:
def run_nmf(model, student_performance, concept, alpha, l1_ratio):
    """Fit ``model`` on ``student_performance`` and return its factors.

    Returns ``(student_knowledge, q_matrix, error)``: the output of
    ``model.fit_transform``, then ``model.components_`` and
    ``model.reconstruction_err_``.

    ``concept``, ``alpha`` and ``l1_ratio`` are accepted but not used here.
    """
    # Tuple elements are evaluated left to right, so the model is fitted
    # before its fitted attributes are read.
    return (
        model.fit_transform(student_performance),
        model.components_,
        model.reconstruction_err_,
    )
    
def non_negative_matrix_factorization(student_performance, concept, alpha, l1_ratio, init=10, max_iter=200,
                                      random_state=None):
    """Fit NMF several times and keep the fit with the lowest reconstruction error.

    One fit uses ``init='nndsvd'``; it is followed by ``init - 2`` fits with
    ``init='random'`` (``range(2, init)``). The first fit is always accepted as
    the initial best, so a result is returned however large its error is.

    Parameters
    ----------
    student_performance : array-like
        Matrix to factorize; passed to ``run_nmf``.
    concept : int
        Number of NMF components.
    alpha, l1_ratio : float
        Regularisation arguments forwarded to ``NMF``.
        NOTE(review): scikit-learn deprecated NMF's ``alpha`` keyword in 1.0
        (replaced by ``alpha_W`` / ``alpha_H``) — confirm the installed version
        still accepts it.
    init : int
        Controls the number of random restarts (``init - 2`` of them).
    max_iter : int
        Forwarded to ``NMF``.
    random_state : int, numpy.random.RandomState or None
        Seed for reproducible fits. An integer seeds one shared generator so
        the random restarts differ from each other but repeat across calls.
        ``None`` (default) leaves the NMF default untouched.

    Returns
    -------
    tuple
        ``(student_performance_pred, student_knowledge, q_matrix, error)`` of
        the best fit, where the prediction is
        ``np.dot(student_knowledge, q_matrix)``.
    """
    # A single generator shared by all fits: reusing one integer seed for each
    # model would make every random restart identical.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    def fit(init_method):
        model = NMF(n_components=concept, init=init_method, solver='cd', alpha=alpha,
                    l1_ratio=l1_ratio, max_iter=max_iter, random_state=rng)
        return run_nmf(model, student_performance, concept, alpha, l1_ratio)

    # The first fit uses nndsvd and is taken as the initial best.
    best_student_knowledge, best_q_matrix, best_error = fit('nndsvd')

    # Then try some random initialisations as well.
    for _ in range(2, init):
        student_knowledge, q_matrix, error = fit('random')
        if error < best_error:
            best_student_knowledge = student_knowledge
            best_q_matrix = q_matrix
            best_error = error

    best_student_performance_pred = np.dot(best_student_knowledge, best_q_matrix)

    return (best_student_performance_pred, best_student_knowledge, best_q_matrix, best_error)