In [2]:
from config import *
from main_package.utils import csv_to_df_fixed_params
from typing import Dict, Iterable, List, Set, Tuple
from math import exp, inf
from numpy.random import normal
from scipy.optimize import minimize
from scipy.special import expit # logistic function expit(x) = 1/(1+exp(-x))
from copy import deepcopy
import pandas as pd
from sklearn.metrics import mean_squared_error

In [3]:
# Load the train and test splits through the project helper.
# The displayed frames below show columns user_id, skill_id and correct.
# NOTE(review): the paths carry no file extension — presumably the helper resolves
# them against a data directory; confirm in main_package.utils.
df_train_fixed_params = csv_to_df_fixed_params("assistment/train_v3")
df_test_fixed_params = csv_to_df_fixed_params("assistment/test_v3")

In [4]:
# Peek at the first rows of the test split.
df_test_fixed_params.head()

Unnamed: 0,user_id,skill_id,correct
0,78371,67,False
1,78371,67,False
2,78371,279,True
3,78371,279,True
4,78371,67,True


In [5]:
# Column dtypes: user_id and skill_id are object columns, correct is bool.
df_test_fixed_params.dtypes

user_id     object
skill_id    object
correct       bool
dtype: object

In [6]:
# Spot-check skill_id values: some rows carry several skills joined by '_'
# (e.g. "10_14"), which is why later code splits skill_id on '_'.
# The sample is unseeded, so the rows shown change on every run.
df_test_fixed_params['skill_id'].sample(15)

10357      278
38327       27
23365       13
10145      311
34891    10_14
32360      278
30713    11_70
37294       47
364         50
13187      325
38951      279
9456       368
19835       65
19121       86
12723      311
Name: skill_id, dtype: object

In [5]:
def find_accumulating_answer_counts(df_fixed_params):
    """Attach a running per-skill answer-count snapshot to every row.

    Rows are visited in frame order. For each row, ``skill_id`` is split on
    ``'_'`` and the counter of every resulting skill is incremented under the
    row's ``correct`` value. The snapshot stored for a row is taken *after*
    that increment, so it already includes the row's own answer.

    The result is written in place to a new column
    ``'cumulative_count_dictionary'`` holding, per row, a dict of the form
    ``{True: {skill: n_correct}, False: {skill: n_incorrect}}``.

    NOTE(review): counts run across the whole frame and are never reset when
    ``user_id`` changes, and each snapshot includes the current row's outcome.
    Confirm both are intended before using the column as a model feature.

    Args:
        df_fixed_params: frame with a string ``skill_id`` column and a
            ``correct`` column whose values are ``True``/``False``.

    Returns:
        None. ``df_fixed_params`` is mutated in place.
    """
    running_counts = {True: {}, False: {}}
    snapshots = []
    # Iterating the two needed columns avoids building a Series per row
    # (iterrows), and one running counter avoids deep-copying the full nested
    # dict on every row.
    for skill_string, correct in zip(df_fixed_params['skill_id'],
                                     df_fixed_params['correct']):
        for skill in skill_string.split('_'):
            bucket = running_counts[correct]
            bucket[skill] = bucket.get(skill, 0) + 1
        # Values are plain ints, so copying the two inner dicts yields a fully
        # independent snapshot.
        snapshots.append({
            True: dict(running_counts[True]),
            False: dict(running_counts[False]),
        })
    df_fixed_params['cumulative_count_dictionary'] = snapshots


In [7]:
# Add the cumulative-count column to BOTH splits. The test frame is displayed
# right below with that column, so it has to be computed in this cell as well
# for the notebook to reproduce on a fresh kernel.
find_accumulating_answer_counts(df_train_fixed_params)
find_accumulating_answer_counts(df_test_fixed_params)
df_test_fixed_params.head()

Unnamed: 0,user_id,skill_id,correct,cumulative_count_dictionary
0,78371,67,False,"{True: {}, False: {'67': 1}}"
1,78371,67,False,"{True: {}, False: {'67': 2}}"
2,78371,279,True,"{True: {'279': 1}, False: {'67': 2}}"
3,78371,279,True,"{True: {'279': 2}, False: {'67': 2}}"
4,78371,67,True,"{True: {'279': 2, '67': 1}, False: {'67': 2}}"


In [38]:
from time import perf_counter

In [39]:
def factor_analysis(
    scale_difficulty: Dict[str, float],
    scale_correct: Dict[str, float],
    scale_incorrect: Dict[str, float],
    df_fixed_params: pd.DataFrame,
):
    """Compute a predicted probability of a correct answer for every row.

    For each row the logit is the sum, over the skills in ``skill_id`` (split
    on ``'_'``), of::

        scale_difficulty[skill]
        + scale_correct[skill]   * count under True for that skill
        + scale_incorrect[skill] * count under False for that skill

    where the counts are read from the row's ``cumulative_count_dictionary``
    (a missing skill counts as 0). The logits are passed through the logistic
    function ``expit``.

    Args:
        scale_difficulty: per-skill offset.
        scale_correct: per-skill weight on the count stored under ``True``.
        scale_incorrect: per-skill weight on the count stored under ``False``.
        df_fixed_params: frame with ``skill_id`` and
            ``cumulative_count_dictionary`` columns.

    Returns:
        The row-wise predictions, ``expit`` applied to the per-row logits.

    Side effects:
        Prints the elapsed wall-clock time of the computation.
    """
    started_at = perf_counter()

    def row_logit(row) -> float:
        skills_in_row = row['skill_id'].split('_')
        counts = row['cumulative_count_dictionary']
        per_skill_terms = []
        for skill in skills_in_row:
            per_skill_terms.append(
                scale_difficulty[skill]
                + scale_correct[skill] * counts[True].get(skill, 0)
                + scale_incorrect[skill] * counts[False].get(skill, 0)
            )
        return sum(per_skill_terms)

    predictions = expit(df_fixed_params.apply(row_logit, axis=1))
    finished_at = perf_counter()
    print(f"time for factor analysis: {round(finished_at-started_at, 1)}")
    return predictions

In [14]:
def random_initial_constants(
    skills: Iterable,
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float]]:
    """Draw random starting scales for every distinct skill.

    Each entry of ``skills`` is split on ``'_'``; for every distinct skill
    found, three values are drawn from normal distributions with standard
    deviation 0.5 and means 0 (difficulty), 0.2 (correct) and 0.1 (incorrect).

    Args:
        skills: iterable of skill strings, possibly holding several skills
            joined by ``'_'``.

    Returns:
        ``(scale_difficulty, scale_correct, scale_incorrect)``, three dicts
        keyed by skill.
    """
    unique_skills = {
        skill
        for skill_string in skills
        for skill in skill_string.split('_')
    }
    scale_difficulty = {}
    scale_correct = {}
    scale_incorrect = {}
    # Iterate in sorted order: the iteration order of a set of strings depends
    # on string hashing, so without sorting the same random seed could assign
    # the draws to different skills from one run to the next.
    for skill in sorted(unique_skills):
        scale_difficulty[skill] = normal(loc=0, scale=0.5)
        scale_correct[skill] = normal(loc=0.2, scale=0.5)
        scale_incorrect[skill] = normal(loc=0.1, scale=0.5)
    return scale_difficulty, scale_correct, scale_incorrect

In [23]:
# Inspect the train split, now carrying the cumulative_count_dictionary column.
df_train_fixed_params.head()

Unnamed: 0,user_id,skill_id,correct,cumulative_count_dictionary
0,73963,297,False,"{True: {}, False: {'297': 1}}"
1,73963,297,True,"{True: {'297': 1}, False: {'297': 1}}"
2,73963,297,True,"{True: {'297': 2}, False: {'297': 1}}"
3,73963,297,True,"{True: {'297': 3}, False: {'297': 1}}"
4,73963,297,False,"{True: {'297': 3}, False: {'297': 2}}"


In [41]:
def find_good_initial_scales(df_fixed_params: pd.DataFrame, num_iteration=100):
    """Random search for a good starting point of the scale parameters.

    Draws ``num_iteration`` independent sets of scales with
    ``random_initial_constants`` (at least one set is always drawn), scores
    each with ``factor_analysis`` against ``df_fixed_params['correct']`` and
    keeps the set with the lowest error. A summary of the winner is appended
    to ``best_initial_scales.txt``.

    The error is the *root* mean squared error. The ``mse`` names and the
    ``mse:`` label in the log file are kept so the existing log format stays
    unchanged.

    Args:
        df_fixed_params: frame with ``skill_id``, ``correct`` and
            ``cumulative_count_dictionary`` columns.
        num_iteration: total number of random draws to evaluate.

    Returns:
        ``(best_error, best_scales)`` where ``best_scales`` is the
        ``(scale_difficulty, scale_correct, scale_incorrect)`` tuple of the
        best draw.
    """
    def score(scales):
        predictions = factor_analysis(*scales, df_fixed_params)
        # Square root of the MSE equals what `squared=False` used to return;
        # that keyword is deprecated and removed in newer scikit-learn.
        return mean_squared_error(df_fixed_params['correct'], predictions) ** 0.5

    best_scales = random_initial_constants(df_fixed_params['skill_id'])
    current_best_mse = score(best_scales)
    for _ in range(num_iteration - 1):
        new_scales = random_initial_constants(df_fixed_params['skill_id'])
        # Score the NEW draw. Scoring the first draw's predictions again
        # would make every candidate tie with the first one.
        new_mse = score(new_scales)
        if new_mse < current_best_mse:
            current_best_mse = new_mse
            best_scales = new_scales
    with open("best_initial_scales.txt", "a") as f:
        f.write(f"num_iteration: {num_iteration}\nmse: {current_best_mse}\nbest_scales: {best_scales}\n")
    return current_best_mse, best_scales

# Run a short random search (5 draws) on the train split.
# The result is the (best_error, best_scales) tuple returned by
# find_good_initial_scales, not the scales alone.
initial_scales = find_good_initial_scales(df_train_fixed_params, 5)

time for factor analysis: 5.9
time for factor analysis: 4.9
time for factor analysis: 4.3
time for factor analysis: 4.8
time for factor analysis: 4.6


In [None]:
def pfa_minimize(params, train_fixed_params, skills):
    """Objective function for the ``scipy.optimize.minimize`` call (not yet written).

    The original cell held only ``def pfa_minimize()`` with no colon or body,
    which is a SyntaxError. This stub makes the cell valid and fails loudly
    when called.

    NOTE(review): the signature is inferred from the notebook's
    ``minimize(pfa_minimize, initial_params, (train_fixed_params, skills))``
    call, which invokes ``pfa_minimize(x, train_fixed_params, skills)``.
    Confirm the intended parameters when implementing.

    Raises:
        NotImplementedError: always, until the objective is implemented.
    """
    raise NotImplementedError("pfa_minimize has not been implemented yet")

In [33]:
# Fit the parameters: minimize starts from x0=initial_params and evaluates
# pfa_minimize(x, train_fixed_params, skills) at each candidate x.
# NOTE(review): initial_params, train_fixed_params and skills are not defined in
# the cells shown here — confirm they exist when running on a fresh kernel.
current_min = inf
result = minimize(
    fun=pfa_minimize,
    x0=initial_params,
    args=(train_fixed_params, skills),
)