In [None]:
import numpy as np
from psy import EmDina, MlDina
from psy.utils import r4beta

In [None]:
attrs = np.random.binomial(1, 0.5, (5, 60))
skills = np.random.binomial(1, 0.7, (1000, 5))


In [None]:
print(attrs)
print(attrs.shape)

In [None]:
print(skills)
print(skills.shape)

In [None]:
g = r4beta(1, 2, 0, 0, (1, 60))
no_s = r4beta(2, 1, 1, 1, (1, 60))

In [None]:
print(g)

print(g.shape)

In [None]:
print(no_s)
print(no_s.shape)

In [None]:
temp = EmDina(attrs=attrs)
yita = temp.get_yita(skills)
p_val = temp.get_p(yita, guess=g, no_slip=no_s)
score = np.random.binomial(1, p_val)


In [None]:
# 5 attributes, 60 items

print(attrs)
print(attrs.shape)

In [None]:
# convert score to float
score = score.astype(float)

# input NaN randomly into score
score[np.random.binomial(1, 0.1, score.shape).astype(bool)] = np.nan

In [None]:
# Missing data imputation

# (1) Convert NaN in score to 0
score[np.isnan(score)] = 0

In [None]:
print(score)
print(score.shape)

# 1000 students, 60 items, 5 attributes

In [None]:
# Estimate the parameters
em_dina = EmDina(attrs=attrs, score=score)
est_no_s, est_g = em_dina.em()

print(np.mean(np.abs(est_no_s - no_s)))
print(np.mean(np.abs(est_g - g)))

In [None]:

# Estimate the student's skill mastery
dina_est = MlDina(guess=est_g, no_slip=est_no_s, attrs=attrs, score=score)
est_skills = dina_est.solve()

In [None]:
print(est_skills)
print(est_skills.shape)

### Test on PSY107

In [1]:
import pandas as pd
import numpy as np
from psy import EmDina, MlDina
from psy.utils import r4beta

In [2]:
df = pd.read_csv("data/PSY107 Report 1.csv")
curriculum_mapping = pd.read_csv("data/PSY107 curriculum_mapping.csv")

In [None]:
student_ability_df = df.iloc[1:, [0, 1, 2]]
student_ability_df = student_ability_df.rename(
    columns={'Unnamed: 0': 'Student ID', 'Item ID': 'Student Ability', 'Unnamed: 2': 'SE'}).reset_index(drop=True)

# convert student ability to numeric
student_ability_df['Student Ability'] = pd.to_numeric(student_ability_df['Student Ability'])
# convert SE to numeric
student_ability_df['SE'] = pd.to_numeric(student_ability_df['SE'])
# create column Upper_SEM
student_ability_df['Upper_SE'] = student_ability_df['Student Ability'] + student_ability_df['SE']
# create column Lower_SEM
student_ability_df['Lower_SE'] = student_ability_df['Student Ability'] - student_ability_df['SE']

# Extract the first three rows starting from the second column
df = df.drop(['Unnamed: 2'], axis=1)
item_df = df.iloc[:1, 1:]
item_df = item_df.T
item_df = item_df.reset_index()
item_df = item_df.rename(columns=item_df.iloc[0]).drop(item_df.index[0]).reset_index(drop=True)
item_df.rename(columns={'Student Ability': 'Item Difficulty'}, inplace=True)

new_df = df.iloc[1:, :]
new_df = new_df.rename(columns={'Unnamed: 0': 'Student ID', 'Item ID': 'Student Ability'})
new_df = new_df.set_index(['Student ID', 'Student Ability']).rename_axis(
    ['Item ID'], axis=1).stack().reset_index().rename(columns={0: 'Response'})
# drop rows in column 'Response' with blanks
new_df = new_df[new_df['Response'] != ' ']
new_df = new_df[new_df['Response'] != '']

# merge the two dataframes of student ability and item difficulty
new_df = new_df.merge(item_df, on='Item ID', how='left')
new_df = new_df.merge(student_ability_df[['Student ID', 'SE', 'Upper_SE', 'Lower_SE']], on='Student ID', how='left')

# convert new_df['Student Ability` and new_df['Item Difficulty'] to numeric`]
new_df['Student Ability'] = pd.to_numeric(new_df['Student Ability'])
new_df['Item Difficulty'] = pd.to_numeric(new_df['Item Difficulty'])
# convert new_df['Item ID'] to numeric
new_df['Item ID'] = pd.to_numeric(new_df['Item ID'])
# convert new_df['SE'] to numeric
new_df['SE'] = pd.to_numeric(new_df['SE'])

# Replace string '1.0' to '1' 
new_df['Response'] = new_df['Response'].replace(1.0, '1')
# Replace string '0.0' to '0'
new_df['Response'] = new_df['Response'].replace(0.0, '0')

# convert new_df['Response'] to object
new_df['Response'] = new_df['Response'].astype(str)

In [None]:
# Pivot the dataframe to create the Q-matrix
new_matrix = new_df.pivot(index='Student ID', columns='Item ID', values='Response')

# Fill missing values with 0 (assuming no response means a 0)
new_matrix = new_matrix.fillna(0)

# Convert 1s and 0s to float
new_matrix = new_matrix.astype(float)

# Get array
psy107_scores = new_matrix.values

In [None]:
curriculum_mapping = curriculum_mapping[['Item ID', 'Node']]

# filter out the Item_ID that are not found in new_matrix.columns
curriculum_mapping = curriculum_mapping[curriculum_mapping['Item ID'].isin(new_matrix.columns)]

In [None]:
# Get unique item IDs and nodes
item_ids = curriculum_mapping['Item ID'].unique()
nodes = curriculum_mapping['Node'].unique()

# Create an empty Q-matrix
q_matrix = pd.DataFrame(0, columns=nodes, index=item_ids)

# Populate the Q-matrix with 1s based on associations
for index, row in curriculum_mapping.iterrows():
    item_id = row['Item ID']
    node = row['Node']
    q_matrix.loc[item_id, node] = 1

In [None]:
q_matrix

In [None]:
psy107_attributes = q_matrix.T.values
print(psy107_attributes[0])
print(psy107_attributes.shape)  # 9 attributes, 83 items

In [None]:
print(psy107_scores.shape)
print(psy107_attributes.shape)

In [2]:
# set seed 
np.random.seed(123)

# create binary response of 15 items and 100 students
dat = np.random.binomial(1, 0.5, (100, 15))
# convert dat to float
dat = dat.astype(float)
# randomly insert na into dat
dat[np.random.binomial(1, 0.1, dat.shape).astype(bool)] = np.nan
print(dat)
print(dat.shape)

print('')
# create array of 15 items and 3 attributes
Q = np.random.binomial(1, 0.5, (3, 15))
print(Q)
print(Q.shape)

# Estimate the parameters
em_dina = EmDina(attrs=Q, score=dat, max_iter=100, tol=1e-3)
est_no_s, est_g = em_dina.em()

# Estimate the student's skill mastery
dina_est = MlDina(guess=est_g, no_slip=est_no_s, attrs=Q, score=dat)
est_skills = dina_est.solve()

print(est_skills)


[[ 1.  0. nan ...  0.  0.  0.]
 [ 1.  0.  0. ... nan  0.  1.]
 [ 0.  0.  0. ...  0.  0.  1.]
 ...
 [ 1.  1.  1. ... nan nan  1.]
 [ 0. nan  0. ...  1.  1.  0.]
 [ 0.  0.  1. ...  0.  1.  0.]]
(100, 15)

[[1 1 0 1 0 1 1 0 0 1 1 0 0 1 0]
 [0 1 0 1 1 0 0 1 1 0 1 0 1 1 1]
 [0 0 1 1 1 1 0 0 0 0 0 1 0 0 0]]
(3, 15)
iter 0
1
2
3
4
5
6
guess [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
no_slip [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
iter 1
1
2
3
4
5
6
guess [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
no_slip [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
iter 2
1
2
3
4
5
6
guess [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
no_slip [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
iter 3
1
2
3
4
5
6
guess [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
no_slip [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
iter 4
1
2
3
4
5
6
guess [nan nan nan nan nan nan nan nan nan

ConvergenceError: no Convergence

In [45]:
est_skills

array([[1, 0, 1],
       [1, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 1],
       [1, 1, 0],
       [0, 0, 1],
       [0, 1, 1],
       [1, 1, 0],
       [1, 0, 0],
       [0, 1, 0],
       [1, 1, 1],
       [0, 1, 0],
       [0, 1, 0],
       [1, 1, 0],
       [0, 0, 0],
       [1, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 1],
       [1, 0, 0],
       [1, 1, 0],
       [0, 1, 0],
       [0, 0, 0],
       [1, 0, 0],
       [0, 1, 1],
       [1, 1, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [1, 1, 0],
       [1, 0, 1],
       [0, 1, 1],
       [0, 0, 0],
       [1, 1, 0],
       [0, 1, 0],
       [1, 0, 0],
       [1,

In [None]:
# Estimate the parameters
em_dina = EmDina(attrs=psy107_attributes, score=psy107_scores, max_iter=10000, tol=1e-3)
est_no_s, est_g = em_dina.em()


In [None]:
est_g, est_no_s = 0., 0.

# Estimate the student's skill mastery
dina_est = MlDina(guess=est_g, no_slip=est_no_s, attrs=psy107_attributes, score=psy107_scores)
est_skills = dina_est.solve()

In [None]:
# get random int
random_int = np.random.randint(0, 100)
print(random_int)
est_skills[random_int]

In [None]:
est_skills.shape

In [None]:
# show entire column
pd.set_option('display.max_columns', None)
new_matrix[new_matrix.index == 'abigaillim002@suss.edu.sg']

In [None]:
# merge curriculum_mapping to new_df based on Item_ID
new_df_2 = new_df.merge(curriculum_mapping, on='Item ID', how='left')

In [None]:
est_skills[0]

In [None]:
new_df_2[new_df_2['Student ID'] == 'abigaillim002@suss.edu.sg'].sort_values(by=['Item ID'])