In [120]:
import logging
import sys
import datetime
import pickle
from icecream import ic
from tqdm import tqdm

In [98]:

def time_now():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS |'.

    Passed to icecream as its output prefix, so it is re-evaluated on
    every ic() call.
    """
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return stamp + " |"

# Prefix every ic() line with a fresh timestamp and include the call-site context.
ic.configureOutput(prefix=time_now, includeContext=True)

In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt;
# Seed NumPy's global RNG so later np.random calls (e.g. the shuffle in batch_generator) are repeatable.
np.random.seed(42);

In [4]:
from sklearn.metrics import roc_auc_score;
from sklearn.metrics import classification_report;

In [5]:
# Show every column when a DataFrame is displayed (no '...' truncation).
pd.options.display.max_columns = None

In [6]:
# Load the interaction log and keep a fixed subset of its columns.
# NOTE: read_pickle can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('appmath.pkl')[[
    'graph_id',
    'user_id',
    'goal_id',
    'interaction_end_time',
    'learning_objective_name',
    'atom_id',
    'correct',
    'time_spent_answering_s',
    'time_spent_on_instruction_s',
    'goal_progress',
    'target_status_and_progress',
    'prev_concept_narrative',
    'duration_s',
    'is_target',
]]

In [7]:
# Prerequisite edge list between learning objectives; later cells read its
# 'source_lo_title' and 'dest_lo_title' columns.
df2=pd.read_csv('prereq_edges.csv')

In [8]:
# Undirected prerequisite graph whose nodes are learning-objective titles.
G = nx.Graph()
G.add_edges_from(zip(df2['source_lo_title'], df2['dest_lo_title']))

# One subgraph view per connected component; its list position is the component id.
subgraphs = [G.subgraph(component) for component in nx.connected_components(G)]

# title -> component id, and component id -> number of nodes.
nodeSubGraphId = {}
subgraphsNodeCountDict = {}
for component_id, component_graph in enumerate(subgraphs):
    # Also stamp the id on the nodes themselves as the 'subGraphId' attribute.
    nx.set_node_attributes(component_graph, component_id, 'subGraphId')
    subgraphsNodeCountDict[component_id] = component_graph.number_of_nodes()
    for title, attributes in component_graph.nodes(data=True):
        nodeSubGraphId[title] = attributes['subGraphId']

In [9]:
# Convert categorical columns to plain strings.
# NOTE: astype(str) renders missing values as the literal string 'nan'.
category_columns = [name for name in df.columns if df[name].dtype.name == 'category']
for name in category_columns:
    df[name] = df[name].astype(str)

In [10]:
# Component id of each row's learning objective; titles that are absent from
# the prerequisite graph get a missing value.
df['loGraphID'] = df['learning_objective_name'].apply(nodeSubGraphId.get)

In [11]:
# Per-atom mean of `correct` over answered rows. Despite the name, a higher
# value means the atom is answered correctly more often.
difficulty = (
    df.dropna(subset=['correct'])
      .groupby('atom_id')['correct']
      .mean()
      .rename('difficulty')
      .reset_index()
)

In [12]:
# Attach the per-atom difficulty to every interaction row.
# NOTE: not idempotent - re-running this cell adds suffixed difficulty_x / difficulty_y columns.
df = pd.merge(df, difficulty, on='atom_id', how='left')

In [13]:
#loIndex={lo:i for i,lo in enumerate( df.learning_objective_name.unique())}
#df['loIndex']=df['learning_objective_name'].apply(lambda x: loIndex[x])

In [14]:
def atomClassifier(x):
    """Classify an interaction by whether it carries an answer outcome.

    Parameters
    ----------
    x : scalar
        The row's `correct` value.

    Returns
    -------
    str
        'learningMaterial' when `x` is missing (None, NaN, NaT or pd.NA),
        otherwise 'question'.
    """
    # `x == None` is False for NaN, so float-typed missing values used to be
    # labelled as questions; pd.isna recognises every missing-value marker.
    return 'learningMaterial' if pd.isna(x) else 'question'

In [15]:
# Label every row as 'question' or 'learningMaterial' from its `correct` value.
df['atom_type'] = df['correct'].apply(atomClassifier)

In [16]:
# Numeric copy of `correct` (multiplying by 1 turns True/False into 1/0).
df['correctBinary'] = df['correct'] * 1
# Atom ids are handled as strings from here on.
df['atom_id'] = df['atom_id'].astype(str)


In [17]:
# Tag each prerequisite edge with the component id of its source title, falling
# back to the destination title's component when the source has no entry.
# The explicit None comparison matters: component id 0 is falsy, so `a or b` would be wrong here.
df2['graph_id']=df2.apply(lambda x: nodeSubGraphId.get(x['source_lo_title']) if nodeSubGraphId.get(x['source_lo_title'])!=None else nodeSubGraphId.get(x['dest_lo_title']),axis=1)

In [18]:
# For every component, number its learning-objective titles 0..k-1 (sorted
# order from np.unique) and store the per-edge endpoint indices in df2.
unique_lo_graph_id_dic = {}
for graph_id in df2.graph_id.unique():
    in_graph = df2.graph_id == graph_id
    titles = np.unique(np.concatenate((
        df2.loc[in_graph, 'source_lo_title'].unique(),
        df2.loc[in_graph, 'dest_lo_title'].unique(),
    )))
    title_to_index = {title: int(position) for position, title in enumerate(titles)}
    unique_lo_graph_id_dic[graph_id] = title_to_index
    # Map only this component's rows. The previous version ran a row-wise
    # apply over the whole of df2 for every component and threw most of it away.
    df2.loc[in_graph, 'source_index'] = df2.loc[in_graph, 'source_lo_title'].map(title_to_index)
    df2.loc[in_graph, 'dest_index'] = df2.loc[in_graph, 'dest_lo_title'].map(title_to_index)

In [19]:
# The index columns were filled piecewise (and so may be float); cast them to int.
for index_column in ('source_index', 'dest_index'):
    df2[index_column] = df2[index_column].astype(int)

In [20]:
# Keep only interactions whose learning objective belongs to a known component.
df = df[df.loGraphID.notna()]

In [21]:
# Index of each row's learning objective inside its own component
# (the numbering stored in unique_lo_graph_id_dic).
# NOTE(review): df is a filtered slice at this point, so this assignment may emit
# SettingWithCopyWarning on older pandas - confirm and consider taking a .copy() first.
df['loIndex']=df.apply(lambda x: unique_lo_graph_id_dic[x['loGraphID']].get(x['learning_objective_name'] ),axis=1)

In [22]:
# Sanity check: column-wise maxima of the edges in component 3.
df2[df2.graph_id==3].max()

source_lo_id                    eda7ab6a-8070-4fd8-8387-da17a40fd99e
dest_lo_id                      eda7ab6a-8070-4fd8-8387-da17a40fd99e
source_lo_title    Write biconditional statements in symbolic for...
dest_lo_title      Write biconditional statements in symbolic for...
graph_id                                                           3
source_index                                                      30
dest_index                                                        30
dtype: object

In [23]:
# Component id used by the edge-index cell that follows.
# NOTE(review): loGraphID is reassigned to 3 further down, before graphs are built - confirm 26 is still needed.
loGraphID=26

In [24]:
# Edge index of the selected component as a 2 x E array
# (row 0 = source indices, row 1 = destination indices).
# NOTE(review): preprocess() rebuilds the same array from its `goal` argument, so these
# globals look unused by the visible cells below - confirm.
loEdgeMapping=df2[df2['graph_id']==loGraphID][['source_index','dest_index']]
loEdgeIndex=loEdgeMapping.values.transpose()

In [25]:
from torch_geometric.data import HeteroData
import torch 


In [26]:
import torch_geometric.transforms as T

In [27]:
import torch_geometric as pyg
from torch_geometric.nn import GCNConv 
from torch_geometric.nn import SAGEConv, to_hetero ,LSTMAggregation
import  torch.nn as nn
import torch.nn.functional as F
import torch 
from torch_geometric.loader import DataLoader

In [45]:

def batch_generator(usersGraphDataList):
    """Randomly split samples into a 20% test part and an 80% train part.

    Parameters
    ----------
    usersGraphDataList : sequence
        Samples to split. It is left untouched: the shuffle runs on a
        shallow copy (previously the caller's list was reordered in place).

    Returns
    -------
    tuple of (list, list)
        ``(test_data, train_data)`` - note that the test part comes first.
        Uses NumPy's global RNG, so seed it for a repeatable split.
    """
    shuffled = list(usersGraphDataList)
    np.random.shuffle(shuffled)
    test_size = int(len(shuffled) * .2)
    return shuffled[:test_size], shuffled[test_size:]
    

In [52]:
def user_dataset_with_target_node(userDf,df2,user,goal,loEdgeIndex):
    '''
    Build node features and edge indices for one interaction sequence.

    The last row of ``userDf`` becomes the single ``target`` node; every other
    row is either a ``question`` (node type ``atom``) or a
    ``learningMaterial`` (node type ``lm``). Consecutive rows are linked in
    the order in which they appear in ``userDf``.

    Parameters
    ----------
    userDf : pandas.DataFrame
        Interaction rows (should already be filtered to one goal/graph) with
        the columns ``atom_type``, ``loIndex``, ``difficulty``,
        ``correctBinary`` and ``duration_s``. MODIFIED IN PLACE: the last
        row's ``atom_type`` is overwritten and the helper columns
        ``atomIndex``, ``atom_type_shift`` and ``atomIndex_shift`` are added.
    df2, user, goal :
        Not used by this function; kept so existing callers keep working.
    loEdgeIndex : numpy.ndarray
        2 x E array of lo -> lo edges; its maximum entry fixes the number of
        lo nodes.

    Returns
    -------
    featuresDict : dict
        ``'atom'`` (difficulty, correctness), ``'lm'`` (duration_s; only when
        learning materials exist), ``'lo'`` (a constant 1 per node) and
        ``'target'`` (difficulty).
    edgesDict : dict
        2 x E arrays. ``'atom_target'``, ``'lo_target'`` and ``'lo'`` are
        always present; ``'atom'`` (atom -> atom), ``'atom_lm'``,
        ``'lm_atom'``, ``'lm_lm'``, ``'lm_target'``, ``'lo_lm'`` and
        ``'lo_atom'`` only when they contain at least one edge.
    targetCorrectness : numpy.ndarray
        ``correctBinary`` of the target row.
    userDf : pandas.DataFrame
        The same (mutated) frame that was passed in.

    Raises
    ------
    IndexError
        If ``userDf`` has no rows.
    '''
    # Mark the last interaction as the prediction target. This is a positional
    # write: the former ``.loc[n - 1]`` only hit the last row when the index
    # was exactly 0..n-1 and otherwise appended a row or marked the wrong one.
    userDf.iloc[-1, userDf.columns.get_loc('atom_type')] = 'target'

    is_question = userDf.atom_type == 'question'
    is_lm = userDf.atom_type == 'learningMaterial'
    is_target = userDf.atom_type == 'target'

    # Questions and learning materials are numbered separately (each from 0)
    # but share the single column 'atomIndex'; the lone target gets index 0.
    atomIndex = np.arange(is_question.sum())
    lmIndex = np.arange(is_lm.sum())
    userDf.loc[is_question, 'atomIndex'] = atomIndex
    userDf.loc[is_lm, 'atomIndex'] = lmIndex
    userDf.loc[is_target, 'atomIndex'] = 0

    def _lo_edges(row_mask):
        # lo -> node edges for the selected rows.
        return userDf[row_mask][['loIndex', 'atomIndex']].values.transpose()

    loAtomEdgeIndex = _lo_edges(is_question)
    loLmEdgeIndex = _lo_edges(is_lm)
    loTargetEdgeIndex = _lo_edges(is_target)

    # Type and index of the next row, used to build the sequence edges.
    userDf['atom_type_shift'] = userDf.atom_type.shift(-1)
    userDf['atomIndex_shift'] = userDf.atomIndex.shift(-1)

    featuresDict = {}
    featuresDict['atom'] = userDf[is_question][['difficulty', 'correctBinary']].astype(float).to_numpy()
    if lmIndex.size > 0:
        featuresDict['lm'] = userDf[is_lm][['duration_s']].to_numpy()
    # Learning objectives carry no real feature: one constant 1 per node.
    featuresDict['lo'] = np.ones(loEdgeIndex.max() + 1).reshape(-1, 1)
    featuresDict['target'] = userDf[is_target][['difficulty']].to_numpy()
    targetCorrectness = userDf[is_target][['correctBinary']].astype(float).to_numpy()

    # Rows that have a successor (this drops the final, target row).
    userDf_shift = userDf[~userDf.atomIndex_shift.isna()].copy()

    def _sequence_edges(src_type, dst_type):
        # Edges from each `src_type` row to the `dst_type` row that follows it.
        selected = (userDf_shift.atom_type == src_type) & (userDf_shift.atom_type_shift == dst_type)
        return userDf_shift[selected][['atomIndex', 'atomIndex_shift']].values.transpose()

    edgesDict = {}
    optional_sequence_edges = (
        ('atom', 'question', 'question'),
        ('atom_lm', 'question', 'learningMaterial'),
        ('lm_atom', 'learningMaterial', 'question'),
        ('lm_lm', 'learningMaterial', 'learningMaterial'),
        ('lm_target', 'learningMaterial', 'target'),
    )
    for key, src_type, dst_type in optional_sequence_edges:
        edge_index = _sequence_edges(src_type, dst_type)
        if edge_index.size > 0:
            edgesDict[key] = edge_index

    # Always stored, even when empty.
    edgesDict['atom_target'] = _sequence_edges('question', 'target')

    if loLmEdgeIndex.size > 0:
        edgesDict['lo_lm'] = loLmEdgeIndex
    if loAtomEdgeIndex.size > 0:
        edgesDict['lo_atom'] = loAtomEdgeIndex

    edgesDict['lo_target'] = loTargetEdgeIndex
    edgesDict['lo'] = loEdgeIndex
    return featuresDict, edgesDict, targetCorrectness, userDf

In [53]:
def preprocess(df,df2,user,goal):
    """Build the graph inputs for one interaction frame.

    Selects the lo -> lo edges of the component whose ``graph_id`` equals
    ``goal`` from ``df2`` (as a 2 x E array) and passes everything on to
    ``user_dataset_with_target_node``.

    Returns the tuple ``(featuresDict, edgesDict, targetCorrectness, userDf)``
    produced by that function.
    """
    in_goal_graph = df2['graph_id'] == goal
    loEdgeIndex = df2.loc[in_goal_graph, ['source_index', 'dest_index']].values.transpose()
    return user_dataset_with_target_node(df, df2, user, goal, loEdgeIndex)
    
            

In [54]:
def generate_data_with_target(features,edges,targetCorrectness):
    """Assemble a PyG ``HeteroData`` graph from feature and edge dictionaries.

    Parameters
    ----------
    features : dict
        Node features keyed by ``'atom'``, ``'lo'``, ``'target'`` and,
        optionally, ``'lm'``.
    edges : dict
        2 x E edge-index arrays. ``'lo'``, ``'lo_target'`` and
        ``'atom_target'`` are required. ``'lo_atom'`` and ``'atom'`` fall back
        to an empty edge set when absent. ``'lm_lm'``, ``'atom_lm'``,
        ``'lm_atom'`` and ``'lm_target'`` are added only when present
        (``'lo_lm'`` is required whenever ``'lm'`` features exist).
    targetCorrectness : numpy.ndarray
        Label of the target node; stored as ``data['target'].y``.

    Returns
    -------
    HeteroData
    """
    def edge_tensor(key):
        # Missing key -> empty (2, 0) edge index, so the edge type always
        # exists instead of raising KeyError for sequences without such edges.
        if key in edges:
            return torch.from_numpy(edges[key]).long()
        return torch.empty((2, 0), dtype=torch.long)

    data = HeteroData()

    # Node features.
    data['atom'].x = torch.tensor(features['atom'].astype(np.float64)).float()
    data['lo'].x = torch.tensor(features['lo']).float()
    data['target'].x = torch.tensor(features['target']).float()

    # Edge types that every graph has.
    data['lo', 'lo_atom', 'atom'].edge_index = edge_tensor('lo_atom')
    data['lo', 'lo_lo', 'lo'].edge_index = torch.from_numpy(edges['lo']).long()
    data['lo', 'lo_target', 'target'].edge_index = torch.from_numpy(edges['lo_target']).long()
    data['atom', 'atom_atom', 'atom'].edge_index = edge_tensor('atom')
    data['atom', 'atom_target', 'target'].edge_index = torch.from_numpy(edges['atom_target']).long()

    # Learning-material nodes and edges exist only for some sequences.
    if 'lm' in features:
        data['lm'].x = torch.tensor(features['lm']).float()
        data['lo', 'lo_lm', 'lm'].edge_index = torch.from_numpy(edges['lo_lm']).long()

        # These keys live in `edges`; the former `in features` test was never
        # true, so the edges were silently dropped. The lm -> atom relation is
        # now named 'lm_atom' (it was 'atom_lm'), and 'lm_target' is attached too.
        optional_lm_edges = (
            ('lm_lm', ('lm', 'lm_lm', 'lm')),
            ('atom_lm', ('atom', 'atom_lm', 'lm')),
            ('lm_atom', ('lm', 'lm_atom', 'atom')),
            ('lm_target', ('lm', 'lm_target', 'target')),
        )
        for key, edge_type in optional_lm_edges:
            if key in edges:
                data[edge_type].edge_index = torch.from_numpy(edges[key]).long()

    data['target'].y = torch.from_numpy(targetCorrectness).long()
    return data

In [116]:
def data_batch_generator(graph_user_list):
    """Turn every interaction frame into a ``HeteroData`` graph.

    Reads the module-level globals ``df2`` (edge table) and ``loGraphID``
    (component id).

    Parameters
    ----------
    graph_user_list : list of pandas.DataFrame
        One frame per sample; each is copied before preprocessing because
        preprocessing mutates its input.

    Returns
    -------
    list
        One graph per input frame, in input order.

    Raises
    ------
    Exception
        Any error from preprocessing or graph assembly is logged together
        with the failing position and re-raised. Previously it was swallowed
        and the function silently returned None.
    """
    data_list = []
    for position, user_frame in enumerate(tqdm(graph_user_list)):
        try:
            # The old code passed the global `user_t`, which this notebook
            # never defines; take the user id from the frame itself instead.
            has_user = 'user_id' in user_frame and len(user_frame) > 0
            user = user_frame['user_id'].iloc[-1] if has_user else None
            featuresDict, edgesDict, targetCorrectness, userDf = preprocess(
                user_frame.copy(), df2, user, loGraphID
            )
            data_list.append(generate_data_with_target(featuresDict, edgesDict, targetCorrectness))
        except Exception as e:
            ic(position, e)
            raise
    return data_list
            

In [74]:
# Component whose users are turned into graphs; data_batch_generator reads this global.
loGraphID=3

In [75]:
# Interactions that belong to the selected component.
graph_df = df.loc[df['loGraphID'] == loGraphID]

In [76]:
# Rank users by their number of interactions (ascending) and keep ranks 20..99.
interactions_per_user = graph_df.groupby('user_id')['atom_id'].count().sort_values()
graph_user_list = interactions_per_user.iloc[20:100].index.to_list()

In [79]:
# Rows of the selected users, ordered per user by time, with a fresh 0..n-1 index.
selected_rows = graph_df[graph_df.user_id.isin(graph_user_list)]
graph_user_df = (
    selected_rows
    .sort_values(['user_id', 'interaction_end_time'])
    .reset_index(drop=True)
)

In [114]:
# Build one sample (a prefix frame ending in a question row) per answered
# question, starting with each user's third question.
# NOTE: graph_user_list is reused here - it held user ids above and now holds DataFrames.
graph_user_list=[]
question_count=0
# Samples are released to graph_user_list in groups of this size per user.
batch_size=10
# row_id doubles as a position for .iloc below, which relies on the 0..n-1 index of graph_user_df.
for row_id,row in graph_user_df.iterrows():
    if row_id==0:
        user_id=row['user_id']
        user_atoms=[]
    else:
        if user_id != row['user_id']:
            # New user: restart the question counter. Samples still waiting in
            # user_atoms (fewer than batch_size) are discarded here.
            question_count=0
            user_id=row['user_id']
            user_atoms=[]
    if row['atom_type'] =='question':
        question_count =question_count+1
        
        if question_count>2:
            # NOTE(review): iloc[:row_id+1] starts at the first row of the whole frame,
            # so for every user after the first the sample also contains all earlier
            # users' rows - confirm whether a per-user prefix was intended.
            user_atoms.append(graph_user_df.iloc[:row_id+1])
        # Also true for an empty user_atoms (0 % batch_size == 0); extending by [] is a no-op.
        if len(user_atoms)%batch_size==0:
            graph_user_list.extend(user_atoms)
            user_atoms=[]
            

In [117]:
# Number of prefix samples collected above.
len(graph_user_list)

1200

In [118]:
# Convert every prefix frame into a HeteroData graph (tqdm shows the progress).
data_list=data_batch_generator(graph_user_list)

100%|██████████████████████████████████████████████████████████████████████████████| 1200/1200 [01:28<00:00, 13.50it/s]


In [121]:
# Persist the generated graphs. The context manager closes the file even when
# pickle.dump raises; the manual open()/close() pair leaked the handle then.
file_name = "data_list.pkl"
with open(file_name, "wb") as open_file:
    pickle.dump(data_list, open_file)