In [1]:
import numpy as np
import pandas as pd

from knight.scrubbing import describe_dupe_cor_ans, get_item_cor_ans, recode_as_omitted, timing_exclusion, remove_repeat_questions, combine_CIinfo, removed_record_count, clean_item_data
from knight.scrubbing import make_user_level_matrices, make_item_level_info, make_activity_level_info, make_user_level_info, resp_cleaning

from knight.salvador import color, teleg_msg, merge_size, ch_dtype, db_con

In [2]:
program = 'USMLE_s2'
analysis_name = program
results_path = 'C:\\Users\\VImmadisetty\\Downloads\\DATA\\DRCR\\MED 2022\\Results\\' + program + '\\'
data_path = 'C:\\Users\\VImmadisetty\\Downloads\\DATA\\DRCR\\MED 2022\\Data\\' + program + '\\'

# Reading the data

In [3]:
activity_df = pd.read_csv(data_path + 'activity_info.tsv', sep='\t', parse_dates=['timestamp_created', 'timestamp_completed']).drop_duplicates(ignore_index=True)
response_df = pd.read_csv(data_path + 'response_data.tsv', sep='\t').drop_duplicates(ignore_index = True)

content_df = pd.read_csv(data_path + 'content_info.tsv', sep='\t', parse_dates = ['last_modified']).drop_duplicates(ignore_index = True)

In [4]:
print('Shape of activity_df :', activity_df.shape)
print('Total Students : ', activity_df['student_id'].nunique())
print('Total Activities : ', activity_df['activity_id'].nunique())
print('Total Enrollments : ', activity_df['enrollment_id'].nunique())
print('Template Id : ', activity_df['template_id'].unique())

print('-'*50)

print('Shape of response_df :', response_df.shape)
print('Total Students : ', response_df['student_id'].nunique())
print('Total Activities : ', response_df['activity_id'].nunique())
print('Total Content Item Ids : ', response_df['content_item_id'].nunique())
print('Total Content Item Names : ', response_df['content_item_name'].nunique())

print('-'*50)

print('Shape of content_df :', content_df.shape)
print('Total Content Item Ids : ', content_df['content_item_id'].nunique())
print('Total Content Item Names : ', content_df['content_item_name'].nunique())
print('Is Correct_answer column available? ', 'correct_answer' in content_df.columns)
#print('Data available in correct answer columns :', content_df['correct_answer'].nunique())

Shape of activity_df : (60368, 25)
Total Students :  3572
Total Activities :  60368
Total Enrollments :  4069
Template Id :  [nan]
--------------------------------------------------
Shape of response_df : (1169771, 21)
Total Students :  3572
Total Activities :  60367
Total Content Item Ids :  4697
Total Content Item Names :  2716
--------------------------------------------------
Shape of content_df : (4697, 36)
Total Content Item Ids :  4697
Total Content Item Names :  2716
Is Correct_answer column available?  True


# Data handling

In [5]:
response_df.rename(columns={'history_db_id': 'historiesDb',
                            'student_id' : 'jasperUserId',
                            'activity_id' : 'sequenceId',
                            'item_position': 'position',
                            'section_title' : 'sectionTitle',
                            'item_section_position' : 'displaySeq',
                            'content_item_id' : 'contentItemId',
                            'content_item_name' : 'contentItemName',
                            'interaction_type': 'interactionType',
                            'milliseconds_used' : 'mSecUsed',
                            'is_scored' : 'scored',
                            'scored_response' : 'score',
                            'raw_response' : 'response',
                            'field_test' : 'fieldTest',
                           'item_status': 'responseStatus'}, inplace = True)
#response_df['correctAnswer'] = np.nan
response_df['sectionName'] = response_df['sectionTitle']

response_df['contentItemName'] = np.where(pd.isnull(response_df['contentItemName']), response_df['contentItemName'], response_df['contentItemName'].astype('str').str.lower())

In [6]:
#userids are int64 in activity df and string in response df so we are removing student ids that are actual strings
#from response df and merging with activity df
numers = [x for x in response_df['jasperUserId'].unique() if (type(x) == int or type(x) == float or type(x) == np.int64 or type(x) == np.float64 or x.isnumeric()) and not pd.isna(x)]

response_df = response_df[response_df['jasperUserId'].isin(numers)].copy()

In [7]:
#Below to avoid issues while merging with activity_df
#below will change the string type of ints to ints
response_df['jasperUserId'] = response_df['jasperUserId'].apply(lambda x: ch_dtype(x))
response_df['sequenceId'] = response_df['sequenceId'].apply(lambda x: ch_dtype(x))
response_df['historiesDb'] = response_df['historiesDb'].apply(lambda x: ch_dtype(x))

In [8]:
activity_df.rename(columns={'history_db_id': 'historiesDb',
                            'student_id' : 'jasperUserId',
                            'enrollment_id' : 'kbsEnrollmentId',
                            'activity_id' : 'sequenceId',
                            'template_id' : 'templateId',
                            'template_name' : 'sequenceName',
                            'sequence_title' : 'sequenceTitle',
                            'timestamp_created' : 'dateCreated',
                            'timestamp_completed' : 'dateCompleted',
                           'tutor_mode' : 'tutorMode',
                           'status':'sequenceStatus'}, inplace = True)

activity_df['jasperUserId'] = activity_df['jasperUserId'].apply(lambda x: ch_dtype(x))
activity_df['sequenceId'] = activity_df['sequenceId'].apply(lambda x: ch_dtype(x))
activity_df['historiesDb'] = activity_df['historiesDb'].apply(lambda x: ch_dtype(x))

#merging responses and activity data and changing the names of statuses
print('Response_df size before merging with activity_df :', response_df.shape)
initi_cols = response_df.columns
response_df = pd.merge(response_df.drop(columns=['source_system']), activity_df.drop(columns=['source_system']), on = ['jasperUserId', 'sequenceId', 'historiesDb'], how = 'inner')

print('Response_df size after merging with activity_df:', response_df.shape)
print('\nNew columns added from activity_df with activity_df:\n', set(response_df)^set(initi_cols))

Response_df size before merging with activity_df : (1169771, 22)
Response_df size after merging with activity_df: (1169771, 42)

New columns added from activity_df with activity_df:
 {'activity_type', 'total_scored_items', 'tutorMode', 'product_code', 'dateCompleted', 'sequenceName', 'total_scored_items_not_reached', 'total_items_remediated', 'source_system', 'sequenceStatus', 'templateId', 'program', 'total_scored_items_time_elapsed', 'activity_title', 'total_items_remediation_time', 'dateCreated', 'alternate_timing', 'product_name', 'kbsEnrollmentId', 'total_scored_items_answered_correct', 'total_scored_items_omitted', 'total_scored_items_answered'}


In [9]:
#changing kbs_enrollment_id to string as in cleaning function to remove 0 kbseids we are comparing as string
response_df['kbsEnrollmentId'].fillna(value = 0, inplace = True)
response_df['kbsEnrollmentId'] = response_df['kbsEnrollmentId'].astype('str')

In [10]:
content_df = content_df[['source_system', 'content_item_id', 'content_item_name', 'content_item_type', 'interaction_type_name', 'count_choices', 'correct_answer', 'last_modified']].copy()

#creating item rank based on last modified to get latest item's item_id
content_df['item_rank'] = content_df.groupby(['content_item_name'])['last_modified'].rank(method = 'first', ascending=False)

#extracting count_choices and correct_answer on item_name if they are same at all in each group
#fortunately they are all same
item_metadata = content_df.groupby(['content_item_name'], as_index = False).agg(count_choices = ('count_choices', lambda x: list(np.unique(x))[0]),
                                                                correct_answer = ('correct_answer', lambda x: list(np.unique(x))[0]))

#merging with meta data
content_df = pd.merge(content_df[content_df['item_rank']==1].drop(columns=['count_choices', 'correct_answer', 'item_rank']), item_metadata, on = ['content_item_name'])

In [10]:
content_df.rename(columns = {'content_item_id' : 'contentItemId',
                            'content_item_name' : 'contentItemName',
                            'interaction_type_name' : 'interactiontypename',
                            'count_choices' : 'countchoices',
                            'parent_item_id' : 'parentid',
                            'parent_item_name' : 'parentname',
                            'correct_answer':'correctAnswer'}, inplace = True)

content_df['contentItemName'] = np.where(pd.isnull(content_df['contentItemName']), content_df['contentItemName'], content_df['contentItemName'].astype('str').str.lower())


#category name vary based on problem, so check the data
#merging with response_df
resp_bef_size = response_df.shape[0]
print('shape of response_df before merging with content_df :', response_df.shape)

response_df = pd.merge(response_df, content_df,
                       how = 'inner',
                      on = ['contentItemId', 'contentItemName'])
resp_aft_size = response_df.shape[0]

print('shape of response_df after merging with content_df :', response_df.shape)

if (resp_bef_size != resp_aft_size):
    print(color.BOLD + 'Size of response df is not same after merging with content df' + color.END)

shape of response_df before merging with content_df : (1169771, 42)
shape of response_df after merging with content_df : (1169771, 76)


In [11]:
#cleaning and sorting raw_response based on interaction-type
response_df['interactionType'] = response_df['interactionType'].str.lower()
response_df.loc[~(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in'])), 'response'] = response_df[~(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in']))]['response'].apply(lambda x: resp_cleaning(x) if not pd.isna(x) else x)

response_df.loc[(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in'])), 'response'] = response_df[(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in']))]['response'].apply(lambda x: x.strip() if not pd.isna(x) else x)

In [12]:
#summary to from response df to make test map
response_summary = response_df[response_df['sequenceStatus'].str.lower() == 'completed'].groupby(['templateId', 'sequenceName', 'jasperUserId'], as_index = False, dropna = False).agg(num_responses = ('contentItemName', 'nunique'))\
.groupby(['templateId', 'sequenceName'], dropna = False, as_index = False).agg(min_resp = ('num_responses', 'min'),
                                            median_resp = ('num_responses', 'median'),
                                            max_resp = ('num_responses', 'max'),
                                            num_users = ('num_responses', 'count'))

response_summary

Unnamed: 0,templateId,sequenceName,min_resp,median_resp,max_resp,num_users
0,,usmle-s2-qbank-sequence-recipe,1,90,2532,3572


* **Reading Qbank items**

In [30]:
comlex_l1 = pd.read_excel('C:\\Users\\VImmadisetty\\Downloads\\DATA\\DRCR\\MED 2022\\QBank Item List.xlsx', sheet_name= program).drop_duplicates()
comlex_l1.rename(columns = {'QID': 'contentItemName'}, inplace=True)
comlex_l1['contentItemName'] = np.where(pd.isnull(comlex_l1['contentItemName']), comlex_l1['contentItemName'], comlex_l1['contentItemName'].astype('str').str.lower())

In [31]:
items_merging = pd.merge(response_df[['contentItemName']].drop_duplicates(), comlex_l1, on = ['contentItemName'], how = 'outer', indicator = True)
unmatched_items = items_merging[items_merging['_merge']=='right_only'][['contentItemName']]
unmatched_items.to_csv(results_path + program + '_unmatched_items.csv', index = False)

matched_items = items_merging[items_merging['_merge']=='both'][['contentItemName']]

In [32]:
unmatched_items.reset_index()

Unnamed: 0,index,contentItemName
0,2714,q0004m
1,2715,m000989m2
2,2716,m000814m
3,2717,q0647m3
4,2718,q0170m
5,2719,q0401m
6,2720,s2s359m
7,2721,mb000080m2
8,2722,m000402m
9,2723,m000403m


In [13]:
test_map = pd.DataFrame()
test_map['templateId'] = np.nan
test_map['jasperSequenceName'] = response_summary['sequenceName']

test_map['sectionName'] = np.nan

test_map['numQues'] = np.nan
test_map['responseThreshold'] = np.nan
test_map['minutesAllowed'] = np.nan

test_map

Unnamed: 0,templateId,jasperSequenceName,sectionName,numQues,responseThreshold,minutesAllowed
0,,usmle-s2-qbank-sequence-recipe,,,,


# Connecting to Db for Repeats enrolls

In [19]:
try:
    engine
except:
    engine = db_con()

query_frt = """select
    distinct ph.id kbsenrollmentid
from
    kbs_billing.purchase_history ph
    join bi_reporting.vw_product_detail prd on ph.product_id = prd.product_id
where
    lower(prd.product_subtype) in ('free trial');
    """

try:
    frt_enrols
except:
    frt_enrols = pd.read_sql(sql = query_frt, con = engine)

query_olc = """select
    distinct ph.id kbsenrollmentid
from
    kbs_billing.purchase_history ph
    join bi_reporting.vw_product_detail prd on ph.product_id = prd.product_id
where
    lower(prd.product_subtype) in ('online companion')
    ;"""

try:
    olc_enrols
except:
    olc_enrols = pd.read_sql(sql = query_olc, con = engine)

query_hsg_repeat = """select
    distinct ph.id kbsenrollmentid
from
    kbs_billing.purchase_history ph
    join bi_reporting.vw_product_detail prd on ph.product_id = prd.product_id
where
    ph.initial_delta_k_txn_code in ('404', '405', '406');
    --and ph.created_on >= '2020-01-01';
"""

try:
    repeaters
except:
    repeaters = pd.read_sql(sql = query_hsg_repeat, con = engine)
    repeaters['kbsenrollmentid'] = repeaters['kbsenrollmentid'].astype('str')

# Cleaning the data

In [14]:
#If we know the correct answers for all the items in response_df and correct answer column in response_df is not null
#then we can use no_correctAnswer = False
#and by default we work on contentItemId to get multiple correct answers
#we get correct answers df here, we need to filter items that are having multiple versions of items

cor_ans, dup_cor_ans, dup_ca_cnt = get_item_cor_ans(response_df, no_correctAnswer = True, use_contentItemName=True)

  contentItemName response  score_count            min_date  \
0       m000319mk      3.0           24 2022-01-10 18:25:08   
1       m000319mk      4.0           22 2022-01-10 19:36:01   

             max_date  
0 2022-02-28 06:14:34  
1 2022-03-01 15:28:59  
[1mDuplicate correct answers for some items[0m


In [21]:
dup_ca_cnt

Unnamed: 0,contentItemName,response,score_count,min_date,max_date
0,m000319mk,3.0,24,2022-01-10 18:25:08,2022-02-28 06:14:34
1,m000319mk,4.0,22,2022-01-10 19:36:01,2022-03-01 15:28:59


In [21]:
#cor_ans = cor_ans.drop(columns = ['contentItemId', 'displaySeq']).drop_duplicates(ignore_index = True)

In [15]:
cor_ans.groupby(by=['contentItemName']).filter(lambda x: len(x)>1).sort_values(by=['contentItemName'])

Unnamed: 0,contentItemId,contentItemName,displaySeq,correctAnswer,max_dateCreated


In [23]:
#removing rows that are not needed for duplicate answer key items, so be careful with this
#cor_ans = cor_ans.drop(labels = [1708])

In [24]:
CI_old_keys = pd.DataFrame({'contentItemName':['m000319mk'],
                           'correctAnswer':['3.0']})

CI_old_keys

Unnamed: 0,contentItemName,correctAnswer
0,m000319mk,3.0


## Running cleaning function

In [25]:
data_pool = {'CI_old_version_dates' : pd.DataFrame(),
'CI_old_version_list' : pd.DataFrame(),
'CI_old_keys' : CI_old_keys,
'frt_enrols' : frt_enrols,
'olc_enrols' : olc_enrols,
'repeaters' : repeaters,
'section_map' : pd.DataFrame(),
'test_map' : test_map,
'seqHist_to_exclude' : pd.DataFrame(),
'cidf' : pd.DataFrame(),
'field_test_items' : pd.DataFrame(),
'ci_cols_to_include' : pd.DataFrame(),
'interaction_type_list' : pd.DataFrame()}

#field_test_items should have column name'contentItemName' to match with response_df

In [26]:
result, cleaning_info, rejects_df = clean_item_data(data_path = data_path,
                         results_path = results_path,
                        analysis_name = analysis_name,
                         resp = response_df,
                         remove_users_deleted_sequences = True,
                         remove_dup_CIs = True,
                         remove_no_kbsEID = True,
                         remove_deleted_sequences = True,
                         remove_impo_response_scored = True,
                         remove_impo_timing_seq = True,
                         remove_seq_w_tmq = True,
                         remove_staged_responses = False,
                         remove_FT_items = False,
                        data_pool = data_pool,
                         CI_remove_before_after = 'before', #applicable for old version dates
                         repeat_treatment = 'omit', #default omit
                         mSec_min_threshold = 5000, #there's y/n condition for timing if none provided
                         mSec_max_threshold = 600000, #so if condition is not needed.
                         sec_min_threshold = None,
                         sec_max_threshold = None,
                         remove_frt_users = True,
                         remove_olc_users = True,
                         remove_repeat_enrolls = True,
                         remove_tutor = True,
                         remove_ada_seq = True,
                         remove_untimed_seq = True,
                         remove_incomplete_seq = False,
                         seq_item_minutes_threshold = None, #input in terms of minutes here
                         seq_section_minutes_threshold = None, #input in terms of minutes here
                         seq_total_minutes_threshold = None, #input in terms of minutes here
                         qbank = True, #if qbank==false and section_map is not empty then sec_num_attempted is validated against given min_items_per_seq from secton_map
                         min_items_per_seq = None, #if qbank==true and section_map provided, sequence_num_attempted is validated against given min_items_per_seq
                         section_calc = True, #if section map provided but min_item_per_seq column not present in it & qbank == false then this gets activated for validating sec_perc_attempted vs section_resp_threshold(comes from section_map)
                         #test_map = test_map, #if section_map is empty this gets activated, and qbank==false then test_resp_threshold comes from test_map
                         #seq_item_resp_threshold = .75,
                         remove_unscored = False,
                         #seqHist_to_exclude = pd.DataFrame(), #should have column sequenceId in it
                         precombined_files = True, #used to add columns from cidf to response_df
                         #cidf = pd.DataFrame(), #pass cidf
                         #interaction_type_list = [], #pass interactionTypeIds in this list
                         #ci_cols_to_include = [], #pass columns names of cidf to include in response_df (ciname, ciid by default added if precombined files is True)
                         remove_repeat_test_administrations = False,
                        remove_seq_wo_dispseq = True)
    

Starting clean item data function at 2022-04-28 20:10:05
Total sequences at start:  60367
Total users at start:  3572
Unique items at start:  2714
Total responses at start:  1169771

Working on Disqualifiers :
User Removal
--------------------
Sequences with deleted names are removed: NONE

Sequences with dupe content items within the same exam, removed:  106
Users removed  1
Unique items removed  0
Responses removed  4855

Sequences with no KBS EID removed: NONE

Sequence Removal
--------------------
Sequences with deleted names are removed: NONE

Sequences with bad records (response = 0 with score = 1), removed: NONE

Sequences with bad timing (mSecUsed < 0), removed: NONE

Here are the new columns after joining all the test and section maps
{'test_num_ques', 'test_minutes_allowed', 'test_response_threshold'}

You are asking if there are too many questions for a qbank which is bad :(
Item Removal
--------------------
Total item responses:  1164916 

Working on Cleaning Rules :
Respon

In [27]:
#Extra check requested to have first attempt across multiple attempted sequences
#Since the focus is on items we do this explicitly
sub_sec = 'First attempted items'
keep_first_attempted_items = True
#Remove sequences for the users who practiced more than once
if(keep_first_attempted_items == True):
    seq_to_exclude_calc7 = result.copy()
    #seq_to_exclude_calc7 = seq_to_exclude_calc7[['studentId', 'contentItemName', 'dateCreated']].drop_duplicates()
    seq_to_exclude_calc7['item_attempt_order'] = seq_to_exclude_calc7.groupby(by=['studentId', 'contentItemName'])['dateCreated'].rank(method = 'first')
    item_to_exclude = seq_to_exclude_calc7[seq_to_exclude_calc7['item_attempt_order']>1]['contentItemName'].unique()
        
    if(len(item_to_exclude) > 0):
        #adding data to rejects df
        #temp_rej = respExcl[respExcl['sequenceId'].isin(seq_to_exclude_calc7)][['jasperUserId', 'kbsEnrollmentId', 'templateId']].drop_duplicates()
        #temp_rej['Reason'] = 'Remove sequences for the users who practiced more than once'
        #rejects_df = pd.concat([rejects_df, temp_rej], ignore_index = True)
        
        result = seq_to_exclude_calc7[seq_to_exclude_calc7['item_attempt_order']==1].copy()
        
    #num_seq_current, num_users_current, num_items_current, num_responses_current, cleaning_info = removed_record_count(respExcl, cleaning_info, sec, sub_sec, num_seq_current, num_users_current, num_items_current,
     #                                                                                                                      num_responses_current
      #                                                                              , things_to_say = 'Sequences that were not the first administration for the user, removed: ')


print('Remaining number of responses in final output: ', result.shape[0])
print('Remaining number of sequences in final output: ', result['activityId'].nunique())
print('Remaining number of users in final output: ', result['studentId'].nunique())
print('Remaining number of unique items in final output: ', result['contentItemName'].nunique())

Remaining number of responses in final output:  1036865
Remaining number of sequences in final output:  57252
Remaining number of users in final output:  3571
Remaining number of unique items in final output:  2714


## Filtering with given items list

In [28]:
result = pd.merge(result, comlex_l1, on = ['contentItemName'], how = 'inner')

## Exporting cleaning info & cleaned response_data

In [29]:
#Exporting cleaned response_data
result = result.sort_values(by=['studentId', 'activityId', 'displaySeq'], ignore_index= True)
result.to_csv(results_path+analysis_name+'_responseData.csv', index = False)

#Exporting cleaning info
cleaning_info.to_csv(results_path+analysis_name+'_cleaningInfo.csv')

#Exporting rejects info
rejects_df.sort_values(by=['studentId', 'templateId'], ignore_index = True).to_csv(results_path+analysis_name+'_rejects_info.csv', index = False)

In [16]:
result = pd.read_csv(results_path+analysis_name+'_responseData.csv')

# Creating User Level Matrices

In [20]:
result = result[~(result['studentId'].isin(null_users))]

In [21]:
#Reading cleaned data
all_resp = result.copy()

all_resp = all_resp.sort_values(by=['activityName', 'displaySeq'], ignore_index=True)

#col_name:filename
vars_for_matrix =  {'score':'Item_Scores',
                   'response':'Responses',
                   'mSecUsed':'Milliseconds_per_Item'
                   }

#making matrices
big_matrix = make_user_level_matrices(all_resp,
                               vars_for_matrices = vars_for_matrix,
                               destination_file_path = results_path,
                               destination_file_name_prefix = '_User_level_',
                                analysis_name = analysis_name,
                                omit_code = '.',
                                not_seen_code = '-99',
                                use_display_order = True,
                               qbank = True)

Finished creating matrix for Item_Scores
Finished creating matrix for Responses
Finished creating matrix for Milliseconds_per_Item


# Metadata

## Sequence Level Information

In [22]:
activity_level_info = make_activity_level_info(df = result,
                                              results_path = results_path,
                                              analysis_name = analysis_name)

In [23]:
#filtering for latest exam completed instance
idx = activity_level_info.groupby(by =['studentId'])['dateCompleted'].transform('max') == activity_level_info['dateCompleted']

activity_level_info = activity_level_info[idx]

activity_level_info.to_csv(results_path+analysis_name+'_activity_Level_Info.csv', index = False)

In [39]:
#null_users = activity_level_info[(activity_level_info['template_pPlus'].isnull()) | (activity_level_info['template_pPlus']==0)]['studentId'].unique()

In [32]:
#a = response_df[response_df['jasperUserId']==1503247724][['jasperUserId', 'kbsEnrollmentId', 'sequenceId', 'contentItemName', 'response', 'responseStatus', 'score', 'scored']]

In [33]:
#a[a['response'].notnull()].sort_values(by = ['contentItemName'])

Unnamed: 0,jasperUserId,kbsEnrollmentId,sequenceId,contentItemName,response,responseStatus,score,scored
441913,1503247724,1305066961,5f94d471-7248-4632-ad28-f991b325cebd,cq0262m,3.0,responded,0.0,1
294786,1503247724,1305066961,5f94d471-7248-4632-ad28-f991b325cebd,cq0269_10,1.0,responded,0.0,1
374278,1503247724,1305066961,5f94d471-7248-4632-ad28-f991b325cebd,cq0289_10,1.0,responded,0.0,1
442822,1503247724,1305066961,5f94d471-7248-4632-ad28-f991b325cebd,cq0808_10,3.0,responded,1.0,1


In [34]:
#result[result['studentId']==1503247724][['studentId', 'activityId', 'contentItemName', 'response', 'responseStatus', 'score', 'scored', 'attempted']].sort_values(by = ['contentItemName'])

Unnamed: 0,studentId,activityId,contentItemName,response,responseStatus,score,scored,attempted
674,1503247724,8ffc5f18-0d55-4102-a082-04fc9f6f9c7f,cq0026_10,,omitted,,1,False
671,1503247724,8ffc5f18-0d55-4102-a082-04fc9f6f9c7f,cq0027m_10,,omitted,,1,False
692,1503247724,aee28f65-76db-4491-b3e8-9c8a78f48240,cq0047m,,omitted,,1,False
688,1503247724,aee28f65-76db-4491-b3e8-9c8a78f48240,cq0049_10,,omitted,,1,False
695,1503247724,f5fb7512-394a-4676-bdcf-6e3037e1f7fd,cq0124m_10,,not-reached,,1,False
...,...,...,...,...,...,...,...,...
650,1503247724,320bc251-40c7-45cc-b0bd-7fc5ae86b8b9,cq1512,,not-reached,,1,False
652,1503247724,320bc251-40c7-45cc-b0bd-7fc5ae86b8b9,cq1513,,not-reached,,1,False
690,1503247724,aee28f65-76db-4491-b3e8-9c8a78f48240,cq1514,,omitted,,1,False
649,1503247724,320bc251-40c7-45cc-b0bd-7fc5ae86b8b9,cq1515,,not-reached,,1,False


In [48]:
#activity_level_info[activity_level_info['template_pPlus'].isnull()]

Unnamed: 0,studentId,activityId,dateCreated,dateCompleted,activityName,template_num_attempted,template_raw_correct,template_pTotal,template_pPlus


In [24]:
activity_level_info

Unnamed: 0,studentId,activityId,dateCreated,dateCompleted,activityName,template_num_attempted,template_raw_correct,template_pTotal,template_pPlus
14,41279031,6e900575-68b4-401b-b5ce-9e417805a455,2021-11-25 02:51:17,2021-11-25 03:03:09,usmle-s2-qbank-sequence-recipe,4,2.0,0.400000,0.500000
34,41529415,4951b235-c01c-4d95-8196-e6842f35683d,2022-01-30 23:33:32,2022-01-31 04:52:20,usmle-s2-qbank-sequence-recipe,26,10.0,0.250000,0.384615
198,41808239,3ee5bfe2-9b48-42e7-88cd-612d2eace382,2022-01-15 20:35:23,2022-01-15 21:55:00,usmle-s2-qbank-sequence-recipe,25,24.0,0.923077,0.960000
280,41810956,0ae962a3-8ff6-432c-8bc4-de542a0f49e1,2022-01-26 20:08:11,2022-01-27 21:06:35,usmle-s2-qbank-sequence-recipe,5,4.0,0.133333,0.800000
670,41817566,ea474abe-05c8-4c9a-ad0b-834b127d8ff1,2021-06-27 17:24:37,2021-06-27 18:03:52,usmle-s2-qbank-sequence-recipe,40,16.0,0.400000,0.400000
...,...,...,...,...,...,...,...,...,...
996730,1833763866,6bc05f3c-81e0-433f-902f-910ab051664f,2022-02-25 03:32:20,2022-02-25 03:47:29,usmle-s2-qbank-sequence-recipe,10,5.0,0.500000,0.500000
996777,1833777296,8386c558-22d3-4aef-96b9-7690ced16c91,2022-02-27 16:07:51,2022-02-27 16:08:56,usmle-s2-qbank-sequence-recipe,0,0.0,0.000000,
996790,1833779387,5e1a0fe1-5ee7-45b0-a754-cf56f947af5f,2022-02-28 19:36:03,2022-02-28 19:49:12,usmle-s2-qbank-sequence-recipe,10,3.0,0.300000,0.300000
996868,1833782532,fa308ae7-d57a-4d9e-811d-5c9e713579cd,2022-02-28 19:24:34,2022-02-28 19:36:01,usmle-s2-qbank-sequence-recipe,10,5.0,0.500000,0.500000


## User Level Information

In [25]:
user_info = make_user_level_info(df = result,
                                results_path = results_path,
                                analysis_name = analysis_name,
                                test_map = test_map,
                                qbank = True)

In [26]:
user_info

activityName,studentId,num_panel_tests_taken,total_panel,total_seen,total_att,total_correct,panel_ptotal,panel_pplus,incl_usmle-s2-qbank-sequence-recipe
0,41279031,1,5,19,18,10.0,0.526316,0.555556,1
1,41529415,1,15,168,147,63.0,0.375000,0.428571,1
2,41808239,1,40,77,76,52.0,0.675325,0.684211,1
3,41810956,1,10,396,387,172.0,0.434343,0.444444,1
4,41817566,1,10,40,40,16.0,0.400000,0.400000,1
...,...,...,...,...,...,...,...,...,...
3490,1833763866,1,15,9,9,5.0,0.555556,0.555556,1
3491,1833777296,1,17,1,0,0.0,0.000000,,1
3492,1833779387,1,40,8,8,2.0,0.250000,0.250000,1
3493,1833782532,1,20,87,86,41.0,0.471264,0.476744,1


In [19]:
null_users = user_info[user_info['total_seen']==0]['studentId'].unique()
pd.DataFrame(data = null_users, columns = ['studentId']).to_csv(results_path+analysis_name+'_allItems_notseen.csv', index = False)

## Item Level Information

In [38]:
item_level_info = make_item_level_info(df = result,
                                      content_df = content_df,
                                      results_path = results_path,
                                      analysis_name = analysis_name,
                                      corr_ans = cor_ans)

In [34]:
df = result.copy()
df = df[df['repeatOmitted']==False].copy()
#making a seen field for an item to make count_seen
df['itemSeen'] = df['responseStatus']!='not-reached'

cidf_summary = df.sort_values(by=['displaySeq']).groupby(by=['contentItemName', 'templateId'], as_index=False, dropna = False).agg(displaySeq = ('displaySeq', lambda x: ', '.join(x.drop_duplicates().astype(str))),
                                                                                                                                  count_att = ('attempted', 'sum'),
                                                                                                                                                   count_seen = ('itemSeen', 'sum'),
                                                                                                                                                   num_correct = ('score', 'sum'),
                                                                                                                                                   first_date = ('dateCreated', 'min'),
                                                                                                                                                   last_date = ('dateCreated', 'max'))
        

cidf_summary = pd.merge(cidf_summary, content_df, how = 'left')
        
cidf_summary = cidf_summary[['contentItemName',
                             'templateId',
                            'displaySeq',
                             'count_att',
                             'count_seen',
                             'num_correct',
                             'first_date',
                             'last_date',
                             'interactiontypename',
                             'countchoices',
                             'correctAnswer']].drop_duplicates().sort_values(by=['displaySeq'], ignore_index = True)
        
cidf_summary = pd.merge(cidf_summary.drop(columns = ['correctAnswer', 'displaySeq']), cor_ans, on = ['contentItemName'])
cidf_summary.to_csv(results_path+analysis_name+'_Content_Item_Info.csv', index = False)

In [27]:
#Testing
df = result.copy()
df = df[df['repeatOmitted']==False].copy()
#making a seen field for an item to make count_seen
df['itemSeen'] = df['responseStatus']!='not-reached'

cidf_summary = df.sort_values(by=['displaySeq']).groupby(by=['contentItemName', 'templateId'], as_index=False, dropna = False).agg(count_att = ('attempted', 'sum'),
                                                                                                                                                   count_seen = ('itemSeen', 'sum'),
                                                                                                                                                   num_correct = ('score', 'sum'),
                                                                                                                                                   first_date = ('dateCreated', 'min'),
                                                                                                                                                   last_date = ('dateCreated', 'max'))

cidf_summary = pd.merge(cidf_summary, cor_ans, how = 'left', on = ['contentItemName'])

#for count choices
cidf_summary = pd.merge(cidf_summary, content_df[['contentItemId', 'contentItemName', 'countchoices']], on = ['contentItemId', 'contentItemName'], how = 'inner')

#re-arranging columns
cidf_summary = cidf_summary[['contentItemId', 'contentItemName', 'templateId', 'displaySeq', 'count_seen', 'count_att', 'num_correct', 'correctAnswer', 'countchoices', 'first_date', 'last_date']]

cidf_summary.to_csv(results_path+analysis_name+'_Content_Item_Info.csv', index = False)

In [28]:
cidf_summary

Unnamed: 0,contentItemId,contentItemName,templateId,displaySeq,count_seen,count_att,num_correct,correctAnswer,countchoices,first_date,last_date
0,277b0971-45a5-4772-a6fc-afd96b82445f,imq07m,,23,274,252,183.0,3.0,5,2021-01-01 18:38:30,2022-02-25 17:23:24
1,ff341176-d7ec-4eb1-8020-4bbbfd93016e,imq100,,33,415,386,236.0,3.0,5,2021-01-02 20:05:55,2022-03-01 01:25:12
2,1153e3dd-9b39-47dc-9da5-7c7635bd769a,imq110m,,9,279,255,169.0,5.0,5,2021-01-04 15:17:04,2022-02-28 15:42:03
3,0a258cfb-84a8-4771-b347-9bb5dbb1ec2a,imq123m,,23,371,348,222.0,4.0,5,2021-01-03 14:45:00,2022-02-26 01:10:14
4,fae8b03a-ae4a-440e-a4d5-ca24dcf0d7c8,imq133m,,12,298,271,147.0,5.0,5,2020-11-19 18:42:49,2022-02-28 00:37:17
...,...,...,...,...,...,...,...,...,...,...,...
2091,8e962327-f18c-4adb-a895-fceccb645a31,s2s381,,7,514,490,71.0,7.0,7,2021-01-04 04:21:58,2022-02-26 21:52:47
2092,9da1bec6-d847-4d3c-bfe6-31f7e2ea6d12,s2s382,,27,509,463,71.0,3.0,5,2021-01-02 18:43:23,2022-03-01 20:25:51
2093,d5760dfd-8c73-4f01-996d-8700ce986f0e,s2s383m,,4,572,529,492.0,3.0,5,2020-12-18 19:14:58,2022-03-01 20:11:27
2094,f79ba989-bcf4-4b95-a487-d6704effff9d,s2s384,,25,402,374,88.0,3.0,5,2021-01-01 22:11:55,2022-03-01 22:40:30


# Making ncounts file for items

In [33]:
raw_ft_items_counts = pd.merge(response_df[['contentItemName', 'jasperUserId']].drop_duplicates(), matched_items, on=['contentItemName'], how = 'right', indicator=True).groupby(by = ['contentItemName'], as_index = False, dropna=False).agg(initial_count = ('jasperUserId', 'nunique'))

raw_ft_items_counts = pd.merge(raw_ft_items_counts, cidf_summary[['contentItemName', 'count_att']].drop_duplicates(), on = ['contentItemName'], how = 'left')
raw_ft_items_counts.rename(columns = {'count_att':'after_cleaning'}, inplace = True)

raw_ft_items_counts.to_csv(results_path + program + '_raw_FT_items_Counts.csv', index = False)

In [34]:
raw_ft_items_counts

Unnamed: 0,contentItemName,initial_count,after_cleaning
0,imq07m,289,252
1,imq100,433,386
2,imq110m,295,255
3,imq123m,389,348
4,imq133m,317,271
...,...,...,...
2091,s2s381,535,490
2092,s2s382,535,463
2093,s2s383m,602,529
2094,s2s384,421,374
