In [1]:
import numpy as np
import pandas as pd

from knight.salvador import db_con,describe_dupe_cor_ans, get_item_cor_ans, recode_as_omitted, timing_exclusion, remove_repeat_questions, combine_CIinfo, removed_record_count, clean_item_data
from knight.salvador import make_user_level_matrices, make_item_level_info, make_activity_level_info, make_user_level_info

from knight.salvador import color, teleg_msg, merge_size, dt_counts, ch_dtype, resp_cleaning

In [2]:
results_path = 'C:\\Users\\VImmadisetty\\Downloads\\DATA\\DRCR\\Practical Nursing (PN)\\Results\\Psychosocial\\'

data_path = 'C:\\Users\\VImmadisetty\\Downloads\\DATA\\DRCR\\Practical Nursing (PN)\\Data\\Psychosocial\\'

analysis_name = 'Psychosocial B'

# Reading the data

In [3]:
activity_df = pd.read_csv(data_path + 'activity_info.tsv', sep='\t', parse_dates=['timestamp_created', 'timestamp_completed']).drop_duplicates(ignore_index=True)
response_df = pd.read_csv(data_path + 'response_data.tsv', sep='\t')
content_df = pd.read_csv(data_path + 'content_info.tsv', sep='\t', parse_dates = ['last_modified'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
print('Shape of activity_df :', activity_df.shape)
print('Total Students : ', activity_df['student_id'].nunique())
print('Total Activities : ', activity_df['activity_id'].nunique())
print('Total Enrollments : ', activity_df['enrollment_id'].nunique())
print('Template Id : ', activity_df['template_id'].unique())

print('-'*50)

print('Shape of response_df :', response_df.shape)
print('Total Students : ', response_df['student_id'].nunique())
print('Total Activities : ', response_df['activity_id'].nunique())
print('Total Content Items : ', response_df['content_item_id'].nunique())

print('-'*50)

print('Shape of content_df :', content_df.shape)
print('Total Content Item Ids : ', content_df['content_item_id'].nunique())
print('Total Content Item Names : ', content_df['content_item_name'].nunique())
print('Is Correct_answer column available? ', 'correct_answer' in content_df.columns)
#print('Data available in correct answer columns :', content_df['correct_answer'].nunique())

Shape of activity_df : (2272, 25)
Total Students :  2272
Total Activities :  2272
Total Enrollments :  2272
Template Id :  [  nan 4370.]
--------------------------------------------------
Shape of response_df : (159064, 20)
Total Students :  2272
Total Activities :  2273
Total Content Items :  259
--------------------------------------------------
Shape of content_df : (119, 45)
Total Content Item Ids :  119
Total Content Item Names :  76
Is Correct_answer column available?  True


# Data handling

In [5]:
response_df.rename(columns={'history_db_id': 'historiesDb',
                            'student_id' : 'jasperUserId',
                            'activity_id' : 'sequenceId',
                            'item_position': 'position',
                            'section_title' : 'sectionTitle',
                            'item_section_position' : 'displaySeq',
                            'content_item_id' : 'contentItemId',
                            'content_item_name' : 'contentItemName',
                            'interaction_type': 'interactionType',
                            'milliseconds_used' : 'mSecUsed',
                            'is_scored' : 'scored',
                            'scored_response' : 'score',
                            'raw_response' : 'response',
                            'field_test' : 'fieldTest',
                           'item_status': 'responseStatus'}, inplace = True)
#response_df['correctAnswer'] = np.nan
response_df['sectionName'] = response_df['sectionTitle']


In [6]:
#userids are int64 in activity df and string in response df so we are removing student ids that are actual strings
#from response df and merging with activity df
numers = [x for x in response_df['jasperUserId'].unique() if (type(x) == int or type(x) == float or type(x) == np.int64 or type(x) == np.float64 or x.isnumeric()) and not pd.isna(x)]

response_df = response_df[response_df['jasperUserId'].isin(numers)].copy()

In [7]:
#Below to avoid issues while merging with activity_df
#below will change the string type of ints to ints
response_df['jasperUserId'] = response_df['jasperUserId'].apply(lambda x: ch_dtype(x))
response_df['sequenceId'] = response_df['sequenceId'].apply(lambda x: ch_dtype(x))
response_df['historiesDb'] = response_df['historiesDb'].apply(lambda x: ch_dtype(x))

In [8]:
activity_df.rename(columns={'history_db_id': 'historiesDb',
                            'student_id' : 'jasperUserId',
                            'enrollment_id' : 'kbsEnrollmentId',
                            'activity_id' : 'sequenceId',
                            'template_id' : 'templateId',
                            'template_name' : 'sequenceName',
                            'sequence_title' : 'sequenceTitle',
                            'timestamp_created' : 'dateCreated',
                            'timestamp_completed' : 'dateCompleted',
                           'tutor_mode' : 'tutorMode',
                           'status':'sequenceStatus'}, inplace = True)

activity_df['jasperUserId'] = activity_df['jasperUserId'].apply(lambda x: ch_dtype(x))
activity_df['sequenceId'] = activity_df['sequenceId'].apply(lambda x: ch_dtype(x))
activity_df['historiesDb'] = activity_df['historiesDb'].apply(lambda x: ch_dtype(x))

#merging responses and activity data and changing the names of statuses
print('Response_df size before merging with activity_df :', response_df.shape)
initi_cols = response_df.columns
response_df = pd.merge(response_df.drop(columns=['source_system']), activity_df.drop(columns=['source_system']), on = ['jasperUserId', 'sequenceId', 'historiesDb'], how = 'inner')

print('Response_df size after merging with activity_df:', response_df.shape)
print('\nNew columns added from activity_df with activity_df:\n', set(response_df)^set(initi_cols))

Response_df size before merging with activity_df : (159064, 21)
Response_df size after merging with activity_df: (159064, 41)

New columns added from activity_df with activity_df:
 {'program', 'total_scored_items', 'total_items_remediated', 'activity_type', 'source_system', 'total_items_remediation_time', 'sequenceStatus', 'sequenceName', 'total_scored_items_time_elapsed', 'total_scored_items_answered_correct', 'alternate_timing', 'total_scored_items_omitted', 'dateCompleted', 'kbsEnrollmentId', 'activity_title', 'dateCreated', 'product_code', 'tutorMode', 'total_scored_items_not_reached', 'templateId', 'total_scored_items_answered', 'product_name'}


In [9]:
#changing kbs_enrollment_id to string as in cleaning function to remove 0 kbseids we are comparing as string
response_df['kbsEnrollmentId'].fillna(value = 0, inplace = True)
response_df['kbsEnrollmentId'] = response_df['kbsEnrollmentId'].astype('str')

* **Reading ft items data**

In [10]:
ft_items = pd.read_csv('C://Users//VImmadisetty//Downloads//DATA//DRCR/Practical Nursing (PN)\\PN_FT_items.csv')

ft_items = ft_items[[analysis_name]]

ft_items = ft_items[ft_items[analysis_name].notnull()]

In [11]:
ft_items

Unnamed: 0,Psychosocial B
0,n022267
1,n022268
2,n022269
3,n022270
4,n022271
...,...
65,n022344
66,n022345
67,n022346
68,n022347


In [12]:
content_df = content_df[['source_system', 'content_item_id', 'content_item_name', 'content_item_type', 'interaction_type_name', 'count_choices', 'correct_answer', 'last_modified']].copy()

#creating item rank based on last modified to get latest item's item_id
content_df['item_rank'] = content_df.groupby(['content_item_name'])['last_modified'].rank(method = 'first', ascending=False)

#extracting count_choices and correct_answer on item_name if they are same at all in each group
#fortunately they are all same
item_metadata = content_df.groupby(['content_item_name'], as_index = False).agg(count_choices = ('count_choices', lambda x: list(np.unique(x))[0]),
                                                                correct_answer = ('correct_answer', lambda x: list(np.unique(x))[0]))

#merging with meta data
content_df = pd.merge(content_df[content_df['item_rank']==1].drop(columns=['count_choices', 'correct_answer', 'item_rank']), item_metadata, on = ['content_item_name'])

#marking item as ft
#content_df = pd.merge(content_df, ft_items, how = 'left', on = ['content_item_name'])
#content_df['itemStatus'].fillna(value = 'Standard Item', inplace = True)

#content_df['itemStatusRank'] = content_df['itemStatus'].replace(itemStatusRank)

In [13]:
content_df.rename(columns = {'content_item_id' : 'contentItemId',
                            'content_item_name' : 'contentItemName',
                            'interaction_type_name' : 'interactiontypename',
                            'count_choices' : 'countchoices',
                            'parent_item_id' : 'parentid',
                            'parent_item_name' : 'parentname',
                            'correct_answer':'correctAnswer'}, inplace = True)

#category name vary based on problem, so check the data
#merging with response_df
resp_bef_size = response_df.shape[0]
print('shape of response_df before merging with content_df :', response_df.shape)

response_df = pd.merge(response_df, content_df.drop(columns = ['contentItemId']),
                       how = 'inner',
                      on = ['contentItemName'])
resp_aft_size = response_df.shape[0]

print('shape of response_df after merging with content_df :', response_df.shape)

if (resp_bef_size != resp_aft_size):
    print(color.BOLD + 'Size of response df is not same after merging with content df' + color.END)

shape of response_df before merging with content_df : (159064, 41)
shape of response_df after merging with content_df : (159064, 47)


In [14]:
#cleaning and sorting raw_response based on interaction-type
response_df['interactionType'] = response_df['interactionType'].str.lower()
response_df.loc[~(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in'])), 'response'] = response_df[~(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in']))]['response'].apply(lambda x: resp_cleaning(x) if not pd.isna(x) else x)

response_df.loc[(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in'])), 'response'] = response_df[(response_df['interactionType'].isin(['order-interaction', 'order-match', 'text-entry-interaction', 'numerical fill- in']))]['response'].apply(lambda x: x.strip() if not pd.isna(x) else x)

In [15]:
#renaming sequence names, as they are inconsistent
response_df.loc[response_df['sequenceName']== 'pn-psychosocial-b', 'sequenceName'] = 'PN Psychosocial B'


In [16]:
#summary to from response df to make test map
response_summary = response_df[response_df['sequenceStatus'].str.lower() == 'completed'].groupby(['templateId', 'sequenceName', 'jasperUserId'], as_index = False, dropna = False).agg(num_responses = ('contentItemName', 'nunique'))\
.groupby(['templateId', 'sequenceName'], dropna = False, as_index = False).agg(min_resp = ('num_responses', 'min'),
                                            median_resp = ('num_responses', 'median'),
                                            max_resp = ('num_responses', 'max'),
                                            num_users = ('num_responses', 'count'))

response_summary

Unnamed: 0,templateId,sequenceName,min_resp,median_resp,max_resp,num_users
0,4370.0,PN Psychosocial B,70,70,70,2193
1,,PN Psychosocial B,70,70,76,79


In [17]:
#use activity_title col from response_df to know jasperSequenceName

test_map = pd.DataFrame()
test_map['templateId'] = [np.nan]
test_map['jasperSequenceName'] = ['PN Psychosocial B']

test_map['sectionName'] = [np.nan]

test_map['numQues'] = [70]
test_map['responseThreshold'] = 0.75
test_map['minutesAllowed'] = [84]

test_map

Unnamed: 0,templateId,jasperSequenceName,sectionName,numQues,responseThreshold,minutesAllowed
0,,PN Psychosocial B,,70,0.75,84


# Connecting to Db for Repeats enrolls

In [34]:
try:
    engine
except:
    engine = db_con()

query_frt = """select
    distinct ph.id kbsenrollmentid
from
    kbs_billing.purchase_history ph
    join bi_reporting.vw_product_detail prd on ph.product_id = prd.product_id
where
    lower(prd.product_subtype) in ('free trial');
    """

try:
    frt_enrols
except:
    frt_enrols = pd.read_sql(sql = query_frt, con = engine)

query_olc = """select
    distinct ph.id kbsenrollmentid
from
    kbs_billing.purchase_history ph
    join bi_reporting.vw_product_detail prd on ph.product_id = prd.product_id
where
    lower(prd.product_subtype) in ('online companion')
    ;"""

try:
    olc_enrols
except:
    olc_enrols = pd.read_sql(sql = query_olc, con = engine)

query_hsg_repeat = """select
    distinct ph.id kbsenrollmentid
from
    kbs_billing.purchase_history ph
    join bi_reporting.vw_product_detail prd on ph.product_id = prd.product_id
where
    ph.initial_delta_k_txn_code in ('404', '405', '406');
    --and ph.created_on >= '2018-01-01';
"""

try:
    repeaters
except:
    repeaters = pd.read_sql(sql = query_hsg_repeat, con = engine)
    repeaters['kbsenrollmentid'] = repeaters['kbsenrollmentid'].astype('str')

Enter db username : ········
Enter db password : ········
Enter database : redshiftapps


# Cleaning the data

In [18]:
#If we know the correct answers for all the items in response_df and correct answer column in response_df is not null
#then we can use no_correctAnswer = False
#and by default we work on contentItemId to get multiple correct answers
#we get correct answers df here, we need to filter items that are having multiple versions of items

cor_ans, dup_cor_ans, dup_ca_cnt = get_item_cor_ans(response_df, no_correctAnswer = True, use_contentItemName=True)

  contentItemName response  score_count                min_date  \
0         n022331  1,2,3,4            6 2016-03-30 14:19:27.500   
1         n022331  2,4,1,3          721 2015-10-05 10:16:56.953   

                 max_date  
0 2016-09-15 13:49:23.570  
1 2021-09-28 22:07:29.048  
[1mDuplicate correct answers for some items[0m


In [19]:
cor_ans = cor_ans[['contentItemName', 'correctAnswer']].drop_duplicates(ignore_index = True)
cor_ans['flag'] = 1
cor_ans.loc[(cor_ans['contentItemName']=='n022331') & (cor_ans['correctAnswer']=='1,2,3,4'), 'flag'] = 0
cor_ans = cor_ans[cor_ans['flag']==1][['contentItemName', 'correctAnswer']].drop_duplicates(ignore_index = True)
cor_ans.rename(columns = {'correctAnswer' : 'corans_cor'}, inplace = True)

In [36]:
CI_old_keys = pd.DataFrame()
CI_old_keys['contentItemId'] = ['n022331']
CI_old_keys['correctAnswer'] = ['1,2,3,4']
CI_old_keys

Unnamed: 0,contentItemId,correctAnswer
0,n022331,1234


## Running cleaning function

In [37]:
data_pool = {'CI_old_version_dates' : pd.DataFrame(),
'CI_old_version_list' : pd.DataFrame(),
'CI_old_keys' :  CI_old_keys,
'frt_enrols' : frt_enrols,
'olc_enrols' : olc_enrols,
'repeaters' : repeaters,
'section_map' : pd.DataFrame(),
'test_map' : test_map,
'seqHist_to_exclude' :  pd.DataFrame(),
'cidf' : pd.DataFrame(),
'field_test_items' : pd.DataFrame(),
'ci_cols_to_include' : pd.DataFrame(),
'interaction_type_list' : pd.DataFrame()}

#field_test_items should have column name'contentItemName' to match with response_df

In [38]:
result, cleaning_info, rejects_df = clean_item_data(data_path = data_path,
                         results_path = results_path,
                        analysis_name = analysis_name,
                         resp = response_df,
                         remove_users_deleted_sequences = True,
                         remove_dup_CIs = True,
                         remove_no_kbsEID = True,
                         remove_deleted_sequences = True,
                         remove_impo_response_scored = True,
                         remove_impo_timing_seq = True,
                         remove_seq_w_tmq = True,
                         remove_staged_responses = False,
                         remove_FT_items = False,
                        data_pool = data_pool,
                         CI_remove_before_after = 'before', #applicable for old version dates
                         repeat_treatment = 'omit', #default omit
                         mSec_min_threshold = 5000, #there's y/n condition for timing if none provided
                         mSec_max_threshold = None, #so if condition is not needed.
                         sec_min_threshold = None,
                         sec_max_threshold = None,
                         remove_frt_users = True,
                         remove_olc_users = True,
                         remove_repeat_enrolls = True,
                         remove_tutor = True,
                         remove_ada_seq = True,
                         remove_untimed_seq = True,
                         remove_incomplete_seq = False,
                         seq_item_minutes_threshold = None, #input in terms of minutes here
                         seq_section_minutes_threshold = None, #input in terms of minutes here
                         seq_total_minutes_threshold = 108, #input in terms of minutes here
                         qbank = False, #if qbank==false and section_map is not empty then sec_num_attempted is validated against given min_items_per_seq from secton_map
                         min_itmes_per_seq = None, #if qbank==true and section_map provided, sequence_num_attempted is validated against given min_items_per_seq
                         section_calc = True, #if section map provided but min_item_per_seq column not present in it & qbank == false then this gets activated for validating sec_perc_attempted vs section_resp_threshold(comes from section_map)
                         #test_map = test_map, #if section_map is empty this gets activated, and qbank==false then test_resp_threshold comes from test_map
                         #seq_item_resp_threshold = .75,
                         remove_unscored = False,
                         #seqHist_to_exclude = pd.DataFrame(), #should have column sequenceId in it
                         precombined_files = True, #used to add columns from cidf to response_df
                         #cidf = pd.DataFrame(), #pass cidf
                         #interaction_type_list = [], #pass interactionTypeIds in this list
                         #ci_cols_to_include = [], #pass columns names of cidf to include in response_df (ciname, ciid by default added if precombined files is True)
                         remove_repeat_test_administrations = True,
                        remove_seq_wo_dispseq = True)
    

Starting clean item data function at 2022-04-01 20:17:27
Total sequences at start:  2272
Total users at start:  2272
Unique items at start:  76
Total responses at start:  159064

Working on Disqualifiers :
User Removal
--------------------
Sequences with deleted names are removed: NONE

Sequences with dupe content items within the same exam, removed: NONE

Sequences with no KBS EID removed: NONE

Sequence Removal
--------------------
Sequences with deleted names are removed: NONE

Sequences with bad records (response = 0 with score = 1), removed: NONE

Sequences with bad timing (mSecUsed < 0), removed: NONE

Here are the new columns after joining all the test and section maps
{'test_response_threshold', 'test_num_ques', 'test_minutes_allowed'}

Sequences with too many questions in a section, removed:  4
Users removed  4
Unique items removed  6
Responses removed  304

Item Removal
--------------------
Total item responses:  158760 

Working on Cleaning Rules :
Response / Score Re-coding

In [39]:
result

Unnamed: 0,historiesDb,studentId,activityId,position,sectionTitle,displaySeq,contentItemId,contentItemName,interactionType,mSecUsed,...,attempted,orig_response,orig_score,repeatOmitted,template_num_omitted,template_num_attempted,template_perc_attempted,template_raw_correct,template_pTotal,template_pPlus
0,300,56913984,328754963,1,,1,19923,n022319,"multiple-choice,single-best-answer",15000.0,...,True,4,1.0,False,0,70,1.0,51.0,0.728571,0.728571
1,300,1527701209,328803546,33,,33,19923,n022319,"multiple-choice,single-best-answer",39000.0,...,True,4,1.0,False,0,70,1.0,39.0,0.557143,0.557143
2,300,56396126,328804456,52,,52,19923,n022319,"multiple-choice,single-best-answer",22000.0,...,True,4,1.0,False,0,70,1.0,43.0,0.614286,0.614286
3,300,1529454011,328836168,70,,70,19923,n022319,"multiple-choice,single-best-answer",98000.0,...,True,4,1.0,False,0,70,1.0,36.0,0.514286,0.514286
4,300,1528584062,328847245,1,,1,19923,n022319,"multiple-choice,single-best-answer",60000.0,...,True,4,1.0,False,0,70,1.0,43.0,0.614286,0.614286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159035,200,1831602188,e998ec11-14c2-4f09-849d-7400dcd3710d,62,section1,62,aa5c6a8a-a69e-4f83-8849-1736b3d390b7,n022326,choice-interaction,,...,True,2,1.0,False,0,70,1.0,37.0,0.528571,0.528571
159036,200,1830312934,eb876a13-13ba-42d0-8635-c569c7a7d00f,45,section1,45,aa5c6a8a-a69e-4f83-8849-1736b3d390b7,n022326,choice-interaction,,...,True,2,1.0,False,0,70,1.0,45.0,0.642857,0.642857
159037,200,1832250818,ede1a194-59cf-4bec-8324-975b03a943d7,49,section1,49,aa5c6a8a-a69e-4f83-8849-1736b3d390b7,n022326,choice-interaction,,...,True,2,1.0,False,0,70,1.0,44.0,0.628571,0.628571
159038,200,1832483064,ee7cacf8-67bc-49c6-8e09-8a8c7e413136,70,section1,70,aa5c6a8a-a69e-4f83-8849-1736b3d390b7,n022326,choice-interaction,,...,True,3,0.0,False,0,70,1.0,42.0,0.600000,0.600000


## Exporting cleaning info & cleaned response_data

In [20]:
#Exporting cleaned response_data
result = result.sort_values(by=['studentId', 'activityId', 'displaySeq'], ignore_index= True)
result.to_csv(results_path+analysis_name+'_responseData.csv', index = False)

#Exporting cleaning info
cleaning_info.to_csv(results_path+analysis_name+'_cleaningInfo.csv')

#Exporting rejects info
rejects_df.sort_values(by=['studentId', 'templateId'], ignore_index = True).to_csv(results_path+analysis_name+'_rejects_info.csv', index = False)

NameError: name 'result' is not defined

In [21]:
result = pd.read_csv(results_path+analysis_name+'_responseData.csv', parse_dates=['dateCreated', 'dateCompleted'])


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Creating User Level Matrices

In [41]:
#Reading cleaned data
all_resp = result.copy()
#all_resp.rename(columns={'userByForm':'studentId', 'itemByForm':'contentItemName'}, inplace = True)

all_resp = all_resp.sort_values(by=['activityName', 'displaySeq'])

#col_name:filename
vars_for_matrix =  {'score':'Item_Scores',
                   'response':'Responses',
                   'mSecUsed':'Milliseconds_per_Item'}

#making matrices
big_matrix = make_user_level_matrices(all_resp,
                               vars_for_matrices = vars_for_matrix,
                               destination_file_path = results_path,
                               destination_file_name_prefix = '_User_level_',
                                analysis_name = analysis_name,
                                omit_code = '.',
                                not_seen_code = '-99',
                                use_display_order = False,
                               qbank = False,
                               # item_order_list = all_resp[['contentItemName', 'itemStatusRank']].drop_duplicates().sort_values(by=['itemStatusRank'])['contentItemName']
                                     )

Finished creating matrix for Item_Scores
Finished creating matrix for Responses
Finished creating matrix for Milliseconds_per_Item


# Metadata

## Sequence Level Information

In [42]:
activity_level_info = make_activity_level_info(df = all_resp,
                                              results_path = results_path,
                                              analysis_name = analysis_name)

In [43]:
activity_level_info

Unnamed: 0,studentId,activityId,dateCreated,dateCompleted,activityName,template_num_attempted,template_raw_correct,template_pTotal,template_pPlus
0,56396126,328804456,2015-10-05 10:16:56.953,2015-10-05 11:00:23.897,PN Psychosocial B,70,43.0,0.614286,0.614286
70,56453699,335362866,2017-04-20 16:23:46.957,2017-04-20 17:21:04.783,PN Psychosocial B,70,43.0,0.614286,0.614286
140,56722365,331834560,2016-05-31 11:10:32.003,2016-05-31 13:40:48.357,PN Psychosocial B,69,36.0,0.514286,0.521739
210,56735292,335047991,2017-03-28 09:53:39.303,2017-03-28 10:53:09.017,PN Psychosocial B,70,46.0,0.657143,0.657143
280,56746309,335081960,2017-03-30 18:00:40.820,2017-04-19 11:16:41.643,PN Psychosocial B,70,47.0,0.671429,0.671429
...,...,...,...,...,...,...,...,...,...
155750,1832783081,79dd6c9e-42cf-48f6-b473-7deb7e549042,2021-06-14 19:01:38.623,2021-06-14 19:33:48.722,PN Psychosocial B,70,48.0,0.685714,0.685714
155820,1832783089,48bbe90f-6cb3-4fe3-b79f-48a63048a038,2021-06-14 18:03:59.340,2021-06-14 18:39:40.112,PN Psychosocial B,70,44.0,0.628571,0.628571
155890,1832783095,90c717a0-b70a-4fee-8a33-ae6fdf1b4c6c,2021-06-15 14:28:37.298,2021-06-15 15:26:45.991,PN Psychosocial B,70,41.0,0.585714,0.585714
155960,1833070787,ac2d1aad-99bc-4f6c-a8ad-7a65bc0d6474,2021-06-14 18:04:29.535,2021-06-14 18:52:03.130,PN Psychosocial B,70,43.0,0.614286,0.614286


## User Level Information

In [44]:
user_info = make_user_level_info(df = all_resp,
                                results_path = results_path,
                                analysis_name = analysis_name,
                                test_map = test_map)

In [45]:
user_info

activityName,studentId,num_panel_tests_taken,total_panel,total_seen,total_att,total_correct,panel_ptotal,panel_pplus,incl_PN Psychosocial B
0,56396126,1,70,70,70,43.0,0.614286,0.614286,1
1,56453699,1,70,70,70,43.0,0.614286,0.614286,1
2,56722365,1,70,70,69,36.0,0.514286,0.521739,1
3,56735292,1,70,70,70,46.0,0.657143,0.657143,1
4,56746309,1,70,70,70,47.0,0.671429,0.671429,1
...,...,...,...,...,...,...,...,...,...
2225,1832783081,1,70,70,70,48.0,0.685714,0.685714,1
2226,1832783089,1,70,70,70,44.0,0.628571,0.628571,1
2227,1832783095,1,70,70,70,41.0,0.585714,0.585714,1
2228,1833070787,1,70,70,70,43.0,0.614286,0.614286,1


## Item Level Information

In [22]:
all_resp = result.copy()

In [24]:
df = all_resp.copy()
df = df[df['repeatOmitted']==False].copy()
#making a seen field for an item to make count_seen
df['itemSeen'] = df['responseStatus']!='not-reached'

cidf_summary = df.sort_values(by=['displaySeq']).groupby(by=['contentItemName'], as_index=False, dropna = False).agg(displaySeq = ('displaySeq', lambda x: ', '.join(x.drop_duplicates().astype(str))),
                                                                                                                                  count_att = ('attempted', 'sum'),
                                                                                                                                                   count_seen = ('itemSeen', 'sum'),
                                                                                                                                                   num_correct = ('score', 'sum'),
                                                                                                                                                   first_date = ('dateCreated', 'min'),
                                                                                                                                                   last_date = ('dateCreated', 'max'))
        
df['latestItem'] = df.groupby(['contentItemName'])['dateCreated'].rank(method = 'first', ascending = False)

itemMetaData = df[df['latestItem']==1][['contentItemName','interactionType',
                             'countchoices',
                             'correctAnswer']].copy()

cidf_summary = pd.merge(cidf_summary, itemMetaData, on = ['contentItemName'], how = 'inner')

#cidf_summary['activityName'] = analysis_name + ' ' + cidf_summary['itemFormStatus']

#removing form tag in itemname as per request from winona
cidf_summary['contentItemName'] = cidf_summary['contentItemName'].apply(lambda x: x.split('_')[0])

#cidf_summary.to_csv(results_path+analysis_name+'_Content_Item_Info.csv', index = False)

In [25]:
a = pd.merge(cidf_summary, cor_ans, on = ['contentItemName'], how = 'left')

a[a['correctAnswer']!=a['corans_cor']]

Unnamed: 0,contentItemName,displaySeq,count_att,count_seen,num_correct,first_date,last_date,interactionType,countchoices,correctAnswer,corans_cor
53,n022331,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...",2230,2230,726.0,2015-09-30 12:33:45.933,2021-10-12 14:33:02.903,order-interaction,4,3142,2413


# Making ncounts file for items

In [50]:
raw_ft_items_counts = pd.merge(response_df[['contentItemName', 'jasperUserId']].drop_duplicates(), ft_items, left_on='contentItemName', right_on=analysis_name, how = 'right', indicator=True).groupby(by = [analysis_name], as_index = False, dropna=False).agg(initial_count = ('jasperUserId', 'nunique'))
raw_ft_items_counts.rename(columns = {analysis_name : 'contentItemName'}, inplace = True)

raw_ft_items_counts = pd.merge(raw_ft_items_counts, cidf_summary[['contentItemName', 'count_att']].drop_duplicates(), on = ['contentItemName'], how = 'left')
raw_ft_items_counts.rename(columns = {'count_att':'after_cleaning'}, inplace = True)

raw_ft_items_counts.to_csv(results_path + analysis_name + '_raw_FT_items_Counts.csv', index = False)

In [51]:
raw_ft_items_counts

Unnamed: 0,contentItemName,initial_count,after_cleaning
0,n022267,2272,2227
1,n022268,2272,2226
2,n022269,2272,2224
3,n022270,2272,2230
4,n022271,2272,2217
...,...,...,...
65,n022344,2272,2226
66,n022345,2272,2227
67,n022346,2272,2226
68,n022347,2272,2226
