In [1]:
import numpy as np
import pandas as pd

from new_scrubbing import *

In [2]:
# Label of this analysis run; it is passed as analysis_name to the helper functions
# and used as the file-name prefix of the Content_Item_Info CSV written at the end.
analysis_name = 'Nursing Admission Test B'

# Output directory. Absolute, machine-specific Windows path; it must end with a path
# separator because file names are appended by plain string concatenation.
results_path = 'c:\\Users\\VImmadisetty\\Downloads\\Projects\\DRCR Automation\\New DRCR\\knight\\Results\\'

# Input directory holding activity_info.tsv, response_data.tsv and content_info.tsv
# (same trailing-separator requirement as results_path).
data_path = 'c:\\Users\\VImmadisetty\\Downloads\\Projects\\DRCR Automation\\New DRCR\\knight\\Data\\'

# Reading the data

In [3]:
# Activity export: the two timestamp columns are parsed as datetimes and exact
# duplicate rows are dropped (the index is renumbered afterwards).
activity_df = pd.read_csv(
    data_path + 'activity_info.tsv',
    sep='\t',
    parse_dates=['timestamp_created', 'timestamp_completed'],
)
activity_df = activity_df.drop_duplicates(ignore_index=True)

# Response export, read as-is.
response_df = pd.read_csv(data_path + 'response_data.tsv', sep='\t')

# Content export with last_modified parsed as a datetime.
content_df = pd.read_csv(
    data_path + 'content_info.tsv',
    sep='\t',
    parse_dates=['last_modified'],
)

# Interaction types whose correct answers and responses are passed through
# resp_cleaning further down (see the sort_choices flag).
sort_item_types = [
    'bowtie',
    'cloze-drag-drop',
    'cloze-dropdown',
    'cloze-dropdown-rationale',
    'cloze-dropdown-table',
    'drag-drop-rationale',
    'matrix-multiple-select',
    'multi-select',
    'multiple-response-grouping',
    'multiple-select',
    'multiple-select-n',
    'multiple-select-sata',
    'hot-text',
    'hot-text-table',
]


In [4]:
# Sanity check of the three raw exports: frame shapes and distinct-id counts.

# Activity-level export.
print('Shape of activity_df :', activity_df.shape)
print('Total Students : ', activity_df['student_id'].nunique())
print('Total Activities : ', activity_df['activity_id'].nunique())
print('Total Enrollments : ', activity_df['enrollment_id'].nunique())
print('Template Id : ', activity_df['template_id'].unique())

print('-'*50)

# Response-level export (still includes the rows that are filtered out in the next section).
print('Shape of response_df :', response_df.shape)
print('Total Students : ', response_df['student_id'].nunique())
print('Total Activities : ', response_df['activity_id'].nunique())
print('Total Content Items : ', response_df['content_item_id'].nunique())

print('-'*50)

# Content export; the last line reports whether a correct_answer column is present,
# which the content preparation further down relies on.
print('Shape of content_df :', content_df.shape)
print('Total Content Item Ids : ', content_df['content_item_id'].nunique())
print('Total Content Item Names : ', content_df['content_item_name'].nunique())
print('Is Correct_answer column available? ', 'correct_answer' in content_df.columns)
#print('Data available in correct answer columns :', content_df['correct_answer'].nunique())

Shape of activity_df : (1124, 25)
Total Students :  942
Total Activities :  1124
Total Enrollments :  1124
Template Id :  [28761]
--------------------------------------------------
Shape of response_df : (119144, 21)
Total Students :  942
Total Activities :  1124
Total Content Items :  106
--------------------------------------------------
Shape of content_df : (105, 16)
Total Content Item Ids :  105
Total Content Item Names :  105
Is Correct_answer column available?  True


# Data handling

In [5]:
# Content item names to exclude from the analysis. Judging by the names these look
# like intro / tutorial / end-of-test screens rather than questions (TODO confirm).
unwanted_items = [
    'endOfTestNclex',
    'sectionIntroNurseMath',
    'sectionIntroNurseReading',
    'sectionIntroNurseScience',
    'sectionIntroNurseWriting',
    'testIntroNclexAdmissions',
    'tutorialAnswer',
    'tutorialBugetTime',
    'tutorialEndingTest',
    'tutorialExit',
    'tutorialMoving',
    'tutorialPractice',
    'tutorialReadWrite',
    'tutorialTestTiming91',
    'tutoriallnstruction',
]

# Drop the excluded rows from response_df. Names are compared exactly as exported
# (case-sensitive), i.e. before any lower-casing is applied.
response_df = response_df.loc[~response_df['content_item_name'].isin(unwanted_items)].copy()

In [6]:
# Switch response_df to the camelCase column names used from here on, then
#   * duplicate sectionTitle into a new sectionName column, and
#   * lower-case contentItemName while leaving missing names missing
#     (a plain astype('str') would turn them into the text 'nan').
response_df = response_df.rename(
    columns={
        'history_db_id': 'historiesDb',
        'student_id': 'studentId',
        'activity_id': 'activityId',
        'item_position': 'position',
        'section_title': 'sectionTitle',
        'item_section_position': 'displaySeq',
        'content_item_id': 'contentItemId',
        'content_item_name': 'contentItemName',
        'interaction_type': 'interactionType',
        'milliseconds_used': 'mSecUsed',
        'is_scored': 'scored',
        # 'scored_response': 'score',   # alternative score column, currently disabled
        'item_score': 'score',
        'raw_response': 'response',
        'field_test': 'fieldTest',
        'item_status': 'responseStatus',
    }
).assign(
    sectionName=lambda frame: frame['sectionTitle'],
    contentItemName=lambda frame: np.where(
        pd.isnull(frame['contentItemName']),
        frame['contentItemName'],
        frame['contentItemName'].astype('str').str.lower(),
    ),
)

In [7]:
# Bring activity_df onto the same camelCase naming as response_df.
activity_df = activity_df.rename(
    columns={
        'history_db_id': 'historiesDb',
        'student_id': 'studentId',
        'enrollment_id': 'kbsEnrollmentId',
        'activity_id': 'activityId',
        'template_id': 'templateId',
        'template_name': 'activityName',
        'sequence_title': 'sequenceTitle',
        'timestamp_created': 'dateCreated',
        'timestamp_completed': 'dateCompleted',
        'tutor_mode': 'tutorMode',
        'status': 'sequenceStatus',
    }
)

# Attach the activity-level columns to every response row. The inner join keeps only
# responses whose (studentId, activityId, historiesDb) triple exists in activity_df;
# source_system is removed from both sides before joining.
print('Response_df size before merging with activity_df :', response_df.shape)
# Column set of response_df before the merge (not referenced again in the visible cells).
initi_cols = response_df.columns
response_df = response_df.drop(columns=['source_system']).merge(
    activity_df.drop(columns=['source_system']),
    on=['studentId', 'activityId', 'historiesDb'],
    how='inner',
)

print('Response_df size after merging with activity_df:', response_df.shape)

Response_df size before merging with activity_df : (102284, 22)
Response_df size after merging with activity_df: (102284, 42)


In [8]:
# kbsEnrollmentId is converted to string because (per the original author's note) the
# cleaning function compares it as a string when removing zero enrollment ids.
# Missing ids are therefore filled with 0 first, then the column is cast to str.
#
# The filled column is assigned back explicitly. The previous form,
# `response_df['kbsEnrollmentId'].fillna(..., inplace=True)`, is a chained in-place call on
# a column selection: it triggers a FutureWarning in pandas 2.x and does nothing under
# copy-on-write, which would leave missing ids to be cast to the text 'nan'.
response_df['kbsEnrollmentId'] = response_df['kbsEnrollmentId'].fillna(0).astype('str')

In [9]:
#clearing content_df for unwanted items
content_df = content_df[~content_df['content_item_name'].isin(unwanted_items)].copy()

content_df = content_df[['source_system', 'content_item_id', 'content_item_name', 'content_item_type', 'count_choices', 'correct_answer', 'max_points', 'last_modified', 'interaction_type_name']]
content_df['content_item_name'] = np.where(pd.isnull(content_df['content_item_name']), content_df['content_item_name'], content_df['content_item_name'].astype('str').str.lower())

interaction_types = content_df[['content_item_name', 'interaction_type_name']].dropna().drop_duplicates(ignore_index=True)
interaction_types = interaction_types.groupby(by=['content_item_name'], as_index = False).agg(interactionTypeName=('interaction_type_name', lambda x : ', '.join(x)))

#flag to indicate wheter to sort choices
interaction_types['sort_choices'] = interaction_types['interactionTypeName'].str.split(', ').apply(lambda x: len(set(x).intersection(sort_item_types)))

content_df = pd.merge(content_df, interaction_types, on = 'content_item_name', how = 'left')
#we should re-order correct answer for the same interaction types that we are doing in response_df else there'll be mismatch
#content_df['interaction-type-tag'] = content_df['interaction-type-tag'].str.lower()
content_df.loc[content_df['sort_choices']>0, 'correct_answer'] = content_df[content_df['sort_choices']>0]['correct_answer'].apply(lambda x: resp_cleaning(x) if not pd.isna(x) else x)
content_df.loc[content_df['sort_choices']==0, 'correct_answer'] = content_df[content_df['sort_choices']==0]['correct_answer'].apply(lambda x: x.strip() if not pd.isna(x) else x)


content_df['content_item_name'] = np.where(pd.isnull(content_df['content_item_name']), content_df['content_item_name'], content_df['content_item_name'].astype('str').str.lower())

In [10]:
# Rename the content columns to the names used after the merge. Keys that are not
# present in content_df are simply ignored by rename.
content_df = content_df.rename(
    columns={
        'content_item_id': 'contentItemId',
        'content_item_name': 'contentItemName',
        'interaction-type-tag': 'interactiontypename',
        'count_choices': 'countchoices',
        'parent_item_id': 'parentid',
        'parent_item_name': 'parentname',
        'correct_answer': 'correctAnswer',
    }
)

# Lower-case the join key (missing names stay missing) so it matches response_df.
content_df['contentItemName'] = np.where(
    content_df['contentItemName'].isnull(),
    content_df['contentItemName'],
    content_df['contentItemName'].astype('str').str.lower(),
)

# Attach the content columns to every response row, joining on the item name only.
# contentItemId is dropped from the content side so response_df keeps its own id column.
resp_bef_size = len(response_df)
print('shape of response_df before merging with content_df :', response_df.shape)

response_df = response_df.merge(
    content_df.drop(columns=['contentItemId']),
    on=['contentItemName'],
    how='inner',
)
resp_aft_size = len(response_df)

print('shape of response_df after merging with content_df :', response_df.shape)

# A changed row count means the inner join dropped unmatched responses and/or duplicated
# rows through repeated item names in content_df; this is reported, not raised.
if resp_aft_size != resp_bef_size:
    print('Size of response df is not same after merging with content df')

shape of response_df before merging with content_df : (102284, 42)
shape of response_df after merging with content_df : (102284, 51)


In [11]:
# Pass the raw response of every item flagged with sort_choices > 0 through
# resp_cleaning, mirroring what was done to the correct answers; missing responses are
# left untouched.
response_df.loc[response_df['sort_choices'] > 0, 'response'] = response_df.loc[
    response_df['sort_choices'] > 0, 'response'
].apply(lambda raw: raw if pd.isna(raw) else resp_cleaning(raw))

# Whitespace stripping for the remaining items is currently disabled:
#response_df.loc[response_df['sort_choices']==0, 'response'] = response_df[response_df['sort_choices']==0]['response'].apply(lambda x: x.strip() if not pd.isna(x) else x)

In [12]:
# Summary of response_df used to build the test map.
# Step 1: for each completed activity of a student, count the distinct items answered.
# Step 2: per template / activity name, report the min, median and max of those counts
#         and how many activity-student rows contributed (num_users).
response_summary = (
    response_df.loc[response_df['sequenceStatus'].str.lower() == 'completed']
    .groupby(
        ['templateId', 'activityName', 'activityId', 'studentId'],
        as_index=False,
        dropna=False,
    )
    .agg(num_responses=('contentItemName', 'nunique'))
    .groupby(['templateId', 'activityName'], dropna=False, as_index=False)
    .agg(
        min_resp=('num_responses', 'min'),
        median_resp=('num_responses', 'median'),
        max_resp=('num_responses', 'max'),
        num_users=('num_responses', 'count'),
    )
)

response_summary

Unnamed: 0,templateId,activityName,min_resp,median_resp,max_resp,num_users
0,28761,nclexAdmissionsTestB,91,91.0,91,1120


In [13]:
# Test map handed to the cleaning and user-level helpers: one row per activity name
# found in response_summary.
#   templateId, sectionName, minutesAllowed -> left as NaN
#   numQues                                 -> fixed at 91 (given as a one-element list,
#                                              so it only fits a single-row summary)
#   responseThreshold                       -> 0.75
test_map = pd.DataFrame(
    {
        'templateId': np.nan,
        'jasperSequenceName': response_summary['activityName'],
        'sectionName': np.nan,
        'numQues': [91],
        'responseThreshold': 0.75,
        'minutesAllowed': np.nan,
    }
)

test_map

Unnamed: 0,templateId,jasperSequenceName,sectionName,numQues,responseThreshold,minutesAllowed
0,,nclexAdmissionsTestB,,91,0.75,


# Loading free-trial, online-companion and repeat enrollments (pre-exported CSV files)

In [14]:
# Enrollment lists consumed by the cleaning function. The files are read from the
# current working directory, not from data_path.
frt_enrols, olc_enrols, repeaters = (
    pd.read_csv(csv_file)
    for csv_file in ('frt_enrols.csv', 'olc_enrols.csv', 'repeaters.csv')
)

# Cleaning the data

## Running cleaning function

In [15]:
# Bundle of auxiliary tables passed to clean_item_data as its data_pool argument.
data_pool = dict(
    frt_enrols=frt_enrols,
    olc_enrols=olc_enrols,
    repeaters=repeaters,
    test_map=test_map,
)

In [16]:
# Run the cleaning helper (brought in by the star import from new_scrubbing) on the merged
# response table. It returns three objects; of these only `result` is used by the visible
# cells that follow. Every removal flag is switched on here.
# NOTE(review): what each flag removes is inferred from the flag names and the log the
# call prints (duplicate content items, deleted sequences, bad timing, too many questions,
# free-trial / online-companion enrollments, repeat administrations) - confirm against
# new_scrubbing.clean_item_data before changing any of them.
CI_old_keys, correct_answer, result = clean_item_data(data_path = data_path,
                    results_path = results_path,
                    analysis_name = analysis_name,
                    resp = response_df,
                    data_pool = data_pool,
                    remove_dup_CIs = True,
                    remove_deleted_sequences = True,
                    remove_impo_response_scored = True,
                    remove_impo_timing_seq = True,
                    remove_seq_w_tmq = True,
                    remove_frt_users = True,
                    remove_olc_users = True,
                    remove_hsg_enrolls = True,
                    remove_repeat_test_administrations = True)

Starting clean item data function at : 2022-12-13 19:08:11
Total sequences at start: 1124
Total users at start: 942
Unique items at start: 91
Total responses at start: 102284

Sequences with dupe content items within the same exam, removed: NONE

Sequences with enrollments in "Free Trial" product are removed: NONE

Sequences with enrollments in "Online Companion" product are removed: NONE

Sequences with repeated enrollments are removed: NONE

Sequences with dupe content items within the same exam, removed: NONE

Sequence Removal
--------------------
Sequences with deleted names are removed:  4
Users removed  3
Unique items removed  0
Responses removed  364

Sequences with bad timing (mSecUsed < 0), removed: NONE

Sequences with too many questions in a section, removed: NONE

Sequences that were not the first administration for the user, removed:  181
Users removed  0
Unique items removed  0
Responses removed  16471

Sequences with items fewer than [75.]% of items attempted in a sequen

# Creating User Level Matrices

In [17]:
# Work on a copy of the cleaned responses, ordered by activity name and display position.
all_resp = result.copy().sort_values(by=['activityName', 'displaySeq'])

# Response column -> name used for the corresponding matrix file.
vars_for_matrix = dict(
    score='Item_Scores',
    response='Responses',
    mSecUsed='Milliseconds_per_Item',
)

# Build the user-level matrices, one per entry of vars_for_matrix. Omitted items are
# coded '.' and items not seen '-99'.
big_matrix = make_user_level_matrices(
    all_resp,
    vars_for_matrices=vars_for_matrix,
    destination_file_path=results_path,
    destination_file_name_prefix='_User_level_',
    analysis_name=analysis_name,
    omit_code='.',
    not_seen_code='-99',
    use_display_order=False,
    qbank=False,
    # optional explicit item ordering, currently disabled:
    #item_order_list = all_resp[['contentItemName', 'Item Status']].drop_duplicates().sort_values(by=['Item Status'])['contentItemName']
)

Finished creating matrix for Item_Scores
Finished creating matrix for Responses
Finished creating matrix for Milliseconds_per_Item


# Metadata

## Sequence Level Information

In [18]:
# Per-activity summary of the cleaned responses, produced by the project helper;
# results_path and analysis_name are handed over for its output.
activity_level_info = make_activity_level_info(
    df=all_resp,
    results_path=results_path,
    analysis_name=analysis_name,
)

In [19]:
# Show the per-activity table (one row per studentId / activityId in the saved output).
activity_level_info

Unnamed: 0,studentId,activityId,dateCreated,dateCompleted,activityName,template_num_attempted,template_raw_correct,template_pTotal,template_pPlus
1,1832782498,49177403,2021-02-11 11:38:33,2021-02-11 12:59:25,nclexAdmissionsTestB,91,79.0,0.868132,0.868132
2,1830346707,49178891,2021-02-11 14:06:17,2021-02-11 16:33:07,nclexAdmissionsTestB,91,77.0,0.846154,0.846154
3,1831647047,49212605,2021-02-15 11:33:54,2021-02-15 14:15:35,nclexAdmissionsTestB,91,67.0,0.736264,0.736264
4,1832781745,49225293,2021-02-16 12:04:06,2021-02-16 13:32:11,nclexAdmissionsTestB,91,79.0,0.868132,0.868132
5,1832785542,49226917,2021-02-16 14:09:45,2021-02-16 15:46:13,nclexAdmissionsTestB,91,54.0,0.593407,0.593407
...,...,...,...,...,...,...,...,...,...
1112,1834259946,53347667,2022-09-14 17:03:11,2022-09-14 18:32:32,nclexAdmissionsTestB,91,60.0,0.659341,0.659341
1114,1834297016,53351153,2022-09-15 12:36:26,2022-09-15 14:01:41,nclexAdmissionsTestB,91,81.0,0.890110,0.890110
1115,1834293366,53351157,2022-09-15 12:37:04,2022-09-15 14:07:02,nclexAdmissionsTestB,91,59.0,0.648352,0.648352
1116,1834312703,53351191,2022-09-15 12:42:28,2022-09-15 15:10:52,nclexAdmissionsTestB,89,52.0,0.571429,0.584270


## User Level Information

In [20]:
# Per-student summary of the cleaned responses, produced by the project helper; the
# test map is passed along in addition to the output location and analysis label.
user_info = make_user_level_info(
    df=all_resp,
    results_path=results_path,
    analysis_name=analysis_name,
    test_map=test_map,
)

In [21]:
# Show the per-student table returned by make_user_level_info.
user_info

activityName,studentId,num_panel_tests_taken,total_panel,total_seen,total_att,total_correct,panel_ptotal,panel_pplus,incl_nclexAdmissionsTestB
0,1528916701,1,91,91,91,75.0,0.824176,0.824176,1
1,1826333934,1,91,91,91,66.0,0.725275,0.725275,1
2,1826668064,1,91,91,91,73.0,0.802198,0.802198,1
3,1826729563,1,91,91,91,48.0,0.527473,0.527473,1
4,1827027680,1,91,91,91,63.0,0.692308,0.692308,1
...,...,...,...,...,...,...,...,...,...
932,1834312703,1,91,91,89,52.0,0.571429,0.584270,1
933,1834313297,1,91,91,91,72.0,0.791209,0.791209,1
934,1834314918,1,91,91,91,74.0,0.813187,0.813187,1
935,1834315844,1,91,91,91,51.0,0.560440,0.560440,1


## Item Level Information

In [26]:
# Item-level summary built from the cleaned responses, after dropping exact duplicate rows.
df = result.drop_duplicates().copy()
# An item counts as seen unless its response status is 'not-reached'; summed below
# to obtain count_seen.
df['itemSeen'] = df['responseStatus'].ne('not-reached')

# One row per item name: the distinct display positions (joined as text, in ascending
# order thanks to the preceding sort), attempt / seen / correct totals, and the first
# and last activity creation dates.
cidf_summary = (
    df.sort_values(by=['displaySeq'])
    .groupby(by=['contentItemName'], as_index=False, dropna=False)
    .agg(
        displaySeq=('displaySeq', lambda seq: ', '.join(seq.drop_duplicates().astype(str))),
        count_att=('attempted', 'sum'),
        count_seen=('itemSeen', 'sum'),
        num_correct=('score', 'sum'),
        first_date=('dateCreated', 'min'),
        last_date=('dateCreated', 'max'),
    )
)

# Rank each item's rows by creation date, newest first; ties are broken by row order,
# so exactly one row per item receives rank 1.
df['latestItem'] = df.groupby(['contentItemName'])['dateCreated'].rank(method='first', ascending=False)

# Descriptive item columns are taken from that most recent row.
itemMetaData = df.loc[
    df['latestItem'] == 1,
    ['contentItemName', 'interactionTypeName', 'activityName', 'answerKey', 'countchoices'],
].copy()

# Combine counts and descriptive columns, then fix the column order of the report.
cidf_summary = cidf_summary.merge(itemMetaData, on=['contentItemName'], how='inner')[
    [
        'contentItemName',
        'activityName',
        'interactionTypeName',
        'displaySeq',
        'countchoices',
        'count_seen',
        'count_att',
        'num_correct',
        'answerKey',
        'first_date',
        'last_date',
    ]
]

# Write the report next to the other results.
cidf_summary.to_csv(results_path + analysis_name + '_Content_Item_Info.csv', index=False)

In [28]:
# Show the item-level report that was just written to <analysis_name>_Content_Item_Info.csv.
cidf_summary

Unnamed: 0,contentItemName,activityName,interactionTypeName,displaySeq,countchoices,count_seen,count_att,num_correct,answerKey,first_date,last_date
0,n028297.01,nclexAdmissionsTestB,singleAnswerMultipleChoice,1.0,5.0,937,933,573.0,5.0,2019-09-27 11:34:55,2022-09-15 17:09:04
1,n028297.02,nclexAdmissionsTestB,singleAnswerMultipleChoice,2.0,5.0,937,937,819.0,3.0,2019-09-27 11:34:55,2022-09-15 17:09:04
2,n028297.03,nclexAdmissionsTestB,singleAnswerMultipleChoice,3.0,5.0,937,937,430.0,5.0,2019-09-27 11:34:55,2022-09-15 17:09:04
3,n028297.04,nclexAdmissionsTestB,singleAnswerMultipleChoice,4.0,5.0,937,937,629.0,4.0,2019-09-27 11:34:55,2022-09-15 17:09:04
4,n028297.05,nclexAdmissionsTestB,singleAnswerMultipleChoice,5.0,5.0,937,937,814.0,3.0,2019-09-27 11:34:55,2022-09-15 17:09:04
...,...,...,...,...,...,...,...,...,...,...,...
86,n031830.02,nclexAdmissionsTestB,singleAnswerMultipleChoice,17.0,4.0,937,937,849.0,4.0,2019-09-27 11:34:55,2022-09-15 17:09:04
87,n031830.03,nclexAdmissionsTestB,singleAnswerMultipleChoice,18.0,4.0,937,937,577.0,2.0,2019-09-27 11:34:55,2022-09-15 17:09:04
88,n031831.01,nclexAdmissionsTestB,singleAnswerMultipleChoice,19.0,4.0,937,937,662.0,4.0,2019-09-27 11:34:55,2022-09-15 17:09:04
89,n031831.02,nclexAdmissionsTestB,singleAnswerMultipleChoice,20.0,4.0,937,937,414.0,1.0,2019-09-27 11:34:55,2022-09-15 17:09:04
