In [None]:
import json
import jsonlines
import pandas as pd
import dateutil
from dateutil.parser import isoparse
from DirectoryGenerator import DirectoryGenerator
from DataReader import readJSONL
from datetime import datetime
import numpy as np

In [None]:
# Shared DirectoryGenerator instance; convertToDataFrame calls its getDelimiter()
# to split a file path into components.
dirGen = DirectoryGenerator()

In [None]:
def convertToStringId(idNum):
    """Return the string form of ``idNum`` prefixed with ``'id_'``."""
    return 'id_{}'.format(str(idNum))

In [None]:
def convertToDateTime(date, time):
    """Combine a date string and a time string into a ``datetime``.

    ``date`` must look like ``YYYY-MM-DD`` and ``time`` like ``HH-MM-SS``
    (dash-separated); the two are joined with a single space before parsing.
    """
    stampFormat = '%Y-%m-%d %H-%M-%S'
    stampText = " ".join((date, time))
    return datetime.strptime(stampText, stampFormat)

In [None]:
def convertToDataFrame(canvasFile):
    """Load a JSONL file into a DataFrame and stamp it with its collection time.

    The collection timestamp is derived from the path itself: the second-to-last
    path component is used as the date and the last component (up to its first
    '.') as the time. The same timestamp is written to every row in the
    'collected_at' column.
    """
    records = readJSONL(canvasFile)
    frame = pd.DataFrame.from_dict(records)

    # Path components are separated by whatever delimiter dirGen reports.
    dateComponent = canvasFile.split(dirGen.getDelimiter())[-2]
    fileComponent = canvasFile.split(dirGen.getDelimiter())[-1]
    timeComponent = fileComponent.split('.')[0]

    frame['collected_at'] = convertToDateTime(dateComponent, timeComponent)
    return frame

In [None]:
def userTypeEventNameAnalysis(canvasFile):
    """Count events per event date, context role and event name.

    Returns a DataFrame with the three grouping columns plus 'total', the
    number of 'collected_at' values in each group.
    """
    def toCalendarDate(moment):
        return moment.date()

    events = convertToDataFrame(canvasFile)
    events['metadata_event_time'] = events['metadata_event_time'].apply(isoparse)
    events['metadata_event_time_date'] = events['metadata_event_time'].apply(toCalendarDate)

    groupKeys = ['metadata_event_time_date', 'metadata_context_role', 'metadata_event_name']
    return (
        events.groupby(groupKeys)['collected_at']
        .count()
        .to_frame(name = 'total')
        .reset_index()
    )

In [None]:
def loggedInCountAnalysis(canvasFile):
    """Count 'logged_in' events per event date.

    Returns a DataFrame with columns 'metadata_event_time_date' and 'total'.
    """
    def toCalendarDate(moment):
        return moment.date()

    events = convertToDataFrame(canvasFile)
    events['metadata_event_time'] = events['metadata_event_time'].apply(isoparse)
    events['metadata_event_time_date'] = events['metadata_event_time'].apply(toCalendarDate)

    # Keep only login events before aggregating.
    isLogin = events['metadata_event_name'] == "logged_in"
    logins = events.loc[isLogin]

    perDay = logins.groupby(['metadata_event_time_date'])['collected_at'].count()
    return perDay.to_frame(name = 'total').reset_index()

In [None]:
def loggedInCountPerUserAnalysis(canvasFile):
    """Count 'logged_in' events per event date and user.

    Returns a DataFrame with columns 'metadata_event_time_date',
    'metadata_user_id' (rewritten through convertToStringId) and 'total'.
    """
    def toCalendarDate(moment):
        return moment.date()

    events = convertToDataFrame(canvasFile)
    events['metadata_event_time'] = events['metadata_event_time'].apply(isoparse)
    events['metadata_event_time_date'] = events['metadata_event_time'].apply(toCalendarDate)

    isLogin = events['metadata_event_name'] == "logged_in"
    logins = events.loc[isLogin]

    groupKeys = ['metadata_event_time_date', 'metadata_user_id']
    perUser = (
        logins.groupby(groupKeys)['collected_at']
        .count()
        .to_frame(name = 'total')
        .reset_index()
    )
    perUser['metadata_user_id'] = perUser['metadata_user_id'].apply(convertToStringId)
    return perUser

In [None]:
def assetCategoryPerContextAnalysis(canvasFile):
    """Count 'asset_accessed' events per date, course context and asset category.

    Returns a DataFrame with columns 'metadata_event_time_date',
    'metadata_context_id' (rewritten through convertToStringId),
    'body_category' and 'total', restricted to rows whose
    'metadata_context_type' is 'Course'. If any required column is missing
    from the file (KeyError), an empty DataFrame is returned instead.
    """
    try:
        df = convertToDataFrame(canvasFile)
        df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)
        df['metadata_event_time_date'] = df['metadata_event_time'].apply(lambda dt: dt.date())
        dfAssets = df.loc[df['metadata_event_name'] == "asset_accessed"]
        dfAgg = dfAssets.groupby(['metadata_event_time_date', 'metadata_context_id', 'metadata_context_type', 'body_category'])['collected_at'].count()
        result = dfAgg.to_frame(name = 'total').reset_index()
        # Filter and drop in one non-inplace chain: the original dropped the
        # column in place on a .loc slice and then assigned into that slice,
        # which raises SettingWithCopyWarning in pandas.
        result = result.loc[result['metadata_context_type'] == 'Course'].drop(columns=['metadata_context_type'])
        result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
        return result
    except KeyError:
        # Best-effort: files lacking the expected columns yield no rows.
        return pd.DataFrame()

In [None]:
def userRoleListPerDatetime(canvasFile):
    """List the distinct (collected_at, user, role) combinations in a file.

    The combinations come from a groupby over the three columns; the helper
    count column is discarded and 'metadata_user_id' is rewritten through
    convertToStringId.
    """
    events = convertToDataFrame(canvasFile)
    groupKeys = ['collected_at', 'metadata_user_id', 'metadata_context_role']

    counted = events.groupby(groupKeys)['collected_at'].count()
    # Only the group keys are wanted, so the count column is dropped again.
    userRoles = counted.to_frame(name = 'total').reset_index().drop(columns=['total'])

    userRoles['metadata_user_id'] = userRoles['metadata_user_id'].apply(convertToStringId)
    return userRoles

In [None]:
def conversationNetworkAnalysis(canvasFile):
    """Count 'conversation_message_created' events per date, user, author and conversation.

    Returns a DataFrame with the four grouping columns plus 'total'. The ids
    are rewritten as strings: 'metadata_user_id' through convertToStringId,
    'body_author_id' with an 'auth_' prefix and 'body_conversation_id' with a
    'conv_' prefix. An empty DataFrame is returned when a required column is
    missing (KeyError).
    """
    def withPrefix(prefix):
        # Build a mapper that prepends `prefix` to the string form of an id.
        return lambda nodeId: prefix + str(nodeId)

    try:
        events = convertToDataFrame(canvasFile)
        events['metadata_event_time'] = events['metadata_event_time'].apply(isoparse)
        events['metadata_event_time_date'] = events['metadata_event_time'].apply(lambda stamp: stamp.date())

        isMessage = events['metadata_event_name'] == 'conversation_message_created'
        messages = events.loc[isMessage]

        groupKeys = ['metadata_event_time_date', 'metadata_user_id', 'body_author_id', 'body_conversation_id']
        edges = (
            messages.groupby(groupKeys)['collected_at']
            .count()
            .to_frame(name = 'total')
            .reset_index()
        )
        edges['body_author_id'] = edges['body_author_id'].apply(withPrefix("auth_"))
        edges['body_conversation_id'] = edges['body_conversation_id'].apply(withPrefix("conv_"))
        edges['metadata_user_id'] = edges['metadata_user_id'].apply(convertToStringId)
        return edges
    except KeyError:
        return pd.DataFrame()

In [None]:
def courseDiscussionUserEntriesAnalysis(canvasFile):
    """Count 'discussion_entry_created' events in Course contexts.

    Groups by event date, context id, context role, user id, discussion topic
    id and body user id, and returns those columns plus 'total'. Ids are
    rewritten as strings: 'metadata_context_id' and 'metadata_user_id' through
    convertToStringId, 'body_discussion_topic_id' with a 'topic_' prefix and
    'body_user_id' with a 'user_' prefix.

    Raises KeyError if any of the referenced columns is missing from the file.
    """
    df = convertToDataFrame(canvasFile)
    df = df.loc[df['metadata_event_name'] == "discussion_entry_created"]
    # .copy() detaches the filtered rows so the column assignments below write
    # to an independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['metadata_context_type'] == 'Course'].copy()
    df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)
    df['metadata_event_time_date'] = df['metadata_event_time'].apply(lambda dt: dt.date())
    dfAgg = df.groupby(['metadata_event_time_date', 'metadata_context_id', 'metadata_context_role', 'metadata_user_id', "body_discussion_topic_id", "body_user_id"])['collected_at'].count()
    result = dfAgg.to_frame(name = 'total').reset_index()
    result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
    result['metadata_user_id'] = result['metadata_user_id'].apply(convertToStringId)
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(lambda nodeId: "topic_" + str(nodeId))
    result['body_user_id'] = result['body_user_id'].apply(lambda nodeId: "user_" + str(nodeId))
    return result

In [None]:
def courseDiscussionUserEntriesWithRepliesAnalysis(canvasFile):
    """Return one row per 'discussion_entry_created' event in Course contexts.

    The result always has the fixed column set listed in ``columns``; columns
    absent from the file are added filled with NaN. Missing
    'body_parent_discussion_entry_id' values are replaced with 0 before the id
    columns are rewritten as prefixed strings ('id_', 'topic_', 'user_',
    'entry_').

    Raises KeyError if 'metadata_event_name', 'metadata_context_type' or
    'metadata_event_time' is missing from the file.
    """
    df = convertToDataFrame(canvasFile)
    df = df.loc[df['metadata_event_name'] == "discussion_entry_created"]
    # .copy() detaches the filtered rows so the column assignments below write
    # to an independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['metadata_context_type'] == 'Course'].copy()
    df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)

    columns = ['collected_at', 'metadata_event_time', 'metadata_event_name', 'metadata_context_id', 'metadata_context_role', 'metadata_user_id', "body_assignment_id", "body_discussion_topic_id", "body_discussion_entry_id", "body_submission_id", "body_user_id", "body_parent_discussion_entry_id", "body_text"]
    # Guarantee a stable schema even when the file lacks optional fields.
    for column in columns:
        if column not in df.columns:
            df[column] = np.nan

    df = df[columns]
    # Entries without a parent get parent id 0, i.e. 'entry_0' after prefixing.
    df = df.fillna(value={"body_parent_discussion_entry_id": 0})

    result = df
    result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
    result['metadata_user_id'] = result['metadata_user_id'].apply(convertToStringId)
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(lambda nodeId: "topic_" + str(nodeId))
    result['body_user_id'] = result['body_user_id'].apply(lambda nodeId: "user_" + str(nodeId))
    result["body_discussion_entry_id"] = result["body_discussion_entry_id"].apply(lambda nodeId: "entry_" + str(nodeId))
    result["body_parent_discussion_entry_id"] = result["body_parent_discussion_entry_id"].apply(lambda nodeId: "entry_" + str(nodeId))
    return result

In [None]:
def courseGradedDiscussionUserEntriesAnalysis(canvasFile):
    """Count 'discussion_entry_submitted' events in Course contexts.

    Groups by event date, full event timestamp, context id, context role,
    user id, discussion topic id and body user id, and returns those columns
    plus 'total'. Ids are rewritten as strings: 'metadata_context_id' and
    'metadata_user_id' through convertToStringId, 'body_discussion_topic_id'
    with a 'topic_' prefix and 'body_user_id' with a 'user_' prefix.

    Raises KeyError if any of the referenced columns is missing from the file.
    """
    df = convertToDataFrame(canvasFile)
    df = df.loc[df['metadata_event_name'] == "discussion_entry_submitted"]
    # .copy() detaches the filtered rows so the column assignments below write
    # to an independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['metadata_context_type'] == 'Course'].copy()
    df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)
    df['metadata_event_time_date'] = df['metadata_event_time'].apply(lambda dt: dt.date())
    dfAgg = df.groupby(['metadata_event_time_date', 'metadata_event_time', 'metadata_context_id', 'metadata_context_role', 'metadata_user_id', "body_discussion_topic_id", "body_user_id"])['collected_at'].count()
    result = dfAgg.to_frame(name = 'total').reset_index()
    result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
    result['metadata_user_id'] = result['metadata_user_id'].apply(convertToStringId)
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(lambda nodeId: "topic_" + str(nodeId))
    result['body_user_id'] = result['body_user_id'].apply(lambda nodeId: "user_" + str(nodeId))
    return result

In [None]:
def courseGradedDiscussionUserEntriesWithRepliesAnalysis(canvasFile):
    """Return one row per 'discussion_entry_submitted' event in Course contexts.

    The result always has the fixed column set listed in ``columns``; columns
    absent from the file are added filled with NaN. Missing parent-entry,
    assignment and submission ids are replaced with 0 before the id columns
    are rewritten as prefixed strings ('id_', 'topic_', 'user_', 'entry_',
    'assignment_', 'submission_').

    Raises KeyError if 'metadata_event_name', 'metadata_context_type' or
    'metadata_event_time' is missing from the file.
    """
    df = convertToDataFrame(canvasFile)
    df = df.loc[df['metadata_event_name'] == "discussion_entry_submitted"]
    # .copy() detaches the filtered rows so the column assignments below write
    # to an independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['metadata_context_type'] == 'Course'].copy()
    df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)

    columns = ['collected_at', 'metadata_event_time', 'metadata_event_name', 'metadata_context_id', 'metadata_context_role', 'metadata_user_id', "body_assignment_id", "body_discussion_topic_id", "body_discussion_entry_id", "body_submission_id", "body_user_id", "body_parent_discussion_entry_id", "body_text"]
    # Guarantee a stable schema even when the file lacks optional fields.
    for column in columns:
        if column not in df.columns:
            df[column] = np.nan

    df = df[columns]
    # Missing ids become 0, i.e. 'entry_0' / 'assignment_0' / 'submission_0' after prefixing.
    df = df.fillna(value={"body_parent_discussion_entry_id": 0, "body_assignment_id": 0, "body_submission_id": 0})

    result = df
    result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
    result['metadata_user_id'] = result['metadata_user_id'].apply(convertToStringId)
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(lambda nodeId: "topic_" + str(nodeId))
    result['body_user_id'] = result['body_user_id'].apply(lambda nodeId: "user_" + str(nodeId))
    result["body_discussion_entry_id"] = result["body_discussion_entry_id"].apply(lambda nodeId: "entry_" + str(nodeId))
    result["body_parent_discussion_entry_id"] = result["body_parent_discussion_entry_id"].apply(lambda nodeId: "entry_" + str(nodeId))
    result['body_assignment_id'] = result['body_assignment_id'].apply(lambda nodeId: "assignment_" + str(nodeId))
    result['body_submission_id'] = result['body_submission_id'].apply(lambda nodeId: "submission_" + str(nodeId))
    return result

In [None]:
def discussionTopicCreationInfo(canvasFile):
    """Return the earliest 'discussion_topic_created' event per course topic.

    Only events whose 'body_context_type' is 'Course' are kept. For each
    ('body_context_id', 'body_discussion_topic_id') pair the row with the
    smallest 'metadata_event_time' is retained. The result has a fresh 0..n-1
    index and string ids: 'body_context_id' (offset, then convertToStringId),
    'metadata_user_id' (convertToStringId) and 'body_discussion_topic_id'
    ('topic_' prefix).

    Raises KeyError if any of the referenced columns is missing from the file.
    """
    # Added to body_context_id before it is stringified.
    # NOTE(review): presumably converts a local id into the same id space as
    # metadata_context_id used by the other analyses -- confirm.
    contextIdOffset = 165820000000000000

    df = convertToDataFrame(canvasFile)
    df = df.loc[df['metadata_event_name'] == "discussion_topic_created"]
    # .copy() detaches the filtered rows so the column assignment below writes
    # to an independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['body_context_type'] == 'Course'].copy()
    df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)
    df = df[['collected_at', 'metadata_event_time', 'body_context_id', 'body_discussion_topic_id', 'metadata_user_id', 'metadata_context_role', 'body_title', 'body_body']]
    # Sorting first makes drop_duplicates (keep='first' by default) retain the earliest event.
    dfAgg = df.sort_values(by='metadata_event_time').drop_duplicates(subset=['body_context_id', 'body_discussion_topic_id'])
    result = dfAgg.reset_index(drop=True)
    result['body_context_id'] = result['body_context_id'].apply(lambda idNum: int(idNum) + contextIdOffset)
    result['body_context_id'] = result['body_context_id'].apply(convertToStringId)
    result['metadata_user_id'] = result['metadata_user_id'].apply(convertToStringId)
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(lambda nodeId: "topic_" + str(nodeId))
    return result

In [None]:
def courseInfo(canvasFile):
    """Return the earliest 'course_created' event per course.

    For each 'body_course_id' the row with the smallest 'metadata_event_time'
    is retained; the result has a fresh 0..n-1 index and 'body_course_id'
    rewritten through convertToStringId. If a required column is missing
    (KeyError), a single all-NaN row with the same columns is returned.
    """
    try:
        df = convertToDataFrame(canvasFile)
        # .copy() detaches the filtered rows so the column assignment below
        # writes to an independent frame instead of a slice (avoids
        # SettingWithCopyWarning).
        df = df.loc[df['metadata_event_name'] == "course_created"].copy()
        df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)
        df = df[['collected_at', 'metadata_event_time', 'body_course_id', 'body_created_at', 'body_name', 'body_updated_at']]
        # Sorting first makes drop_duplicates (keep='first' by default) retain the earliest event.
        dfAgg = df.sort_values(by='metadata_event_time').drop_duplicates(subset=['body_course_id'])
        result = dfAgg.reset_index(drop=True)
        result['body_course_id'] = result['body_course_id'].apply(convertToStringId)
        return result
    except KeyError:
        # Placeholder row keeps the expected schema when the file has no usable course data.
        return pd.DataFrame({'collected_at': [np.nan], 'metadata_event_time': [np.nan], 'body_course_id': [np.nan], 'body_created_at': [np.nan], 'body_name': [np.nan], 'body_updated_at': [np.nan]})

In [None]:
def courseSubmissionGrades(canvasFile):
    """Return the latest completed, unmuted 'grade_change' event per submission.

    Only Course-context events with 'body_grading_complete' == True and
    'body_muted' == False are kept. For each (context, assignment, submission,
    student, user) combination the row with the largest 'metadata_event_time'
    is retained. The result has a fresh 0..n-1 index and prefixed string ids
    ('id_', 'user_', 'assignment_', 'submission_', 'student_'). If a required
    column is missing (KeyError), a single all-NaN row with the same columns is
    returned.
    """
    try:
        df = convertToDataFrame(canvasFile)
        df = df.loc[df['metadata_event_name'] == "grade_change"]
        df = df.loc[df['metadata_context_type'] == 'Course']
        df = df.loc[df['body_grading_complete'] == True]
        # .copy() detaches the filtered rows so the column assignment below
        # writes to an independent frame instead of a slice (avoids
        # SettingWithCopyWarning).
        df = df.loc[df['body_muted'] == False].copy()
        df['metadata_event_time'] = df['metadata_event_time'].apply(isoparse)
        df = df[['collected_at', 'metadata_event_time', 'metadata_context_id', 'body_assignment_id', "body_submission_id", "body_score", "body_points_possible", "body_student_id", "body_user_id"]]
        # Sort ascending and keep the last duplicate, i.e. the most recent grade change.
        dfAgg = df.sort_values(by='metadata_event_time').drop_duplicates(subset=['metadata_context_id', 'body_assignment_id', "body_submission_id", "body_student_id", "body_user_id"], keep="last")
        result = dfAgg.reset_index(drop=True)
        result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
        result['body_user_id'] = result['body_user_id'].apply(lambda nodeId: "user_" + str(nodeId))
        result['body_assignment_id'] = result['body_assignment_id'].apply(lambda nodeId: "assignment_" + str(nodeId))
        result['body_submission_id'] = result['body_submission_id'].apply(lambda nodeId: "submission_" + str(nodeId))
        result['body_student_id'] = result['body_student_id'].apply(lambda nodeId: "student_" + str(nodeId))
        return result
    except KeyError:
        # Placeholder row keeps the expected schema when the file has no usable grade data.
        return pd.DataFrame({'collected_at': [np.nan], 'metadata_event_time': [np.nan], 'metadata_context_id': [np.nan], 'body_assignment_id': [np.nan], "body_submission_id": [np.nan], "body_score": [np.nan], "body_points_possible": [np.nan], "body_student_id": [np.nan], "body_user_id": [np.nan]})

In [None]:
def sampleCanvasDataCleanser(canvasFile):
    """Template cleanser: load the file into a DataFrame and return it unchanged."""
    loaded = convertToDataFrame(canvasFile)
    # Placeholder: Pandas-based DataFrame manipulation steps would go here.
    return loaded