In [None]:
import json
import jsonlines
import pandas as pd
import dateutil
from dateutil.parser import isoparse
from DirectoryGenerator import DirectoryGenerator
from DataReader import readJSONL
from datetime import datetime
import numpy as np
from bs4 import BeautifulSoup
import unicodedata
import warnings

In [None]:
# Shared DirectoryGenerator instance; convertToDataFrame uses its getDelimiter()
# to split a file path into segments.
dirGen = DirectoryGenerator()

In [None]:
def convertToStringId(idNum):
    """Return the text form of idNum with an 'id_' prefix (e.g. 7 -> 'id_7')."""
    return "id_{}".format(str(idNum))

In [None]:
def convertToDateTime(date, time):
    """Combine a date string and a time string into a datetime.

    date is expected as 'YYYY-MM-DD' and time as 'HH-MM-SS' (dash-separated);
    the two are joined with a single space and parsed with strptime, so a
    mismatching format raises ValueError.
    """
    stamp = " ".join((date, time))
    return datetime.strptime(stamp, '%Y-%m-%d %H-%M-%S')

In [None]:
def convertToDataFrame(canvasFile):
    """Load a file through readJSONL into a DataFrame and stamp it with a datetime.

    The path is split on dirGen.getDelimiter(); the second-to-last segment is
    taken as the date and the last segment, up to its first '.', as the time.
    Both are parsed by convertToDateTime and the resulting value is written to
    every row in a 'datetime' column.
    """
    records = readJSONL(canvasFile)
    frame = pd.DataFrame.from_dict(records)

    # Path layout relied on here: .../<date>/<time>.<extension>
    pathParts = canvasFile.split(dirGen.getDelimiter())
    datePart = pathParts[-2]
    timePart = pathParts[-1].split('.')[0]

    frame['datetime'] = convertToDateTime(datePart, timePart)
    return frame

In [None]:
def userTypeEventNameAnalysis(canvasFile):
    """Count rows per (datetime, metadata_context_role, metadata_event_name).

    Returns a DataFrame holding the three grouping columns plus a 'total'
    column with the row count of each group.
    """
    groupKeys = ['datetime', 'metadata_context_role', 'metadata_event_name']
    events = convertToDataFrame(canvasFile)
    return (
        events.groupby(groupKeys)['datetime']
        .count()
        .to_frame(name='total')
        .reset_index()
    )

In [None]:
def loggedInCountAnalysis(canvasFile):
    """Count the rows whose metadata_event_name is "logged_in", per 'datetime'.

    Returns a DataFrame with the columns 'datetime' and 'total'.
    """
    events = convertToDataFrame(canvasFile)
    isLogin = events['metadata_event_name'] == "logged_in"
    loginCounts = events.loc[isLogin].groupby(['datetime'])['datetime'].count()
    return loginCounts.to_frame(name='total').reset_index()

In [None]:
def loggedInCountPerUserAnalysis(canvasFile):
    """Count "logged_in" rows per (datetime, metadata_user_id).

    Returns a DataFrame with 'datetime', 'metadata_user_id' (rewritten through
    convertToStringId) and 'total'.
    """
    events = convertToDataFrame(canvasFile)
    logins = events.loc[events['metadata_event_name'] == "logged_in"]
    perUser = (
        logins.groupby(['datetime', 'metadata_user_id'])['datetime']
        .count()
        .to_frame(name='total')
        .reset_index()
    )
    return perUser.assign(
        metadata_user_id=lambda frame: frame['metadata_user_id'].apply(convertToStringId)
    )

In [None]:
def assetCategoryPerContextAnalysis(canvasFile):
    """Count "asset_accessed" rows per course context and body_category.

    Rows are grouped by datetime, metadata_context_id, metadata_context_type
    and body_category; only groups whose context type is 'Course' are kept and
    the context-type column is then removed. metadata_context_id is rewritten
    through convertToStringId.

    Returns the counts in a 'total' column, or an empty DataFrame when a
    KeyError is raised along the way (for example a column is absent).
    """
    groupKeys = ['datetime', 'metadata_context_id', 'metadata_context_type', 'body_category']
    try:
        events = convertToDataFrame(canvasFile)
        accessed = events.loc[events['metadata_event_name'] == "asset_accessed"]
        counts = (
            accessed.groupby(groupKeys)['datetime']
            .count()
            .to_frame(name='total')
            .reset_index()
        )
        courseCounts = (
            counts.loc[counts['metadata_context_type'] == 'Course']
            .drop(columns=['metadata_context_type'])
        )
        courseCounts['metadata_context_id'] = courseCounts['metadata_context_id'].apply(convertToStringId)
        return courseCounts
    except KeyError:
        return pd.DataFrame()

In [None]:
def userRoleListPerDatetime(canvasFile):
    """List the distinct (datetime, metadata_user_id, metadata_context_role) groups.

    The groups come from a groupby/count whose count column is discarded, so
    the result has one row per group. metadata_user_id is rewritten through
    convertToStringId.
    """
    roleKeys = ['datetime', 'metadata_user_id', 'metadata_context_role']
    events = convertToDataFrame(canvasFile)
    userRoles = (
        events.groupby(roleKeys)['datetime']
        .count()
        .to_frame(name='total')
        .reset_index()
        .drop(columns=['total'])
    )
    return userRoles.assign(
        metadata_user_id=lambda frame: frame['metadata_user_id'].apply(convertToStringId)
    )

In [None]:
def conversationNetworkAnalysis(canvasFile):
    """Count "conversation_message_created" rows per user, author and conversation.

    Groups by datetime, metadata_user_id, body_author_id and
    body_conversation_id and stores the group size in 'total'. The id columns
    are turned into prefixed strings: 'auth_' for body_author_id, 'conv_' for
    body_conversation_id, and convertToStringId for metadata_user_id.

    Returns an empty DataFrame when a KeyError is raised along the way (for
    example a column is absent).
    """
    groupKeys = ['datetime', 'metadata_user_id', 'body_author_id', 'body_conversation_id']
    nodePrefixes = (('body_author_id', "auth_"), ('body_conversation_id', "conv_"))
    try:
        events = convertToDataFrame(canvasFile)
        messages = events.loc[events['metadata_event_name'] == 'conversation_message_created']
        edges = (
            messages.groupby(groupKeys)['datetime']
            .count()
            .to_frame(name='total')
            .reset_index()
        )
        for column, prefix in nodePrefixes:
            # bind prefix as a default so each lambda keeps its own value
            edges[column] = edges[column].apply(lambda nodeId, prefix=prefix: prefix + str(nodeId))
        edges['metadata_user_id'] = edges['metadata_user_id'].apply(convertToStringId)
        return edges
    except KeyError:
        return pd.DataFrame()

In [None]:
def courseDiscussionUserEntriesAnalysis(canvasFile):
    """Count "discussion_entry_created" rows in Course contexts per user and topic.

    Groups by datetime, metadata_context_id, metadata_context_role,
    metadata_user_id, body_discussion_topic_id and body_user_id, storing the
    group size in 'total'. Id columns become prefixed strings ('id_' via
    convertToStringId, 'topic_', 'user_').

    KeyError is not caught here: a column missing from the file propagates to
    the caller.
    """
    groupKeys = ['datetime', 'metadata_context_id', 'metadata_context_role',
                 'metadata_user_id', "body_discussion_topic_id", "body_user_id"]

    events = convertToDataFrame(canvasFile)
    isCourseEntry = (
        (events['metadata_event_name'] == "discussion_entry_created")
        & (events['metadata_context_type'] == 'Course')
    )
    counts = (
        events.loc[isCourseEntry]
        .groupby(groupKeys)['datetime']
        .count()
        .to_frame(name='total')
        .reset_index()
    )

    for column in ('metadata_context_id', 'metadata_user_id'):
        counts[column] = counts[column].apply(convertToStringId)
    counts['body_discussion_topic_id'] = counts['body_discussion_topic_id'].apply(
        lambda nodeId: "topic_" + str(nodeId))
    counts['body_user_id'] = counts['body_user_id'].apply(
        lambda nodeId: "user_" + str(nodeId))
    return counts

In [None]:
def courseDiscussionUserEntriesWithRepliesAnalysis(canvasFile):
    """Return one row per "discussion_entry_created" event in a Course context.

    The result is restricted to a fixed column list; any of those columns that
    the file does not provide is created and filled with NaN. A missing
    body_parent_discussion_entry_id is replaced by 0 before the id columns are
    turned into prefixed strings ('id_' via convertToStringId, 'topic_',
    'user_', 'entry_'), so an entry without a parent ends up pointing at an
    'entry_0...' placeholder.
    """
    columns = ['datetime', 'metadata_event_name', 'metadata_context_id', 'metadata_context_role', 'metadata_user_id', "body_discussion_topic_id", "body_discussion_entry_id", "body_user_id", "body_parent_discussion_entry_id"]
    nodePrefixes = {
        'body_discussion_topic_id': "topic_",
        'body_user_id': "user_",
        "body_discussion_entry_id": "entry_",
        "body_parent_discussion_entry_id": "entry_",
    }

    events = convertToDataFrame(canvasFile)
    entries = events.loc[events['metadata_event_name'] == "discussion_entry_created"]
    entries = entries.loc[entries['metadata_context_type'] == 'Course']

    # Columns the file did not provide are added as all-NaN columns.
    absent = {column: np.nan for column in columns if column not in entries.columns}
    result = (
        entries.assign(**absent)[columns]
        .fillna(value={"body_parent_discussion_entry_id": 0})
    )

    for column in ('metadata_context_id', 'metadata_user_id'):
        result[column] = result[column].apply(convertToStringId)
    for column, prefix in nodePrefixes.items():
        # bind prefix as a default so each lambda keeps its own value
        result[column] = result[column].apply(lambda nodeId, prefix=prefix: prefix + str(nodeId))
    return result

In [None]:
def courseGradedDiscussionUserEntriesAnalysis(canvasFile):
    """Count "discussion_entry_submitted" rows in Course contexts per user and topic.

    Groups by datetime, metadata_context_id, metadata_context_role,
    metadata_user_id, body_discussion_topic_id and body_user_id, storing the
    group size in 'total'. Id columns become prefixed strings ('id_' via
    convertToStringId, 'topic_', 'user_').

    KeyError is not caught here: a column missing from the file propagates to
    the caller.
    """
    groupKeys = ['datetime', 'metadata_context_id', 'metadata_context_role',
                 'metadata_user_id', "body_discussion_topic_id", "body_user_id"]

    def withPrefix(prefix):
        # Build the per-value formatter for one id column.
        return lambda nodeId: prefix + str(nodeId)

    events = convertToDataFrame(canvasFile)
    submitted = events.loc[events['metadata_event_name'] == "discussion_entry_submitted"]
    inCourse = submitted.loc[submitted['metadata_context_type'] == 'Course']

    counts = inCourse.groupby(groupKeys)['datetime'].count()
    result = counts.to_frame(name='total').reset_index()

    return result.assign(
        metadata_context_id=result['metadata_context_id'].apply(convertToStringId),
        metadata_user_id=result['metadata_user_id'].apply(convertToStringId),
        body_discussion_topic_id=result['body_discussion_topic_id'].apply(withPrefix("topic_")),
        body_user_id=result['body_user_id'].apply(withPrefix("user_")),
    )

In [None]:
def courseGradedDiscussionUserEntriesWithRepliesAnalysis(canvasFile):
    """Return one row per "discussion_entry_submitted" event in a Course context.

    The result is restricted to a fixed column list; any of those columns that
    the file does not provide is created and filled with NaN. Missing values
    in body_parent_discussion_entry_id, body_assignment_id and
    body_submission_id are replaced by 0 before the id columns are turned into
    prefixed strings ('id_' via convertToStringId, 'topic_', 'user_',
    'entry_', 'assignment_', 'submission_').
    """
    columns = ['datetime', 'metadata_event_name', 'metadata_context_id', 'metadata_context_role', 'metadata_user_id', "body_assignment_id", "body_discussion_topic_id", "body_discussion_entry_id", "body_submission_id", "body_user_id", "body_parent_discussion_entry_id"]
    zeroFill = {"body_parent_discussion_entry_id": 0, "body_assignment_id": 0, "body_submission_id": 0}
    nodePrefixes = {
        'body_discussion_topic_id': "topic_",
        'body_user_id': "user_",
        "body_discussion_entry_id": "entry_",
        "body_parent_discussion_entry_id": "entry_",
        'body_assignment_id': "assignment_",
        'body_submission_id': "submission_",
    }

    events = convertToDataFrame(canvasFile)
    entries = events.loc[events['metadata_event_name'] == "discussion_entry_submitted"]
    entries = entries.loc[entries['metadata_context_type'] == 'Course']

    # Columns the file did not provide are added as all-NaN columns.
    absent = {column: np.nan for column in columns if column not in entries.columns}
    result = entries.assign(**absent)[columns].fillna(value=zeroFill)

    for column in ('metadata_context_id', 'metadata_user_id'):
        result[column] = result[column].apply(convertToStringId)
    for column, prefix in nodePrefixes.items():
        # bind prefix as a default so each lambda keeps its own value
        result[column] = result[column].apply(lambda nodeId, prefix=prefix: prefix + str(nodeId))
    return result

In [None]:
def courseSubmissionGrades(canvasFile):
    """Return the latest "grade_change" row per submission in Course contexts.

    Only rows with body_grading_complete == True and body_muted == False are
    considered. metadata_event_time is parsed with isoparse, rows are sorted by
    it, and for each (context, assignment, submission, student, user)
    combination the last row is kept. Id columns become prefixed strings
    ('id_' via convertToStringId, 'user_', 'assignment_', 'submission_',
    'student_').

    Returns an empty DataFrame when a KeyError is raised along the way (for
    example a column is absent).
    """
    keptColumns = ['datetime', 'metadata_event_time', 'metadata_context_id', 'body_assignment_id', "body_submission_id", "body_score", "body_points_possible", "body_student_id", "body_user_id"]
    dedupKeys = ['metadata_context_id', 'body_assignment_id', "body_submission_id", "body_student_id", "body_user_id"]
    nodePrefixes = (
        ('body_user_id', "user_"),
        ('body_assignment_id', "assignment_"),
        ('body_submission_id', "submission_"),
        ('body_student_id', "student_"),
    )
    try:
        events = convertToDataFrame(canvasFile)
        grades = events.loc[events['metadata_event_name'] == "grade_change"]
        grades = grades.loc[grades['metadata_context_type'] == 'Course']
        grades = grades.loc[grades['body_grading_complete'] == True]
        grades = grades.loc[grades['body_muted'] == False]
        grades = grades.assign(metadata_event_time=grades['metadata_event_time'].apply(isoparse))

        result = (
            grades[keptColumns]
            .sort_values(by='metadata_event_time')
            .drop_duplicates(subset=dedupKeys, keep="last")
            .reset_index()
            .drop(columns=['index'])
        )
        result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
        for column, prefix in nodePrefixes:
            # bind prefix as a default so each lambda keeps its own value
            result[column] = result[column].apply(lambda nodeId, prefix=prefix: prefix + str(nodeId))
        return result
    except KeyError:
        return pd.DataFrame()

In [None]:
def discussionTopicUpdatedUserRoleCount(canvasFile):
    """Count "discussion_topic_updated" rows in Course contexts per role.

    Returns a DataFrame with 'datetime', 'metadata_context_role' and 'total'.
    """
    events = convertToDataFrame(canvasFile)
    isCourseUpdate = (
        (events['metadata_event_name'] == "discussion_topic_updated")
        & (events['metadata_context_type'] == 'Course')
    )
    roleCounts = (
        events.loc[isCourseUpdate]
        .groupby(['datetime', 'metadata_context_role'])['datetime']
        .count()
    )
    return roleCounts.to_frame('total').reset_index()

In [None]:
def discussionTopicCreatedUserRoleCount(canvasFile):
    """Count "discussion_topic_created" rows in Course contexts per role.

    Returns a DataFrame with 'datetime', 'metadata_context_role' and 'total'.
    """
    events = convertToDataFrame(canvasFile)
    created = events.loc[events['metadata_event_name'] == "discussion_topic_created"]
    inCourse = created.loc[created['metadata_context_type'] == 'Course']
    return (
        inCourse.groupby(['datetime', 'metadata_context_role'])['datetime']
        .count()
        .to_frame('total')
        .reset_index()
    )

In [None]:
def discussionTopicCreationUserRoles(canvasFile):
    """Return the role recorded on the first creation row of each course topic.

    "discussion_topic_created" rows in Course contexts are sorted by their
    parsed metadata_event_time, and the first row per
    (metadata_context_id, body_discussion_topic_id) is kept. The result has
    the columns datetime, metadata_event_time, metadata_context_id ('id_'
    prefixed via convertToStringId), body_discussion_topic_id ('topic_'
    prefixed) and metadata_context_role.
    """
    keptColumns = ['datetime', 'metadata_event_time', 'metadata_context_id', 'body_discussion_topic_id', 'metadata_context_role']
    topicKey = ['metadata_context_id', 'body_discussion_topic_id']

    events = convertToDataFrame(canvasFile)
    topics = events.loc[events['metadata_event_name'] == "discussion_topic_created"]
    topics = topics.loc[topics['metadata_context_type'] == 'Course']
    topics = topics.assign(metadata_event_time=topics['metadata_event_time'].apply(isoparse))

    result = (
        topics[keptColumns]
        .sort_values(by='metadata_event_time')
        .drop_duplicates(subset=topicKey)
        .reset_index()
        .drop(columns=['index'])
    )
    result['metadata_context_id'] = result['metadata_context_id'].apply(convertToStringId)
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(
        lambda nodeId: "topic_" + str(nodeId))
    return result

In [None]:
def discussionTopicCreationUserRolesV2(canvasFile, contextIdOffset=165820000000000000):
    """Return the role recorded on the first creation row of each course topic,
    keyed by the body-level context id.

    Unlike discussionTopicCreationUserRoles, the Course filter and the context
    id come from body_context_type / body_context_id. Rows are sorted by their
    parsed metadata_event_time and the first row per
    (body_context_id, body_discussion_topic_id) is kept.

    Parameters:
        canvasFile: path handed to convertToDataFrame.
        contextIdOffset: integer added to int(body_context_id) before the
            'id_' prefix is applied. Defaults to the value that used to be
            hard-coded here, so existing one-argument calls are unaffected.
            NOTE(review): presumably this lifts the body-level id into the
            same id space as metadata_context_id -- confirm.

    Returns:
        DataFrame with datetime, metadata_event_time, body_context_id
        ('id_' + offset id), body_discussion_topic_id ('topic_' prefixed) and
        metadata_context_role.

    KeyError is not caught here: a column missing from the file propagates to
    the caller.
    """
    keptColumns = ['datetime', 'metadata_event_time', 'body_context_id', 'body_discussion_topic_id', 'metadata_context_role']
    topicKey = ['body_context_id', 'body_discussion_topic_id']

    events = convertToDataFrame(canvasFile)
    topics = events.loc[events['metadata_event_name'] == "discussion_topic_created"]
    topics = topics.loc[topics['body_context_type'] == 'Course']
    topics = topics.assign(metadata_event_time=topics['metadata_event_time'].apply(isoparse))

    result = (
        topics[keptColumns]
        .sort_values(by='metadata_event_time')
        .drop_duplicates(subset=topicKey)
        .reset_index()
        .drop(columns=['index'])
    )
    # Shift the id by the configurable offset, then apply the usual 'id_' prefix.
    result['body_context_id'] = result['body_context_id'].apply(
        lambda idNum: convertToStringId(int(idNum) + contextIdOffset))
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(
        lambda nodeId: "topic_" + str(nodeId))
    return result

In [None]:
def discussionTopicCreationUserIDs(canvasFile):
    """Return the user recorded on the first creation row of each course topic.

    "discussion_topic_created" rows in Course contexts are sorted by their
    parsed metadata_event_time, and the first row per
    (metadata_context_id, body_discussion_topic_id) is kept. The result has
    the columns datetime, metadata_event_time, metadata_context_id and
    metadata_user_id (both 'id_' prefixed via convertToStringId) and
    body_discussion_topic_id ('topic_' prefixed).
    """
    keptColumns = ['datetime', 'metadata_event_time', 'metadata_context_id', 'body_discussion_topic_id', 'metadata_user_id']
    topicKey = ['metadata_context_id', 'body_discussion_topic_id']

    events = convertToDataFrame(canvasFile)
    isCourseTopic = (
        (events['metadata_event_name'] == "discussion_topic_created")
        & (events['metadata_context_type'] == 'Course')
    )
    topics = events.loc[isCourseTopic].assign(
        metadata_event_time=lambda frame: frame['metadata_event_time'].apply(isoparse)
    )

    firstCreated = (
        topics[keptColumns]
        .sort_values(by='metadata_event_time')
        .drop_duplicates(subset=topicKey)
        .reset_index()
        .drop(columns=['index'])
    )
    for column in ('metadata_context_id', 'metadata_user_id'):
        firstCreated[column] = firstCreated[column].apply(convertToStringId)
    firstCreated['body_discussion_topic_id'] = firstCreated['body_discussion_topic_id'].apply(
        lambda nodeId: "topic_" + str(nodeId))
    return firstCreated

In [None]:
def discussionTopicCreationContents(canvasFile):
    """Return title and body of the first creation row of each course topic.

    "discussion_topic_created" rows whose body_context_type is 'Course' are
    sorted by their parsed metadata_event_time, and the first row per
    (body_context_id, body_discussion_topic_id) is kept. The result has the
    columns datetime, metadata_event_time, body_context_id ('id_' prefixed via
    convertToStringId), body_discussion_topic_id ('topic_' prefixed),
    body_title and body_body. Title and body are returned as stored; the
    HTML-to-text step is disabled.

    NOTE(review): body_context_id is prefixed without the numeric offset that
    discussionTopicCreationUserRolesV2 adds -- confirm whether the two are
    meant to produce matching ids.
    """
    def htmlToString(htmlDataRaw):
        # Not called at the moment (see the disabled lines at the end).
        # Strips markup with BeautifulSoup, applies NFKD normalisation and
        # replaces newline/tab/carriage-return with spaces; backspace is removed.
        warnings.filterwarnings("ignore", module='bs4')
        text = BeautifulSoup(htmlDataRaw, 'lxml').get_text(strip = True, separator = " ")
        text = unicodedata.normalize("NFKD", text)
        for char, replacement in (("\n", " "), ("\t", " "), ("\r", " "), ("\b", "")):
            text = text.replace(char, replacement)
        return text

    keptColumns = ['datetime', 'metadata_event_time', 'body_context_id', 'body_discussion_topic_id', 'body_title', 'body_body']
    topicKey = ['body_context_id', 'body_discussion_topic_id']

    events = convertToDataFrame(canvasFile)
    topics = events.loc[events['metadata_event_name'] == "discussion_topic_created"]
    topics = topics.loc[topics['body_context_type'] == 'Course']
    topics = topics.assign(metadata_event_time=topics['metadata_event_time'].apply(isoparse))

    result = (
        topics[keptColumns]
        .sort_values(by='metadata_event_time')
        .drop_duplicates(subset=topicKey)
        .reset_index()
        .drop(columns=['index'])
    )
    result['body_context_id'] = result['body_context_id'].apply(convertToStringId)
    result['body_discussion_topic_id'] = result['body_discussion_topic_id'].apply(
        lambda nodeId: "topic_" + str(nodeId))
    # HTML clean-up of the text columns is currently disabled:
    #result['body_title'] = result['body_title'].apply(htmlToString)
    #result['body_body'] = result['body_body'].apply(htmlToString)
    return result

In [None]:
def sampleCanvasDataCleanser(canvasFile):
    """Template for new analyses: load the file and return the DataFrame as is."""
    # Some dataframe manipulation algorithms here courtesy of Pandas
    return convertToDataFrame(canvasFile)