In [None]:
import dill
import os
import sys
print(os.path.abspath(''))
sys.path.append(os.path.abspath(''))
from utilities import *

In [None]:
# Regenerate the linked input files when any of them is missing: the first absent
# path triggers one run of link_edx_appinventor.ipynb (IPython %run magic), after
# which the remaining paths are not checked.
for f in [GRADE_PATH, ID_MAP_FILE_PATH, APPINVENTOR_FILE_PATH, EDX_CLICKSTREAM_FILE_PATH]:
    if not os.path.exists(f):
        %run link_edx_appinventor.ipynb
        break

In [None]:
def get_subsection(resource):
    """Return the subsection that ``resource`` belongs to.

    The lookup goes through the notebook-level ``subsection_of_resources``
    mapping; a resource that is not a key of that mapping raises ``KeyError``.
    """
    owning_subsection = subsection_of_resources[resource]
    return owning_subsection


def get_subsection_resources(resource=None, subsection=None):
    """Return the ordered list of resources in one subsection.

    Exactly one of the two arguments should be supplied:

    * ``subsection`` -- the list stored under that key in the notebook-level
      ``resources_by_subsection`` mapping is returned.
    * ``resource`` -- the subsection is first resolved with ``get_subsection``
      and that subsection's list is returned.

    When neither or both arguments are truthy, a diagnostic is printed and
    ``None`` is returned (callers are not interrupted by an exception).
    """
    if subsection and not resource:
        return resources_by_subsection[subsection]
    if resource and not subsection:
        return resources_by_subsection[get_subsection(resource)]

    # Ambiguous or empty request: report which of the two mistakes was made.
    if resource and subsection:
        print("Both resource and subsection were given; pass only one")
    else:
        print("Both resource and subsection are none")
    return None

        
def update_display_names_and_tabs(display_names, tabs, display_name, tab, subsection, tabs_col):
    """Record one resolved page view in the three accumulators, in place.

    ``display_name`` is added to the end of ``display_names``, ``tab`` is added
    to the end of ``tabs_col``, and ``tabs`` remembers ``tab`` as the latest tab
    for ``subsection``. Nothing is returned.
    """
    display_names.extend([display_name])
    tabs[subsection] = tab
    tabs_col.extend([tab])

def should_ignore(user, reset_ignored_users=False):
    """Tell whether ``user`` appears in the notebook-level ``users_to_ignore`` frame.

    The function keeps a running list of the users it has reported as ignored
    in its own attribute ``should_ignore.ignored_users``. The list is created on
    first use and emptied whenever ``reset_ignored_users`` is true.

    Returns ``True`` (after printing the user's cohort and name and recording
    the user) when the username is listed, ``False`` otherwise.
    """
    needs_fresh_list = reset_ignored_users or not hasattr(should_ignore, 'ignored_users')
    if needs_fresh_list:
        should_ignore.ignored_users = []

    # Guard clause: users that are not listed are kept.
    if user not in users_to_ignore.username.values:
        return False

    cohort = users_to_ignore[users_to_ignore.username == user][COHORT_NAME].values[0]
    print("Ignoring {} user {}".format(cohort, user))

    should_ignore.ignored_users.append(user)
    return True

def get_user_urls_dfs():
    """Build one page-view DataFrame per non-ignored user.

    Reads the notebook-level ``grouped`` (clickstream rows grouped by username)
    and, for every user that ``should_ignore`` does not reject, resolves each
    visited page to a resource name and a tab number.

    Other notebook-level names read here: ``course_df`` (with the ``FILE_NAME``
    and ``DISPLAY_NAME`` columns), ``HOMEPAGE_NAMES``, ``extra_urls``,
    ``UNIT_NAV_EVENTS_OLD``, ``subsection_order`` and ``resources_by_subsection``.

    Returns:
        list of DataFrames, one per user that has at least one row with a page,
        sorted by time, with the columns ``time``, ``page``, ``event_type``,
        ``event``, ``display_name``, ``user`` and ``tab``. ``tab`` is ``-1`` when
        the page could not be resolved to a course file, ``0`` for homepage
        entries, and the 1-based position inside the subsection otherwise.
    """
    user_urls_dfs = []

    for user, rows in grouped:
        if should_ignore(user):
            continue

        times = pd.to_datetime(rows['time'])
        pages = rows['page']
        event_types = rows['event_type']
        events = rows['event']

        user_url = pd.DataFrame(
            zip(times, pages, event_types, events),
            columns=['time', 'page', 'event_type', 'event']
        ).dropna(subset=['page'])

        # Check if empty after removing NaN values
        if not user_url.size:
            continue

        user_url = user_url.sort_values(by='time')

        display_names = []
        tabs_col = []

        # Last known tab per subsection; a subsection not seen yet starts on tab 1.
        most_recent_tabs = collections.defaultdict(lambda: 1)

        for _, row in user_url.iterrows():
            p = row.page

            try:
                p_id = p.split('/')[-2]

            # If no display name found, append page url id
            except IndexError:
                display_names.append(p)
                tabs_col.append(-1)
                continue

            # Get the incorrect display name, which we will soon correct
            display_name = course_df[course_df[FILE_NAME] == p_id][DISPLAY_NAME].values

            try:
                display_name = display_name[0]
            except IndexError:
                # The url id is not a known course file: keep the raw id.
                display_names.append(p_id)
                tabs_col.append(-1)
                continue

            if display_name in HOMEPAGE_NAMES:
                display_names.append(display_name)
                tabs_col.append(0)
                continue

            subsection = get_subsection(display_name)
            subsection_resources = get_subsection_resources(subsection=subsection)

            # A trailing integer in the url, when present, is the tab number.
            try:
                last_part = int(p.split('/')[-1])
            except ValueError:
                last_part = None

            try:
                extra_display_name = extra_urls[extra_urls.url == p].display_name.iloc[0]
            except IndexError:
                extra_display_name = None

            # Get urls that come from the homepage
            if extra_display_name:
                assert subsection == get_subsection(extra_display_name)

                tab_num = get_subsection_resources(subsection=subsection).index(extra_display_name) + 1

                update_display_names_and_tabs(display_names, most_recent_tabs, extra_display_name, tab_num, subsection, tabs_col)

            # Get links which end with their tab number
            elif last_part and last_part <= len(subsection_resources) and last_part >= 0:
                display_name = subsection_resources[last_part - 1]
                tab_num = last_part

                update_display_names_and_tabs(display_names, most_recent_tabs, display_name, tab_num, subsection, tabs_col)

            # Otherwise get the most recently accessed resource
            else:
                tab_num = most_recent_tabs[subsection] - 1
                display_name = subsection_resources[tab_num]
                display_names.append(display_name)
                tabs_col.append(tab_num + 1)

            ## Set new tab num for the next page accessed

            # Navigation within subsection
            if row.event_type in UNIT_NAV_EVENTS_OLD:
                most_recent_tabs[subsection] = json.loads(row.event)['new']

            # Navigate to next subsection
            elif row.event_type == 'edx.ui.lms.sequence.next_selected':
                next_position = subsection_order.index(subsection) + 1

                # The last subsection has no successor; indexing past the end
                # would raise IndexError, so there is nothing to update.
                if next_position < len(subsection_order):
                    most_recent_tabs[subsection_order[next_position]] = 1

            # Navigate to previous subsection
            elif row.event_type == 'edx.ui.lms.sequence.previous_selected':
                previous_position = subsection_order.index(subsection) - 1

                # The first subsection has no predecessor; index -1 would wrap
                # around and silently overwrite the *last* subsection's tab.
                if previous_position >= 0:
                    new_subsection = subsection_order[previous_position]
                    new_subsection_resources = resources_by_subsection[new_subsection]
                    most_recent_tabs[new_subsection] = len(new_subsection_resources)

        user_url['display_name'] = np.array(display_names)
        user_url['user'] = user
        user_url['tab'] = tabs_col

        user_urls_dfs.append(user_url)

    return user_urls_dfs

def get_uuid(user):
    """Translate an edX username into the uuid recorded for that person's App Inventor files.

    Two lookups are chained: ``student_profile_df`` gives the value of
    ``mapped_username_on_alfa`` for the edX ``user``; ``appin_files_df`` then
    gives the ``uuid`` of the first row whose ``username`` equals that mapped
    name. A user missing from either frame raises ``IndexError``.
    """
    profile_matches = student_profile_df[student_profile_df.username == user]
    alfa_username = profile_matches['mapped_username_on_alfa'].values[0]

    file_matches = appin_files_df[appin_files_df.username == alfa_username]
    return file_matches['uuid'].values[0]

In [None]:
# Load grades
grade_df = pd.read_csv(GRADE_PATH)
# Rename the CSV's 'Username' header to the USERNAME constant used by later cells.
grade_df = grade_df.rename(columns={'Username': USERNAME})

# Quick look at the cohort column: its distinct values, their counts, and the
# Python type of the third distinct value (the [2] index needs at least three
# distinct cohorts, otherwise it raises IndexError).
print(grade_df[COHORT_NAME].unique())
print(grade_df[COHORT_NAME].value_counts())
print(type(grade_df[COHORT_NAME].unique()[2]))

# Print all values for each column
# (the [3:] slice skips the first three columns)
for col in grade_df.columns[3:]:
    print("{}\n".format(grade_df[col].value_counts())) 

In [None]:
# Load appinventor files
appin_files_df = pd.read_csv(APPINVENTOR_FILE_PATH)
display(appin_files_df)

# Summarise the table: number of distinct usernames and number of rows per username.
print('AppInventor users: {}'.format(appin_files_df[USERNAME].nunique()))
print('Number of files per user:')
print(appin_files_df[USERNAME].value_counts())

In [None]:
# Load id map
# get_uuid() reads this frame's `username` and `mapped_username_on_alfa` columns.
student_profile_df = pd.read_csv(ID_MAP_FILE_PATH)
display(student_profile_df)

In [None]:
# Load the edX clickstream and drop every row whose page is listed in IGNORE_RESOURCES.
clickstream_df = pd.read_csv(EDX_CLICKSTREAM_FILE_PATH)
clickstream_df = clickstream_df[~clickstream_df.page.isin(IGNORE_RESOURCES)]

# Keep only the event types whose name contains neither '/' nor '.', then show
# how often each of those event types occurs.
whitelist = [_ for _ in clickstream_df[EVENT_TYPE].unique() if '/' not in _ and '.' not in _]
print(whitelist)
print("Event types on EDx")
df = clickstream_df[clickstream_df[EVENT_TYPE].isin(whitelist)]
print(df[EVENT_TYPE].value_counts())

In [None]:
# Get who is in the mapping
# user_matches[CLICK_ID_MAP]: id-map usernames that also occur in the clickstream.
# user_matches[ID_MAP_APP]:   the subset of those whose mapped username also occurs
#                             in the App Inventor file table.
unique_users = clickstream_df.username.unique()
user_matches = {CLICK_ID_MAP: [], ID_MAP_APP: []}

# iterrows() yields (index, row) tuples, hence the row[1] accesses below.
for row in student_profile_df.iterrows():
    if row[1].username in unique_users:
        user_matches[CLICK_ID_MAP].append(row[1].username)
        
        if row[1].mapped_username_on_alfa in appin_files_df.username.values:
            user_matches[ID_MAP_APP].append(row[1].username)

print('Clickstream and id_map matches: {}'.format(len(user_matches[CLICK_ID_MAP])))
print('Clickstream and id_map and AppInventor matches: {}'.format(len(user_matches[ID_MAP_APP])))

In [None]:
# Grade rows of the users matched in the clickstream, the id map and the
# App Inventor file table (user_matches[ID_MAP_APP]).
ai_edx_usr_df = grade_df.loc[grade_df[USERNAME].isin(user_matches[ID_MAP_APP])]

print("Cohort for users with activity on AI and EDx:")
print(ai_edx_usr_df[COHORT_NAME].value_counts())

In [None]:
# Grade rows whose cohort is listed in IGNORE_COHORTS; should_ignore() consults this frame.
users_to_ignore = grade_df[grade_df[COHORT_NAME].isin(IGNORE_COHORTS)]

In [None]:
# Walk the course export under COURSE_PATH and collect node records from every
# file whose name ends with SUFFIX; the records become the rows of course_df.
course_nodes = []

for root, _, files in os.walk(COURSE_PATH):
    # `parent` is the name of the directory that directly contains the files.
    # It is only (re)assigned when the directory has files, which is also the
    # only case in which the loop below uses it.
    if files:
        parent = os.path.split(root)[-1]

    for f in files:
        if f.endswith(SUFFIX):
            path = os.path.join(root, f)
            # get_node_info returns an iterable of records for this file
            # (it is passed to list.extend below).
            nodes = get_node_info(path, parent, f)

            course_nodes.extend(nodes)
                
course_df = pd.DataFrame(course_nodes)

# Apply the known display-name corrections to course_df itself.
# Assigning to the row objects yielded by iterrows() is unreliable: depending on
# the dtypes each row can be a copy, in which case course_df is left unchanged.
# Writing the corrected column back makes the fix take effect in every case.
if 'display_name' in course_df.columns:
    course_df['display_name'] = course_df['display_name'].map(
        lambda name: display_name_errors[name] if name in display_name_errors else name
    )


In [None]:
# Collect the clickstream rows whose page url names a known course file
# (the second-to-last '/'-separated component of the url) and print their count.
matches = []

# Hoisted out of the loop: the set of known file names never changes while iterating.
known_file_names = set(course_df[FILE_NAME].values)

for _, click in clickstream_df.iterrows():
    page = click['page']

    # pd.isna() recognises every missing value; an identity test against np.nan
    # only recognises that one particular object and would let other NaNs
    # through to .split(), which fails on floats.
    if pd.isna(page):
        continue

    url_parts = page.split('/')

    # A url without any '/' has no second-to-last component to look up.
    if len(url_parts) < 2:
        continue

    if url_parts[-2] in known_file_names:
        matches.append(click)

print(len(matches))

In [None]:
# Lookup table read by get_user_urls_dfs(): its `url` and `display_name` columns
# map a full page url to a resource name. Every column is read as a string.
extra_urls = pd.read_csv(EXTRA_URLS_CSV, dtype=str)

In [None]:
# Navigation urls, every column read as a string.
# NOTE(review): nav_urls is not referenced by any other cell visible in this
# notebook -- confirm it is still needed.
nav_urls = pd.read_csv(NAV_URLS_CSV, dtype=str)

In [None]:
# Course structure: one row per resource after the transpose, with (at least)
# the columns `unit`, `section` and `subsection` used below and in later cells.
resource_order_df = pd.read_json(RESOURCE_JSON, orient='records').T
resource_order = list(resource_order_df.unit.unique())

# Strip the leading "<digits>-" ordering prefix from section names.
# The pattern is a raw string (no invalid '\d' escape) and regex=True is explicit,
# because newer pandas versions treat the pattern literally by default.
resource_order_df['section'] = resource_order_df['section'].str.replace(r'\d+-', '', regex=True)

In [None]:
def invert_grouping(members_by_group):
    """Return a ``member -> group`` dict for a ``group -> list of members`` dict.

    When a member is listed under several groups, the group that comes last in
    the iteration order of ``members_by_group`` wins.
    """
    return {
        member: group
        for group, members in members_by_group.items()
        for member in members
    }


# Sections and subsections in order of first appearance in resource_order_df.
section_order = list(resource_order_df.section.unique())
subsection_order = list(resource_order_df.subsection.unique())

# section -> resources it contains, and the reverse lookup.
resources_by_section = resource_order_df.groupby('section').unit \
                                        .unique().apply(list).to_dict()
section_of_resources = invert_grouping(resources_by_section)

# subsection -> resources it contains, and the reverse lookup.
resources_by_subsection = resource_order_df.groupby('subsection').unit \
                                          .unique().apply(list).to_dict()
subsection_of_resources = invert_grouping(resources_by_subsection)

In [None]:
# Resolve every user's page views, then derive the homepage-free variant.
grouped = clickstream_df.groupby(by='username')
user_urls_dfs_with_homepage = get_user_urls_dfs()

# Every per-user frame must already be in chronological order.
for u in user_urls_dfs_with_homepage:
    assert u.time.is_monotonic_increasing

# Drop homepage views and keep only the users that still have rows afterwards.
# .copy() makes each filtered frame independent of the frame it was sliced from,
# so the later `u['day'] = ...` assignment writes to a real frame, not to a slice.
user_urls_dfs = []

for u in user_urls_dfs_with_homepage:
    without_homepage = u[~u.display_name.isin(HOMEPAGE_NAMES)].copy()

    # filter out empty dfs
    if without_homepage.size:
        user_urls_dfs.append(without_homepage)

In [None]:
# Test to make sure that resources align with the tab number in their subsection
# Any row whose recorded tab differs from the resource's 1-based position in its
# subsection is printed; no output from the loop means everything is consistent.
for u in user_urls_dfs_with_homepage:
    for _, row in u.iterrows():
        # Homepage entries and the 'search' / 'bookmarks' names are skipped
        # before the subsection lookup.
        if row['display_name'] in HOMEPAGE_NAMES or row['display_name'] in ['search', 'bookmarks']:
            continue
            
        subsection_resources = get_subsection_resources(resource=row['display_name'])
        resource_index = subsection_resources.index(row['display_name']) + 1
    
        if resource_index != row.tab:
            print(resource_index, row.tab, row)

# Users listed in users_to_ignore that should_ignore() never reported as ignored.
print(set(users_to_ignore.username.values) - set(should_ignore.ignored_users))

In [None]:
## Pages that have no clicks
# Every resource of the course order that no user frame mentions is printed.
resources_used = pd.concat(user_urls_dfs).display_name.unique()

for r in resource_order:
    if r in resources_used:
        continue

    print(r)

In [None]:
# Grade rows of the users matched in the clickstream, the id map and the
# App Inventor file table.
# NOTE(review): this repeats the computation of an earlier cell with the same
# inputs -- confirm whether both cells are needed.
ai_edx_usr_df = grade_df.loc[grade_df[USERNAME].isin(user_matches[ID_MAP_APP])]
print("Cohort for users with activity on AI and EDx:")
print(ai_edx_usr_df[COHORT_NAME].value_counts())

In [None]:
# user_times[user] = {SCM: [...], BKY: [...]}: for every clickstream user that is
# also in ai_edx_usr_df, the keys of the mappings returned by get_backups_by_type
# for the user's App Inventor folder. These keys are later written into the
# 'time' column, so they are presumably backup timestamps -- TODO confirm.
user_times = {}

for user, _ in grouped:
    if user in ai_edx_usr_df.username.values:
        uuid = get_uuid(user)

        # The user's backups live in the folder named after their uuid under AI_DATA.
        folder = os.path.join(AI_DATA, uuid)
        scm_files = get_backups_by_type(folder, SCM)
        bky_files = get_backups_by_type(folder, BKY)
        
        user_times[user] = {SCM: list(scm_files.keys()), BKY: list(bky_files.keys())}

In [None]:
# For every user with both edX and App Inventor activity, merge one synthetic row
# per App Inventor backup into the user's page-view frame and re-sort by time.
ai_edx_dfs = []

users_no_scm_bky_interaction = 0

for uu_df in user_urls_dfs:
    user = uu_df.user.iloc[0]

    if user not in ai_edx_usr_df.username.values:
        continue

    # One record per backup; the file type (SCM / BKY) takes the place of the
    # resource name. Columns not listed here are filled with NaN on concat.
    backup_rows = [
        {"time": t, "page": "", "display_name": file_type, "user": user}
        for file_type, times in user_times[user].items()
        for t in times
    ]

    # Ignore ones with no scm or bky interaction
    if not backup_rows:
        users_no_scm_bky_interaction += 1
        continue

    # A single concat replaces growing the frame row by row, which copied the
    # whole frame for every backup and relies on the removed DataFrame.append.
    uu_df = pd.concat([uu_df, pd.DataFrame(backup_rows)], ignore_index=True)
    uu_df = uu_df.sort_values(by='time')
    ai_edx_dfs.append(uu_df)

for a in ai_edx_dfs:
    assert a.time.is_monotonic_increasing

print(users_no_scm_bky_interaction)

# Sanity check against a hard-coded count.
# NOTE(review): 64 presumably belongs to one specific data export -- confirm
# before running this notebook on different data.
assert users_no_scm_bky_interaction == 64

In [None]:
# Event totals: users with App Inventor + edX rows versus users with course rows only.
ai_edx_users = [frame.user.iloc[0] for frame in ai_edx_dfs]
# No user may own more than one merged frame.
assert len(ai_edx_users) == len(set(ai_edx_users))

total_appinventor_edx_events = sum(frame.shape[0] for frame in ai_edx_dfs)
print("Total for AI and EDX interaction users only: {}".format(total_appinventor_edx_events))

total_course_events = 0
for u in user_urls_dfs:
    # Users already counted through their merged frame are skipped here.
    if u.user.iloc[0] in ai_edx_users:
        continue

    total_course_events += len(u)

print("Total for only Course events: {}".format(total_course_events))
print("Total for All AppInventor File Events: {}".format(total_appinventor_edx_events + total_course_events))

In [None]:
# Check if this copy is necessary
# The shallow copy gives all_resource_dfs its own list object, so the append
# below does not also extend ai_edx_dfs. The DataFrames themselves are shared
# between the two lists, not duplicated.
all_resource_dfs = ai_edx_dfs.copy()
ai_edx_dfs_users = [u.user.iloc[0] for u in ai_edx_dfs]

# Add the frames of users that have no merged App Inventor frame.
for u in user_urls_dfs:
    if u.user.iloc[0] not in ai_edx_dfs_users:
        all_resource_dfs.append(u)

In [None]:
# Check if this is necessary
# Adds a 'day' column (the calendar date of each row's time) to every frame,
# modifying the frames in place.
# NOTE(review): all_resource_dfs is built from a shallow copy of ai_edx_dfs plus
# frames taken from user_urls_dfs, so the third loop appears to recompute the
# same column on objects the first two loops already handled -- confirm before
# removing it.
for u in user_urls_dfs:
    u['day'] = u.time.apply(lambda t: t.date())
    
for u in ai_edx_dfs:
    u['day'] = u.time.apply(lambda t: t.date())
    
for u in all_resource_dfs:
    u['day'] = u.time.apply(lambda t: t.date())

In [None]:
# Load the resource categories; empty strings and bare newlines count as missing.
resource_categories = pd.read_csv(CATEGORY_CSV, na_values=['', '\n'])
# Re-key by resource: the result is a dict of dicts,
# {value of CATEGORY_RESOURCE_COLUMN: {remaining column name: value}}.
resource_categories = resource_categories.set_index(CATEGORY_RESOURCE_COLUMN).T.to_dict()

In [None]:
# Serialise the whole interpreter session (every notebook-level name defined
# above) into the file 'user_and_course_dfs.db'.
dill.dump_session('user_and_course_dfs.db')