In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#################################
#                               #
# Get started with ASA DataFest #
#                               #
#################################

# In this Python script, we demonstrate how to:
# 1) Access the ASA DataFest 2024 dataset.
# 2) Read the data into Pandas dataframes.
# 3) Conduct basic analysis.

#################################
#                               #
#   Step 1: Access the data     #
#                               #
#################################

# To access the ASA dataset, use this link: 
# https://bwsyncandshare.kit.edu/s/ErACwGqLAJ9iyxS  (TODO: this link is to be updated)
# Unzip the data into the same directory as this Python script.
# Read the documentation carefully to understand the contents of the dataset.

#################################
#                               #
#   Step 2: Read the data       #
#                               #
#################################

# Folder of the unzipped DataFest archive, relative to the working directory.
# The path repeats the archive name because the archive nests a folder of the
# same name inside itself.
data_directory = '2024 ASA DataFest Data and Documentation-updated-2024-03-04/2024 ASA DataFest Data and Documentation-updated-2024-03-04/'

# Load every CSV of the 'full_03_04' release into its own dataframe.
checkpoints_eoc = pd.read_csv(f'{data_directory}full_03_04/checkpoints_eoc.csv')
checkpoints_pulse = pd.read_csv(f'{data_directory}full_03_04/checkpoints_pulse.csv')
items = pd.read_csv(f'{data_directory}full_03_04/items.csv')
media_views = pd.read_csv(f'{data_directory}full_03_04/media_views.csv')
page_views = pd.read_csv(f'{data_directory}full_03_04/page_views.csv')
responses = pd.read_csv(f'{data_directory}full_03_04/responses.csv')


In [None]:

#################################
#                               #
#   Step 3: Example Analysis    #
#                               #
#################################

# Question: Are responses more often correct when the student has retried more 
# pages in that chapter?

# Derive each student's mean proportion of points earned per chapter.
# Rows with zero possible points are masked to NaN before dividing: a zero
# denominator would otherwise yield inf (or NaN for 0/0), and a single inf
# would make the whole per-chapter mean infinite. NaN scores are skipped by
# mean(), so such rows simply do not contribute.
responses['score'] = (
    responses['points_earned']
    / responses['points_possible'].mask(responses['points_possible'] == 0)
)
average_scores = responses.groupby(['student_id', 'chapter'])['score'].mean().reset_index()

# Derive each student's mean 'tried_again_clicks' value over the page_views
# rows of each chapter (an average per row, not a total count of clicks).
average_try_again_clicks = page_views.groupby(['student_id', 'chapter'])['tried_again_clicks'].mean().reset_index()

# Left-join the two tables on the composite key ('student_id', 'chapter'):
# every scored student/chapter pair is kept, and pairs without any page_views
# rows get NaN for 'tried_again_clicks'.
results = pd.merge(average_scores, average_try_again_clicks, on=['student_id', 'chapter'], how='left')


In [None]:
# Visualize the results: scatter-plot the average score against the average
# retry clicks, one point per row of `results`, with a red regression line.
# regplot returns the matplotlib Axes it drew on, so the labels and title are
# set directly on that Axes.
lm_model = sns.regplot(
    data=results,
    x='tried_again_clicks',
    y='score',
    line_kws={"color": "red"},
)
lm_model.set_xlabel("Average Retry Clicks")
lm_model.set_ylabel("Average Score")
lm_model.set_title("Scatter Plot of Average Retry Clicks vs Average Score with Regression Line")
plt.show()

#################################
#                               #
#  Step 4: Enjoy DataFest 2024! #
#                               #
#################################
