Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add code for data analysis done in the paper.
- Loading branch information
1 parent
247008d
commit f4328ca
Showing
7 changed files
with
8,052 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#### joe made this: http://goel.io/joe | ||
|
||
#####=== Python ===##### | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# celery beat schedule file | ||
celerybeat-schedule | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
|
||
data/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
#!/usr/bin/env python | ||
import pandas as pd | ||
import dill | ||
import click | ||
import os | ||
from collections import defaultdict | ||
import datetime | ||
import numpy as np | ||
import multiprocessing as MP | ||
|
||
# Module-level DataFrame indexed by (user_id, lexeme_id). Set by
# convert_csv_to_dict before the multiprocessing pool is created so worker
# processes inherit it (via fork) instead of receiving it as an argument.
_df_indexed = None

# Seconds per day: the CSV 'delta' column is in seconds and is scaled to
# days before being stored/compared against max_days.
TIME_SCALE = 24 * 60 * 60
|
||
|
||
def _column_worker(params):
    """Compute one derived per-trial column from the global ``_df_indexed``.

    Parameters
    ----------
    params : tuple(int, float)
        ``(idx, success_prob)`` where ``idx`` selects the column to build:
          * 0 -> 'n_correct': number of *previous* trials in the
                 (user_id, lexeme_id) group with p_recall >= success_prob
          * 1 -> 'n_wrong':   number of previous trials with
                 p_recall < success_prob
          * 2 -> 'n_total':   0-based trial index within the group

    Returns
    -------
    pd.Series aligned with ``_df_indexed``.

    Notes
    -----
    Runs inside a multiprocessing worker; it relies on the parent setting
    ``_df_indexed`` before the pool is forked. The leading 0 / trailing
    ``[:-1]`` shift the cumulative sum by one so the current trial is
    excluded from its own count. (The original wrote the slice inside the
    list for idx==1 and outside the cumsum for idx==0 — equivalent results;
    unified here for readability.)
    """
    idx, success_prob = params
    grouped = _df_indexed.groupby(level=[0, 1]).p_recall
    if idx == 0:
        # Cumulative count of successes strictly before each trial.
        return grouped.transform(
            lambda x: np.cumsum([0] + [1 if r >= success_prob else 0 for r in x])[:-1])
    elif idx == 1:
        # Cumulative count of failures strictly before each trial.
        return grouped.transform(
            lambda x: np.cumsum([0] + [1 if r < success_prob else 0 for r in x])[:-1])
    elif idx == 2:
        # 0-based position of each trial within its group.
        return grouped.transform(lambda x: np.arange(len(x)))
|
||
|
||
def add_user_lexeme_columns(success_prob):
    """Adds 'n_correct', 'n_wrong', 'n_total' column to the data-frame.

    Mutates the module-global ``_df_indexed`` in place; returns None.
    ``success_prob`` is the recall-probability threshold at or above which
    a trial is counted as correct.
    """

    # Fast path: the dataset already carries per-row history counts, so the
    # columns can be derived directly without recomputing cumulative sums.
    if "history_seen" in _df_indexed.columns:
        _df_indexed['n_correct'] = _df_indexed['history_correct']
        _df_indexed['n_total'] = _df_indexed['history_seen']
        _df_indexed['n_wrong']= _df_indexed['n_total']-_df_indexed['n_correct']
        return
    print("No meta info on total number of exercises")
    # One worker per derived column; _column_worker reads _df_indexed as a
    # global, so the frame is shared with the workers via fork rather than
    # pickled into each task.
    with MP.Pool(3) as pool:
        n_correct, n_wrong, n_total = pool.map(_column_worker,
                                               [(ii, success_prob)
                                                for ii in range(3)])

    _df_indexed['n_correct'] = n_correct
    _df_indexed['n_wrong'] = n_wrong
    _df_indexed['n_total'] = n_total
|
||
|
||
def convert_csv_to_dict(csv_path,
                        dictionary_output,
                        max_days,
                        success_prob,
                        force=False):
    """Pre-process the CSV file and save it as a nested dictionary via dill.

    The output maps user_id -> lexeme_id -> chronological list of review
    events, each a dict with the day-scaled revision gap, cumulative
    correct/wrong counts and the observed recall probability.

    Parameters
    ----------
    csv_path : str
        Path to the Duolingo-format CSV file.
    dictionary_output : str
        Path of the dill file to write.
    max_days : int or float
        Rows whose revision gap is `max_days` days or more are dropped.
    success_prob : float
        Recall probability at or above which a trial counts as correct.
    force : bool
        If True, overwrite `dictionary_output` when it already exists.
    """
    if os.path.exists(dictionary_output) and not force:
        print('{} already exists and not being forced to over-write it.'.format(dictionary_output))
        return

    start_time = datetime.datetime.now()

    def elapsed():
        # Whole seconds since the conversion started, for progress logs.
        return (datetime.datetime.now() - start_time).seconds

    df = pd.read_csv(csv_path)

    if 'n_correct' not in df.columns:
        print('Calculating n_wrong, n_correct and n_total')
        # Hack: the module-level global avoids passing the frame as an
        # argument to (and pickling it for) the multiprocessing workers.
        global _df_indexed
        # mergesort is the only stable sort offered here: sort by timestamp
        # first so that each (user_id, lexeme_id) group stays in
        # chronological order after the index sort.
        _df_indexed = df.set_index(['user_id', 'lexeme_id']).sort_values('timestamp').sort_index(kind='mergesort')

        add_user_lexeme_columns(success_prob)

        df = _df_indexed.reset_index().sort_values('timestamp')

    # Drop all intervals of max_days days or longer ('delta' is in seconds).
    df = df[df.delta < TIME_SCALE * max_days]

    op_dict = defaultdict(lambda: defaultdict(lambda: []))

    n_rows = df.shape[0]
    # itertuples is far faster than per-row .iloc and preserves column dtypes.
    for ii, row in enumerate(df.itertuples(index=False)):
        u_id, l_id = row.user_id, row.lexeme_id

        op_dict[u_id][l_id].append({
            'delta_scaled': row.delta / TIME_SCALE,
            'n_wrong': row.n_wrong,
            'n_correct': row.n_correct,
            'p_recall': row.p_recall,
            'timestamp': row.timestamp,
            'user_id': u_id,
            'lexeme_id': l_id
        })

        if ii % 100000 == 0:
            print('Done {:0.2f}%\tElapsed = {} sec'.format(100. * ii / n_rows, elapsed()))

    print('Writing {} ...'.format(dictionary_output))
    # Context manager guarantees the file is flushed and closed even if
    # dill.dump raises (the original left the handle open).
    with open(dictionary_output, 'wb') as f:
        dill.dump(op_dict, f)
    print('Done.')
|
||
|
||
@click.command()
@click.argument('csv_file')
@click.argument('output_dill')
@click.option('--success_prob', 'success_prob', default=0.6, type=float, help='At what recall probability is the trial considered successful.')
@click.option('--max_days', 'max_days', default=30, help='Maximum number of days before a revision.')
@click.option('--force/--no-force', 'force', default=False, help='Force overwrite of existing files.')
def run(csv_file, output_dill, success_prob, max_days, force):
    """Converts the CSV_FILE from Duolingo format to a dictionary and saves
    it in OUTPUT_DILL."""
    # NOTE: the original docstring claimed results were read from a
    # RESULTS_PATH, but no such argument exists; this command only converts.
    convert_csv_to_dict(csv_path=csv_file, dictionary_output=output_dill,
                        max_days=max_days, success_prob=success_prob,
                        force=force)
|
||
|
||
if __name__ == '__main__':
    # CLI entry point: click parses sys.argv and dispatches to run().
    run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
import matplotlib | ||
|
||
# Colour applied to the visible axis spines and tick marks in format_axes().
SPINE_COLOR = 'grey'
def latexify(fig_width=None, fig_height=None, columns=1, largeFonts=False, font_scale=1):
    """Set up matplotlib's RC params for LaTeX plotting.
    Call this before plotting a figure.
    Parameters
    ----------
    fig_width : float, optional, inches
    fig_height : float, optional, inches
    columns : {1, 2}
    largeFonts : bool, use 10pt (scaled) fonts instead of 7pt
    font_scale : float, multiplier applied to every font size
    """

    # code adapted from http://www.scipy.org/Cookbook/Matplotlib/LaTeX_Examples

    # Width and max height in inches for IEEE journals taken from
    # computer.org/cms/Computer.org/Journal%20templates/transactions_art_guide.pdf

    assert(columns in [1, 2])

    if fig_width is None:
        fig_width = 3.39 if columns == 1 else 6.9  # width in inches

    if fig_height is None:
        golden_mean = (np.sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
        fig_height = fig_width * golden_mean  # height in inches

    MAX_HEIGHT_INCHES = 8.0
    if fig_height > MAX_HEIGHT_INCHES:
        # BUG FIX: the original concatenated str + float and raised a
        # TypeError whenever this branch ran; format the numbers instead.
        print("WARNING: fig_height too large: {} so will reduce to {} inches.".format(
            fig_height, MAX_HEIGHT_INCHES))
        fig_height = MAX_HEIGHT_INCHES

    # Single shared font size: 10pt when largeFonts else 7pt, times the scale
    # (the original repeated this conditional for every text rc key).
    size = font_scale * 10 if largeFonts else font_scale * 7
    params = {'backend': 'ps',
              # NOTE(review): matplotlib >= 3.3 expects a plain string here,
              # not a list — confirm against the pinned matplotlib version.
              'text.latex.preamble': ['\\usepackage{gensymb}'],
              'axes.labelsize': size,  # fontsize for x and y labels (was 10)
              'axes.titlesize': size,
              'font.size': size,  # was 10
              'legend.fontsize': size,  # was 10
              'xtick.labelsize': size,
              'ytick.labelsize': size,
              'text.usetex': True,
              'figure.figsize': [fig_width, fig_height],
              'font.family': 'serif',
              'xtick.minor.size': 0.5,
              'xtick.major.pad': 1.5,
              'xtick.major.size': 1,
              'ytick.minor.size': 0.5,
              'ytick.major.pad': 1.5,
              'ytick.major.size': 1,
              'lines.linewidth': 0.9,
              'lines.markersize': 0.1,
              'hatch.linewidth': 0.5
              }

    matplotlib.rcParams.update(params)
    plt.rcParams.update(params)
|
||
|
||
def format_axes(ax):
    """Strip the top/right spines of *ax*, grey the remaining ones and
    push the tick marks outward; returns the same Axes for chaining."""

    # Hide the frame edges that carry no information.
    for name in ('top', 'right'):
        ax.spines[name].set_visible(False)

    # Thin, grey left/bottom spines.
    for name in ('left', 'bottom'):
        spine = ax.spines[name]
        spine.set_color(SPINE_COLOR)
        spine.set_linewidth(0.5)

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    # Outward-pointing ticks in the same grey as the spines.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_tick_params(direction='out', color=SPINE_COLOR)

    return ax
|
Oops, something went wrong.