Add code for data analysis done in the paper.
musically-ut committed Jan 11, 2019
1 parent 247008d commit f4328ca
Showing 7 changed files with 8,052 additions and 4 deletions.
110 changes: 110 additions & 0 deletions .gitignore
@@ -0,0 +1,110 @@
#### joe made this: http://goel.io/joe

#####=== Python ===#####

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

data/
23 changes: 19 additions & 4 deletions README.md
@@ -20,10 +20,25 @@ This code depends on the following packages:
- `preprocesed_weights.csv` contains estimated model parameters for the [HLR model](https://github.com/duolingo/halflife-regression), as described in section 8 of the supplementary materials.
- `observations_1k.csv` contains a set of 1K user-item pairs and associated number of total/correct attempts by every user for given items. This dataset has been curated from a larger dataset released by Duolingo, available [here](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/N8XJME).

## Execution

The code can be executed as follows:

`python memorize.py`

The code will use the default value of the parameter `q` that is set in the code.
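
For intuition, the heart of `memorize.py` is a thinning-based sampler for the next review time, with reviewing intensity proportional to the probability of forgetting, `u(t) = (1/sqrt(q)) * (1 - m(t))`. Below is a minimal sketch, assuming the exponential forgetting curve `m(t) = exp(-n_t * t)` from the paper; the names are illustrative and need not match the repository's API:

    import numpy as np

    def sample_review_time(n_t, q, T):
        """Sample the next review time in [0, T] by thinning a
        homogeneous Poisson process with rate 1/sqrt(q)."""
        max_int = 1.0 / np.sqrt(q)                      # upper bound on u(t)
        t = 0.0
        while True:
            t += np.random.exponential(1.0 / max_int)   # candidate point
            if t > T:
                return None                             # no review within the horizon
            m_t = np.exp(-n_t * t)                      # recall probability at t
            if np.random.uniform() < 1.0 - m_t:         # accept w.p. u(t)/max_int
                return t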

----

# Experiments with Duolingo data

## Pre-processing

Convert the raw data to a Python `dict` keyed by `user_id, lexeme_id`, pruning it along the way to make later reading easier:

python dataset2dict.py ./data/raw/duolingo.csv ./data/duo_dict.dill --success_prob 0.99 --max_days 30
python process_raw_data.py ./data/raw/duolingo.csv ./data/duolingo_reduced.csv
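
The resulting `dill` file can then be loaded for analysis, e.g. (a minimal sketch; `dill` is needed instead of plain `pickle` because the nested dictionary is built from `defaultdict`s with lambda factories):

    import dill

    with open('./data/duo_dict.dill', 'rb') as f:
        duo_dict = dill.load(f)

    # duo_dict[user_id][lexeme_id] is a chronological list of attempts,
    # each a dict with keys such as 'delta_scaled', 'n_correct',
    # 'n_wrong', 'p_recall' and 'timestamp'.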

## Plots

See the notebook `plots.ipynb`.
136 changes: 136 additions & 0 deletions dataset2dict.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python
import pandas as pd
import dill
import click
import os
from collections import defaultdict
import datetime
import numpy as np
import multiprocessing as MP

_df_indexed = None

TIME_SCALE = 24 * 60 * 60


def _column_worker(params):
    """Compute one cumulative-history column per (user_id, lexeme_id) group."""
    idx, success_prob = params
    if idx == 0:
        # Number of successful recalls *before* each attempt.
        return _df_indexed.groupby(level=[0, 1]).p_recall.transform(
            lambda x: np.cumsum([0] + [1 if r >= success_prob else 0 for r in x])[:-1])
    elif idx == 1:
        # Number of failed recalls *before* each attempt.
        return _df_indexed.groupby(level=[0, 1]).p_recall.transform(
            lambda x: np.cumsum([0] + [1 if r < success_prob else 0 for r in x])[:-1])
    elif idx == 2:
        # Total number of attempts *before* each attempt.
        return _df_indexed.groupby(level=[0, 1]).p_recall.transform(
            lambda x: np.arange(len(x)))

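# Worked example (illustrative): for a (user_id, lexeme_id) group with
# p_recall = [0.7, 0.4, 0.9] and success_prob = 0.6, the three workers yield
#   idx == 0: n_correct = [0, 1, 1]   (successes before each attempt)
#   idx == 1: n_wrong   = [0, 0, 1]   (failures before each attempt)
#   idx == 2: n_total   = [0, 1, 2]   (attempts before each attempt)
# The prepended 0 and the [:-1] slice shift the cumulative sums so that
# each row counts only strictly earlier attempts.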

def add_user_lexeme_columns(success_prob):
    """Add 'n_correct', 'n_wrong' and 'n_total' columns to the data-frame."""

    if "history_seen" in _df_indexed.columns:
        _df_indexed['n_correct'] = _df_indexed['history_correct']
        _df_indexed['n_total'] = _df_indexed['history_seen']
        _df_indexed['n_wrong'] = _df_indexed['n_total'] - _df_indexed['n_correct']
        return
    print("No meta information about the total number of exercises; "
          "computing the counts from the recall history.")
    with MP.Pool(3) as pool:
        n_correct, n_wrong, n_total = pool.map(_column_worker,
                                               [(ii, success_prob)
                                                for ii in range(3)])

    _df_indexed['n_correct'] = n_correct
    _df_indexed['n_wrong'] = n_wrong
    _df_indexed['n_total'] = n_total


def convert_csv_to_dict(csv_path,
                        dictionary_output,
                        max_days,
                        success_prob,
                        force=False):
    """Pre-process the CSV file and save it as a dictionary."""

    if os.path.exists(dictionary_output) and not force:
        print('{} already exists and not being forced to over-write it.'.format(dictionary_output))
        return

    start_time = datetime.datetime.now()

    def elapsed():
        return (datetime.datetime.now() - start_time).seconds

    df = pd.read_csv(csv_path)

    # Hack to avoid passing df_indexed as an argument to the worker function.
    if 'n_correct' not in df.columns:
        print('Calculating n_wrong, n_correct and n_total')
        global _df_indexed
        # Sort by timestamp first; mergesort is used for the index sort
        # because it is the only stable sort, preserving the timestamp
        # order within each (user_id, lexeme_id) group.
        _df_indexed = df.set_index(['user_id', 'lexeme_id']).sort_values('timestamp').sort_index(kind='mergesort')

        add_user_lexeme_columns(success_prob)

        df = _df_indexed.reset_index().sort_values('timestamp')

    # Drop all intervals larger than max_days days.
    df = df[df.delta < TIME_SCALE * max_days]

    # results = dill.load(open(results_path, 'rb'))

    # map_lexeme = results['map_lexeme']
    # alpha = results['alpha']
    # beta = results['beta']
    # lexeme_difficulty = results['lexeme_difficulty']

    # n_0 = [lexeme_difficulty[map_lexeme[x]] for x in df.lexeme_id]
    # df['n_0'] = np.abs(n_0)
    # df['n_t'] = df['n_0'] * (alpha[0] ** df['n_correct']) * (beta[0] ** df['n_wrong'])
    # df['m_t'] = np.exp(-df['n_t'] * df['delta'] / TIME_SCALE)

    op_dict = defaultdict(lambda: defaultdict(lambda: []))

    for ii in range(df.shape[0]):
        row = df.iloc[ii]
        u_id, l_id = row.user_id, row.lexeme_id
        delta = row.delta / TIME_SCALE

        op_dict[u_id][l_id].append({
            'delta_scaled': delta,
            'n_wrong': row.n_wrong,
            'n_correct': row.n_correct,
            'p_recall': row.p_recall,
            # 'n_0': row.n_0,
            'timestamp': row.timestamp,
            # 'm_t': row.m_t,
            # 'n_t': row.n_t,
            'user_id': u_id,
            'lexeme_id': l_id
        })

        if ii % 100000 == 0:
            print('Done {:0.2f}%\tElapsed = {} sec'.format(100. * ii / df.shape[0], elapsed()))

    print('Writing {} ...'.format(dictionary_output))
    dill.dump(op_dict, open(dictionary_output, 'wb'))
    print('Done.')


@click.command()
@click.argument('csv_file')
@click.argument('output_dill')
@click.option('--success_prob', 'success_prob', default=0.6, type=float, help='At what recall probability is the trial considered successful.')
@click.option('--max_days', 'max_days', default=30, help='Maximum number of days before a revision.')
@click.option('--force/--no-force', 'force', default=False, help='Force overwrite of existing files.')
def run(csv_file, output_dill, success_prob, max_days, force):
    """Convert CSV_FILE from the Duolingo format to a dictionary and save it
    in OUTPUT_DILL."""
    convert_csv_to_dict(csv_path=csv_file, dictionary_output=output_dill,
                        max_days=max_days, success_prob=success_prob,
                        force=force)


if __name__ == '__main__':
    run()
79 changes: 79 additions & 0 deletions plot_utils.py
@@ -0,0 +1,79 @@
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

SPINE_COLOR = 'grey'

def latexify(fig_width=None, fig_height=None, columns=1, largeFonts=False, font_scale=1):
    """Set up matplotlib's RC params for LaTeX plotting.
    Call this before plotting a figure.

    Parameters
    ----------
    fig_width : float, optional, inches
    fig_height : float, optional, inches
    columns : {1, 2}
    """

    # code adapted from http://www.scipy.org/Cookbook/Matplotlib/LaTeX_Examples

    # Width and max height in inches for IEEE journals taken from
    # computer.org/cms/Computer.org/Journal%20templates/transactions_art_guide.pdf

    assert columns in [1, 2]

    if fig_width is None:
        fig_width = 3.39 if columns == 1 else 6.9  # width in inches

    if fig_height is None:
        golden_mean = (np.sqrt(5) - 1.0) / 2.0  # aesthetic ratio
        fig_height = fig_width * golden_mean    # height in inches

    MAX_HEIGHT_INCHES = 8.0
    if fig_height > MAX_HEIGHT_INCHES:
        print("WARNING: fig_height too large: {:.2f} inches; "
              "reducing to {:.2f} inches.".format(fig_height, MAX_HEIGHT_INCHES))
        fig_height = MAX_HEIGHT_INCHES

    params = {'backend': 'ps',
              'text.latex.preamble': ['\\usepackage{gensymb}'],
              'axes.labelsize': font_scale * 10 if largeFonts else font_scale * 7,  # fontsize for x and y labels
              'axes.titlesize': font_scale * 10 if largeFonts else font_scale * 7,
              'font.size': font_scale * 10 if largeFonts else font_scale * 7,
              'legend.fontsize': font_scale * 10 if largeFonts else font_scale * 7,
              'xtick.labelsize': font_scale * 10 if largeFonts else font_scale * 7,
              'ytick.labelsize': font_scale * 10 if largeFonts else font_scale * 7,
              'text.usetex': True,
              'figure.figsize': [fig_width, fig_height],
              'font.family': 'serif',
              'xtick.minor.size': 0.5,
              'xtick.major.pad': 1.5,
              'xtick.major.size': 1,
              'ytick.minor.size': 0.5,
              'ytick.major.pad': 1.5,
              'ytick.major.size': 1,
              'lines.linewidth': 0.9,
              'lines.markersize': 0.1,
              'hatch.linewidth': 0.5
              }

    matplotlib.rcParams.update(params)
    plt.rcParams.update(params)


def format_axes(ax):
    """Apply the paper's axis styling: hide the top/right spines and grey the rest."""
    for spine in ['top', 'right']:
        ax.spines[spine].set_visible(False)

    for spine in ['left', 'bottom']:
        ax.spines[spine].set_color(SPINE_COLOR)
        ax.spines[spine].set_linewidth(0.5)

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    for axis in [ax.xaxis, ax.yaxis]:
        axis.set_tick_params(direction='out', color=SPINE_COLOR)

    return ax

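A typical usage pattern for these helpers is sketched below (illustrative; `latexify` enables `text.usetex`, so a working LaTeX installation is assumed):

    import matplotlib.pyplot as plt
    from plot_utils import latexify, format_axes

    latexify(columns=1)        # set rcParams *before* creating the figure
    fig, ax = plt.subplots()
    ax.plot([1, 2, 3], [1, 4, 9], label='$x^2$')
    ax.legend()
    format_axes(ax)            # hide top/right spines, grey the rest
    fig.savefig('figure.pdf', bbox_inches='tight')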