Add code for data analysis done in the paper.
musically-ut committed Jan 11, 2019
1 parent 247008d commit f4328ca
Showing 7 changed files with 8,052 additions and 4 deletions.
110 changes: 110 additions & 0 deletions .gitignore
@@ -0,0 +1,110 @@
#### joe made this: http://goel.io/joe

#####=== Python ===#####

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

data/
23 changes: 19 additions & 4 deletions README.md
@@ -20,10 +20,25 @@ This code depends on the following packages:
- `preprocesed_weights.csv` contains estimated model parameters for the [HLR model](https://github.com/duolingo/halflife-regression), as described in section 8 of the supplementary materials.
- `observations_1k.csv` contains a set of 1K user-item pairs and associated number of total/correct attempts by every user for given items. This dataset has been curated from a larger dataset released by Duolingo, available [here](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/N8XJME).

## Execution

The code can be executed as follows:

`python memorize.py`

The code will use the default value of the parameter `q` that is set in the code.
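
For intuition, the heart of `memorize.py` is a thinning-based sampler for the next review time, with reviewing intensity proportional to the probability of forgetting, `u(t) = (1/sqrt(q)) * (1 - m(t))`. Below is a minimal sketch, assuming the exponential forgetting curve `m(t) = exp(-n_t * t)` from the paper; the names are illustrative and need not match the repository's API:

    import numpy as np

    def sample_review_time(n_t, q, T):
        """Sample the next review time in [0, T] by thinning a
        homogeneous Poisson process with rate 1/sqrt(q)."""
        max_int = 1.0 / np.sqrt(q)                      # upper bound on u(t)
        t = 0.0
        while True:
            t += np.random.exponential(1.0 / max_int)   # candidate point
            if t > T:
                return None                             # no review within the horizon
            m_t = np.exp(-n_t * t)                      # recall probability at t
            if np.random.uniform() < 1.0 - m_t:         # accept w.p. u(t)/max_int
                return t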

----

# Experiments with Duolingo data

## Pre-processing

Convert the raw data to a Python `dict` keyed by `user_id, lexeme_id`, pruning it along the way to make later reading easier:

python dataset2dict.py ./data/raw/duolingo.csv ./data/duo_dict.dill --success_prob 0.99 --max_days 30
python process_raw_data.py ./data/raw/duolingo.csv ./data/duolingo_reduced.csv
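
The resulting `dill` file can then be loaded for analysis, e.g. (a minimal sketch; `dill` is needed instead of plain `pickle` because the nested dictionary is built from `defaultdict`s with lambda factories):

    import dill

    with open('./data/duo_dict.dill', 'rb') as f:
        duo_dict = dill.load(f)

    # duo_dict[user_id][lexeme_id] is a chronological list of attempts,
    # each a dict with keys such as 'delta_scaled', 'n_correct',
    # 'n_wrong', 'p_recall' and 'timestamp'.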

## Plots

See the notebook `plots.ipynb`.
136 changes: 136 additions & 0 deletions dataset2dict.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python
import pandas as pd
import dill
import click
import os
from collections import defaultdict
import datetime
import numpy as np
import multiprocessing as MP

_df_indexed = None

TIME_SCALE = 24 * 60 * 60


def _column_worker(params):
    """Compute one cumulative-history column per (user_id, lexeme_id) group."""
    idx, success_prob = params
    if idx == 0:
        # Number of successful recalls *before* each attempt.
        return _df_indexed.groupby(level=[0, 1]).p_recall.transform(
            lambda x: np.cumsum([0] + [1 if r >= success_prob else 0 for r in x])[:-1])
    elif idx == 1:
        # Number of failed recalls *before* each attempt.
        return _df_indexed.groupby(level=[0, 1]).p_recall.transform(
            lambda x: np.cumsum([0] + [1 if r < success_prob else 0 for r in x])[:-1])
    elif idx == 2:
        # Total number of attempts *before* each attempt.
        return _df_indexed.groupby(level=[0, 1]).p_recall.transform(
            lambda x: np.arange(len(x)))

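# Worked example (illustrative): for a (user_id, lexeme_id) group with
# p_recall = [0.7, 0.4, 0.9] and success_prob = 0.6, the three workers yield
#   idx == 0: n_correct = [0, 1, 1]   (successes before each attempt)
#   idx == 1: n_wrong   = [0, 0, 1]   (failures before each attempt)
#   idx == 2: n_total   = [0, 1, 2]   (attempts before each attempt)
# The prepended 0 and the [:-1] slice shift the cumulative sums so that
# each row counts only strictly earlier attempts.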

def add_user_lexeme_columns(success_prob):
    """Add 'n_correct', 'n_wrong' and 'n_total' columns to the data-frame."""

    if "history_seen" in _df_indexed.columns:
        _df_indexed['n_correct'] = _df_indexed['history_correct']
        _df_indexed['n_total'] = _df_indexed['history_seen']
        _df_indexed['n_wrong'] = _df_indexed['n_total'] - _df_indexed['n_correct']
        return
    print("No meta information about the total number of exercises; "
          "computing the counts from the recall history.")
    with MP.Pool(3) as pool:
        n_correct, n_wrong, n_total = pool.map(_column_worker,
                                               [(ii, success_prob)
                                                for ii in range(3)])

    _df_indexed['n_correct'] = n_correct
    _df_indexed['n_wrong'] = n_wrong
    _df_indexed['n_total'] = n_total


def convert_csv_to_dict(csv_path,
                        dictionary_output,
                        max_days,
                        success_prob,
                        force=False):
    """Pre-process the CSV file and save it as a dictionary."""

    if os.path.exists(dictionary_output) and not force:
        print('{} already exists and not being forced to over-write it.'.format(dictionary_output))
        return

    start_time = datetime.datetime.now()

    def elapsed():
        return (datetime.datetime.now() - start_time).seconds

    df = pd.read_csv(csv_path)

    # Hack to avoid passing df_indexed as an argument to the worker function.
    if 'n_correct' not in df.columns:
        print('Calculating n_wrong, n_correct and n_total')
        global _df_indexed
        # Sort by timestamp first; mergesort is used for the index sort
        # because it is the only stable sort, preserving the timestamp
        # order within each (user_id, lexeme_id) group.
        _df_indexed = df.set_index(['user_id', 'lexeme_id']).sort_values('timestamp').sort_index(kind='mergesort')

        add_user_lexeme_columns(success_prob)

        df = _df_indexed.reset_index().sort_values('timestamp')

    # Drop all intervals larger than max_days days.
    df = df[df.delta < TIME_SCALE * max_days]

    # results = dill.load(open(results_path, 'rb'))

    # map_lexeme = results['map_lexeme']
    # alpha = results['alpha']
    # beta = results['beta']
    # lexeme_difficulty = results['lexeme_difficulty']

    # n_0 = [lexeme_difficulty[map_lexeme[x]] for x in df.lexeme_id]
    # df['n_0'] = np.abs(n_0)
    # df['n_t'] = df['n_0'] * (alpha[0] ** df['n_correct']) * (beta[0] ** df['n_wrong'])
    # df['m_t'] = np.exp(-df['n_t'] * df['delta'] / TIME_SCALE)

    op_dict = defaultdict(lambda: defaultdict(lambda: []))

    for ii in range(df.shape[0]):
        row = df.iloc[ii]
        u_id, l_id = row.user_id, row.lexeme_id
        delta = row.delta / TIME_SCALE

        op_dict[u_id][l_id].append({
            'delta_scaled': delta,
            'n_wrong': row.n_wrong,
            'n_correct': row.n_correct,
            'p_recall': row.p_recall,
            # 'n_0': row.n_0,
            'timestamp': row.timestamp,
            # 'm_t': row.m_t,
            # 'n_t': row.n_t,
            'user_id': u_id,
            'lexeme_id': l_id
        })

        if ii % 100000 == 0:
            print('Done {:0.2f}%\tElapsed = {} sec'.format(100. * ii / df.shape[0], elapsed()))

    print('Writing {} ...'.format(dictionary_output))
    dill.dump(op_dict, open(dictionary_output, 'wb'))
    print('Done.')


@click.command()
@click.argument('csv_file')
@click.argument('output_dill')
@click.option('--success_prob', 'success_prob', default=0.6, type=float, help='At what recall probability is the trial considered successful.')
@click.option('--max_days', 'max_days', default=30, help='Maximum number of days before a revision.')
@click.option('--force/--no-force', 'force', default=False, help='Force overwrite of existing files.')
def run(csv_file, output_dill, success_prob, max_days, force):
    """Convert CSV_FILE from the Duolingo format to a dictionary and save it
    in OUTPUT_DILL."""
    convert_csv_to_dict(csv_path=csv_file, dictionary_output=output_dill,
                        max_days=max_days, success_prob=success_prob,
                        force=force)


if __name__ == '__main__':
    run()
79 changes: 79 additions & 0 deletions plot_utils.py
@@ -0,0 +1,79 @@
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

SPINE_COLOR = 'grey'

def latexify(fig_width=None, fig_height=None, columns=1, largeFonts=False, font_scale=1):
    """Set up matplotlib's RC params for LaTeX plotting.
    Call this before plotting a figure.

    Parameters
    ----------
    fig_width : float, optional, inches
    fig_height : float, optional, inches
    columns : {1, 2}
    """

    # code adapted from http://www.scipy.org/Cookbook/Matplotlib/LaTeX_Examples

    # Width and max height in inches for IEEE journals taken from
    # computer.org/cms/Computer.org/Journal%20templates/transactions_art_guide.pdf

    assert columns in [1, 2]

    if fig_width is None:
        fig_width = 3.39 if columns == 1 else 6.9  # width in inches

    if fig_height is None:
        golden_mean = (np.sqrt(5) - 1.0) / 2.0  # aesthetic ratio
        fig_height = fig_width * golden_mean    # height in inches

    MAX_HEIGHT_INCHES = 8.0
    if fig_height > MAX_HEIGHT_INCHES:
        print("WARNING: fig_height too large: {:.2f} inches; "
              "reducing to {:.2f} inches.".format(fig_height, MAX_HEIGHT_INCHES))
        fig_height = MAX_HEIGHT_INCHES

    params = {'backend': 'ps',
              'text.latex.preamble': ['\\usepackage{gensymb}'],
              'axes.labelsize': font_scale * 10 if largeFonts else font_scale * 7,  # fontsize for x and y labels
              'axes.titlesize': font_scale * 10 if largeFonts else font_scale * 7,
              'font.size': font_scale * 10 if largeFonts else font_scale * 7,
              'legend.fontsize': font_scale * 10 if largeFonts else font_scale * 7,
              'xtick.labelsize': font_scale * 10 if largeFonts else font_scale * 7,
              'ytick.labelsize': font_scale * 10 if largeFonts else font_scale * 7,
              'text.usetex': True,
              'figure.figsize': [fig_width, fig_height],
              'font.family': 'serif',
              'xtick.minor.size': 0.5,
              'xtick.major.pad': 1.5,
              'xtick.major.size': 1,
              'ytick.minor.size': 0.5,
              'ytick.major.pad': 1.5,
              'ytick.major.size': 1,
              'lines.linewidth': 0.9,
              'lines.markersize': 0.1,
              'hatch.linewidth': 0.5
              }

    matplotlib.rcParams.update(params)
    plt.rcParams.update(params)


def format_axes(ax):
    """Apply the paper's axis styling: hide the top/right spines and grey the rest."""
    for spine in ['top', 'right']:
        ax.spines[spine].set_visible(False)

    for spine in ['left', 'bottom']:
        ax.spines[spine].set_color(SPINE_COLOR)
        ax.spines[spine].set_linewidth(0.5)

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    for axis in [ax.xaxis, ax.yaxis]:
        axis.set_tick_params(direction='out', color=SPINE_COLOR)

    return ax

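A typical usage pattern for these helpers is sketched below (illustrative; `latexify` enables `text.usetex`, so a working LaTeX installation is assumed):

    import matplotlib.pyplot as plt
    from plot_utils import latexify, format_axes

    latexify(columns=1)        # set rcParams *before* creating the figure
    fig, ax = plt.subplots()
    ax.plot([1, 2, 3], [1, 4, 9], label='$x^2$')
    ax.legend()
    format_axes(ax)            # hide top/right spines, grey the rest
    fig.savefig('figure.pdf', bbox_inches='tight')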