# Manipulate predictions and truth values

In [1]:
import pandas as pd

d_predictions = pd.read_csv("data/modelversion320-evidencepredictions_dataset733.csv") # Import the predictions.
print(pd.value_counts(d_predictions['target']))
print(pd.value_counts(d_predictions['prediction']))

target
1    453
Name: count, dtype: int64
prediction
1    453
Name: count, dtype: int64


  print(pd.value_counts(d_predictions['target']))
  print(pd.value_counts(d_predictions['prediction']))


In [2]:
from scipy.stats import bernoulli

n = len(d_predictions)
dropout_truths = bernoulli.rvs(0.15, size=n) # We want 15% of 'actual' dropouts.
dropout_predictions = bernoulli.rvs(0.1, size=n) # We want 10% of 'predicted' dropouts.

d_predictions['target'] = dropout_truths
d_predictions['prediction'] = dropout_predictions

In [3]:
# Check that the dataset looks good now.
print(pd.value_counts(d_predictions['target']))
print(pd.value_counts(d_predictions['prediction']))

target
0    379
1     74
Name: count, dtype: int64
prediction
0    406
1     47
Name: count, dtype: int64


  print(pd.value_counts(d_predictions['target']))
  print(pd.value_counts(d_predictions['prediction']))


In [4]:
d_predictions.to_csv('data/predictions.csv', index=False) # Store dataset.

# Fake related data

In [5]:
d_related = d_predictions
d_related['id'] = d_related['sampleid'].str.split('-').str[0]

In [6]:
import numpy as np

mapping = {0: 'en', 1: 'de'}

lang_numeric = bernoulli.rvs(0.50, size=len(d_related)) # We want 30% of people who have chosen 'de' as a language.
lang_textual = np.vectorize(mapping.get)(lang_numeric) # replace numerical values with textual values for readability.

d_related['lang'] = lang_textual # We can do this since in this case, sample ids are unique.

In [7]:
pd.value_counts(d_related['lang']) # Check that the dataset looks good now.

  pd.value_counts(d_related['lang']) # Check that the dataset looks good now.


lang
en    230
de    223
Name: count, dtype: int64

In [8]:
d_related.to_csv('data/related.csv', index=False, columns=['id', 'lang']) # Store dataset.