In [None]:
import autosklearn.regression
from joblib import dump
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr, spearmanr
import sklearn
from skmisc.loess import loess
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
import sys
sys.path.insert(0,'..')
from ml import *
import pandas as pd

### Preprocess data into the right format for cider

In [None]:
# 2018 features -> cider input format.
# Every column whose name contains 'reporting' is discarded except
# 'reporting__number_of_records'; the '_c0' and 'canton' columns are dropped;
# 'phone_number' is renamed to 'name'. The result is written without the index.
feats = (
    pd.read_csv('/data/togo_anon/feats/survey_combos/survey2018_cdr2018.csv')
    .pipe(lambda frame: frame[[
        col for col in frame.columns
        if col == 'reporting__number_of_records' or 'reporting' not in col
    ]])
    .drop(columns=['_c0', 'canton'])
    .rename(columns={'phone_number': 'name'})
)
feats.to_csv(
    '/data/togo_anon/feats/survey_combos/survey2018_cdr2018_cider.csv',
    index=False,
)

In [None]:
# 2018 labels -> cider input format.
# Only 'phone_number', 'weight' and 'cons' are kept; 'phone_number' becomes
# 'name' and 'cons' becomes 'label'. The result is written without the index.
labels = (
    pd.read_csv('/data/togo_anon/paper/datasets/survey2018.csv')
    .loc[:, ['phone_number', 'weight', 'cons']]
    .rename(columns={'phone_number': 'name', 'cons': 'label'})
)
labels.to_csv(
    '/data/togo_anon/surveys/survey2018/survey2018_labels_cider.csv',
    index=False,
)

In [None]:
# 2020 features -> cider input format.
# Every column whose name contains 'reporting' is discarded except
# 'reporting__number_of_records', and 'phone_number' is renamed to 'name'.
# Note this rebinds `feats`, replacing the 2018 frame built earlier.
feats = (
    pd.read_csv('/data/togo_anon/feats/survey_combos/surveysep2020_cdr2020.csv')
    .pipe(lambda frame: frame[[
        col for col in frame.columns
        if col == 'reporting__number_of_records' or 'reporting' not in col
    ]])
    .rename(columns={'phone_number': 'name'})
)
feats.to_csv(
    '/data/togo_anon/feats/survey_combos/survey2020_cdr2020_cider.csv',
    index=False,
)

In [None]:
# 2020 labels -> cider input format.
# Only 'phone_number', 'weight' and 'pmt' are kept; 'phone_number' becomes
# 'name' and 'pmt' becomes 'label'. Note this rebinds `labels`, replacing the
# 2018 frame built earlier.
labels = (
    pd.read_csv('/data/togo_anon/paper/datasets/survey2020.csv')
    .loc[:, ['phone_number', 'weight', 'pmt']]
    .rename(columns={'phone_number': 'name', 'pmt': 'label'})
)
# NOTE(review): the 2020 labels are written into the survey2018/ directory.
# This looks like a copy-paste slip from the 2018 cell; confirm the intended
# location (and any config that reads this file) before changing the path.
labels.to_csv(
    '/data/togo_anon/surveys/survey2018/survey2020_labels_cider.csv',
    index=False,
)

### Standard ML

In [None]:
# Instantiate the project's Learner (presumably provided by `from ml import *`)
# from a YAML config. The path is relative to the notebook's working directory,
# and although the keyword is named `cfg_dir` it is given a file path here.
# NOTE(review): the config is user-specific (config_emily.yml) — confirm which
# feature/label files it points at.
learner = Learner(cfg_dir='../configs/config_emily.yml')
# presumably joins the feature and label tables named in the config — TODO confirm in ml.Learner
learner.merge()

In [None]:
# Run the Learner's tuned-model routine under the name 'gradientboosting'.
# presumably performs hyperparameter tuning and stores the fitted model/predictions
# for later retrieval — TODO confirm in ml.Learner; may be slow to run.
learner.tuned_model(model_name='gradientboosting')

In [None]:
# Fetch the predictions for the tuned 'gradientboosting' model ("oos" presumably
# means out-of-sample — confirm in ml.Learner). Later cells index the result by
# 'true', 'predicted' and 'weight'.
oos = learner.oos_predictions(model='gradientboosting', tuned=True)

In [None]:
# Report the sample-weighted R^2 between the 'true' and 'predicted' columns of
# `oos`, formatted to two decimals. Expects `oos` to still hold the gradient
# boosting predictions from the preceding cell.
print(
    'r2 score for gradient boosting: %.2f'
    % r2_score(
        y_true=oos['true'],
        y_pred=oos['predicted'],
        sample_weight=oos['weight'],
    )
)

In [None]:
# Draw the Learner's scatter plot for the tuned 'gradientboosting' model.
# presumably plots predicted against true values — TODO confirm in ml.Learner.
learner.scatter_plot(model_name='gradientboosting', tuned=True)

### AutoML

In [None]:
# Run the Learner's AutoML routine under the name 'automl'.
# presumably backed by auto-sklearn (imported at the top of the notebook) and
# likely long-running — TODO confirm in ml.Learner.
learner.automl(model_name='automl')

In [None]:
# Fetch the predictions for the untuned 'automl' model. This rebinds `oos`,
# replacing the gradient boosting predictions loaded earlier, so cells that use
# `oos` depend on which of the two cells ran last.
oos = learner.oos_predictions(model='automl', tuned=False)

In [None]:
# Report the sample-weighted R^2 between the 'true' and 'predicted' columns of
# `oos`, formatted to two decimals. Expects `oos` to still hold the AutoML
# predictions from the preceding cell.
print(
    'R2 score for AutoML %.2f'
    % r2_score(
        y_true=oos['true'],
        y_pred=oos['predicted'],
        sample_weight=oos['weight'],
    )
)

In [None]:
# Draw the Learner's scatter plot for the untuned 'automl' model.
# presumably plots predicted against true values — TODO confirm in ml.Learner.
learner.scatter_plot(model_name='automl', tuned=False)