In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Raise pandas' display limits so frames of up to 100 rows / 100 columns
# are rendered without truncation.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 100)

In [3]:
# Make the project-local packages importable, then (re)load them.
import sys, os
from pathlib import Path

# Absolute path of the current working directory, as a string.
current_dir = os.path.join(Path().resolve())

# Expose the parent directory and its input/ folder on the import path.
for relative_dir in ('/../', '/../input/'):
    sys.path.append(str(current_dir) + relative_dir)

from codes import utils, loader

# Reload so edits made to the modules on disk are picked up when this cell
# is re-run without restarting the kernel.
import importlib
for m in (utils, loader):
    importlib.reload(m)

In [4]:
# Directory holding the competition CSV files (relative to the notebook).
PATH = '../input'

# Read the four input tables in a fixed order and bind each to its own name.
train_df, test_df, labels_df, submission_df = (
    pd.read_csv(f'{PATH}/{stem}.csv')
    for stem in ('train', 'test', 'train_labels', 'sample_submission')
)

In [11]:
# Question number of each label row: take the last '_'-separated token of
# session_id, drop its first character and parse the remainder as an int.
labels_df['q'] = [
    int(session_id.split('_')[-1][1:])
    for session_id in labels_df['session_id']
]

# Mean of `correct` per question, as a plain {question: mean} dict.
question_means = labels_df.groupby('q')['correct'].mean().to_dict()
question_means

{1: 0.7240003395874013,
 2: 0.9787757874182867,
 3: 0.9321674165888445,
 4: 0.7993038458273198,
 5: 0.5463961287036251,
 6: 0.7720519568724,
 7: 0.7292639443076662,
 8: 0.6143136089651074,
 9: 0.7354614143815265,
 10: 0.5003820358264708,
 11: 0.6441973002801596,
 12: 0.857458188301214,
 13: 0.27048136514135324,
 14: 0.7100772561337975,
 15: 0.482978181509466,
 16: 0.7378385261906784,
 17: 0.6852873758383564,
 18: 0.9505900331097716}

In [12]:
from sklearn.metrics import f1_score

# Give every label row the mean correctness of its question; this column is
# the score that gets thresholded into a 0/1 prediction.
labels_df['m'] = labels_df['q'].map(question_means)

In [13]:
# Sweep cut-offs from 0.40 to 0.80 in steps of 0.01 and keep the one whose
# thresholded question means give the highest macro F1 against the labels.
# NOTE(review): np.arange with a float step accumulates rounding error;
# NumPy's docs suggest np.linspace for such grids — consider switching.
scores, thresholds = [], []
best_score, best_threshold = 0, 0

# Ground truth does not change between iterations, so fetch it once.
y_true = labels_df['correct'].values

for threshold in np.arange(0.4,0.81,0.01):
    print(f'{threshold:.02f}, ',end='')
    # Predict 1 wherever the question's mean exceeds the candidate cut-off.
    labels_df['p'] = (labels_df['m'] > threshold).astype('int')
    macro_f1 = f1_score(y_true, labels_df['p'].values, average='macro')
    scores.append(macro_f1)
    thresholds.append(threshold)
    # Strict '>' keeps the earliest threshold when several tie for the best score.
    if macro_f1 > best_score:
        best_score, best_threshold = macro_f1, threshold

0.40, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.80, 

In [14]:
# Create the test-serving environment and obtain the iterator that yields the
# test batches consumed by the inference loop.
# NOTE(review): presumably this is the competition's time-series API, which is
# only available inside the competition runtime and may allow make_env() to be
# called only once per kernel session — confirm before re-running this cell.
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [None]:
# The 0/1 answer depends only on the question number, so decide it once per
# question up front instead of re-comparing against the threshold for every row.
question_preds = {q: int(mean > best_threshold) for q, mean in question_means.items()}

counter = 0
# The API will deliver two dataframes in this specific order,
# for every session+level grouping (one group per session for each checkpoint)
for (sample_submission, test) in iter_test:
    if counter==0:
        # Show the first batch only, as a sanity check of what the API delivers.
        display(sample_submission.head())
        display(test.head())
        print(test.shape)

    ## users make predictions here using the test data
    # Question number = last '_'-separated token of session_id with its first
    # character dropped. The whole column is filled in a single assignment
    # (no iterrows / per-cell .loc writes). A question that was not seen in
    # the training labels still raises KeyError, as before.
    sample_submission['correct'] = [
        question_preds[int(session_id.split('_')[-1][1:])]
        for session_id in sample_submission['session_id']
    ]

    ## env.predict appends the session+level sample_submission to the overall
    ## submission
    env.predict(sample_submission)
    counter += 1