# Part-1: Data Processing

Данные: https://www.dropbox.com/s/s4qj0fpsn378m2i/chgk.zip 

Прочитайте и проанализируйте данные, выберите турниры, в которых есть данные о составах команд и повопросных результатах (поле mask в results.pkl). Для унификации предлагаю:
* взять в тренировочный набор турниры с dateStart из 2019 года; 
* в тестовый — турниры с dateStart из 2020 года.

In [1]:
import pickle
import pandas as pd

In [2]:
# source: https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file/32216025

def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` relative to the working directory.

    Args:
        obj: Any picklable object.
        name: File name without the ``.pkl`` extension.

    Raises:
        pickle.PicklingError: If ``obj`` cannot be pickled.
        OSError: If the directory or file cannot be created or written.
    """
    # Local import: the notebook's import cell only brings in pickle and pandas.
    import os

    # Create the output directory on first use; without it open() fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs('obj', exist_ok=True)
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in ``obj/<name>.pkl``.

    Args:
        name: File name without the ``.pkl`` extension.

    Returns:
        Whatever object was pickled into that file.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that this notebook produced itself.
    pickle_path = ''.join(('obj/', name, '.pkl'))
    with open(pickle_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

### Достаем ids турниров (2019 -- train, 2020 -- test)

In [3]:
# Load the tournament metadata and transpose it so that the top-level pickle
# keys become rows (presumably one row per tournament -- confirm against data).
df_tournaments = pd.DataFrame(pd.read_pickle('chgk/tournaments.pkl')).transpose()
# Keep tournaments whose dateStart compares >= '2019-01-01'.
df_tournaments = df_tournaments[df_tournaments.dateStart >= '2019-01-01']

# df_tournaments is already restricted to 2019 onwards, so its ids form the
# complete pool (train + test).
tournaments_ids_all = set(df_tournaments['id'])
save_obj(tournaments_ids_all, 'tournaments_ids_all')

# Test split: everything starting on or after 2020-01-01.
# NOTE(review): this filter has no upper bound, so any tournament dated after
# 2020 would also land in the test split -- confirm that is intended.
tournaments_ids_test = set(
    df_tournaments.loc[df_tournaments.dateStart >= '2020-01-01', 'id']
)
save_obj(tournaments_ids_test, 'tournaments_ids_test')

# Train split: whatever is left of the pool, i.e. tournaments before 2020-01-01.
tournaments_ids_train = tournaments_ids_all - tournaments_ids_test
save_obj(tournaments_ids_train, 'tournaments_ids_train')

len(tournaments_ids_all), len(tournaments_ids_train), len(tournaments_ids_test)

(1109, 687, 422)

### Среди всех турниров оставляем только турниры:
* нужных лет (2019–2020);
* с mask для всех команд (повопросные результаты);
* с teamMembers для всех команд (данные о составах).

In [4]:
def _team_entry_is_complete(team_data):
    """Return True if 'team', 'mask' and 'teamMembers' are all present and not None."""
    return all(
        field in team_data and team_data[field] is not None
        for field in ('team', 'mask', 'teamMembers')
    )


def get_results_df(tournament_ids, results_path='chgk/results.pkl'):
    """Load tournament results and keep only fully usable tournaments.

    A tournament is kept when its key is in ``tournament_ids``, its list of
    team entries is non-empty, and every team entry has non-None 'team',
    'mask' and 'teamMembers' fields.

    Args:
        tournament_ids: Container of tournament keys to keep (a set gives
            O(1) membership tests).
        results_path: Path to the pickled results mapping. Defaults to the
            location used by this notebook.

    Returns:
        A dict mapping tournament key -> list of team entries. Despite the
        function name, this is a plain dict, not a DataFrame.
    """
    # NOTE(review): read_pickle unpickles the file; only use trusted data.
    results_by_tournament = pd.read_pickle(results_path)
    print("full dataframe length = ", len(results_by_tournament))
    # all() stops at the first incomplete team entry instead of scanning the
    # rest of the tournament.
    results_all = {
        tournament_id: teams
        for tournament_id, teams in results_by_tournament.items()
        if tournament_id in tournament_ids
        and len(teams) > 0
        and all(_team_entry_is_complete(team_data) for team_data in teams)
    }
    print("cleared dataframe length = ", len(results_all))
    return results_all

# Keep only the test-split tournaments that pass the validity checks.
# Despite the df_ prefix, get_results_df returns a plain dict
# (tournament key -> list of team entries), and it is saved in that raw form.
df_test = get_results_df(tournaments_ids_test)
save_obj(df_test, 'test')

full dataframe length =  5528
cleared dataframe length =  169


### Преобразуем в датафрейм ('tournament_id', 'team_id', 'player_id', 'mask')

In [5]:
def unwrap_player(df):
    """Flatten filtered tournament results into one row per player.

    Args:
        df: Mapping of tournament key -> list of team entries, as returned by
            ``get_results_df``. Each entry needs 'team' (with an 'id'),
            'mask' and 'teamMembers' (each member with ``['player']['id']``).

    Returns:
        A DataFrame with columns 'tournament_id', 'team_id', 'player_id' and
        'mask'. Every player of a team gets a copy of the team's mask string.
        Empty input gives an empty DataFrame with the same four columns.
    """
    columns = ['tournament_id', 'team_id', 'player_id', 'mask']
    # 'X' and '?' are mapped to '0' (presumably voided or unknown answers --
    # confirm against the data description).
    to_zero = str.maketrans('X?', '00')
    rows = []
    for tournament_id, teams in df.items():
        for team_data in teams:
            team_id = team_data['team']['id']
            mask = str(team_data['mask']).translate(to_zero)
            rows.extend(
                [tournament_id, team_id, member['player']['id'], mask]
                for member in team_data['teamMembers']
            )
    # Passing columns to the constructor keeps the schema for empty input;
    # assigning .columns afterwards raises ValueError on a 0-column frame.
    return pd.DataFrame(rows, columns=columns)

# Filter the train-split tournaments (a dict at this point), then flatten them
# into a per-player DataFrame; the name df_train is reused for both stages.
df_train = get_results_df(tournaments_ids_train)
df_train = unwrap_player(df_train)

full dataframe length =  5528
cleared dataframe length =  671


### Преобразуем в датафрейм ('tournament_id', 'team_id', 'player_id', 'question_local_id', 'target')
#### Замечание: для этого разворачиваем mask -> (question_local_id, target)

In [None]:
def unwrap_mask(df):
    """Expand each player's mask string into one row per question.

    Args:
        df: DataFrame with columns 'tournament_id', 'team_id', 'player_id'
            and 'mask', where 'mask' is a string with one character per
            question.

    Returns:
        A DataFrame with columns 'tournament_id', 'team_id', 'player_id',
        'question_local_id' and 'target'. 'question_local_id' is the
        0-based position inside the mask and 'target' is the single
        character at that position (kept as a string, not cast to int).
        Input without rows gives an empty DataFrame with the same columns.
    """
    columns = ['tournament_id', 'team_id', 'player_id', 'question_local_id', 'target']
    rows = []
    # Zipping the columns avoids building a Series per row as iterrows() does.
    for tournament_id, team_id, player_id, mask in zip(
        df['tournament_id'], df['team_id'], df['player_id'], df['mask']
    ):
        rows.extend(
            [tournament_id, team_id, player_id, question_idx, answer]
            for question_idx, answer in enumerate(mask)
        )
    # Passing columns to the constructor keeps the schema for empty input;
    # assigning .columns afterwards raises ValueError on a 0-column frame.
    return pd.DataFrame(rows, columns=columns)

# Expand the per-player masks into one row per (player, question) and save
# the resulting train DataFrame as obj/train.pkl.
df_train = unwrap_mask(df_train)
save_obj(df_train, 'train')