In [57]:
import numpy as np, pandas as pd
import datetime

In [58]:
# Load the raw train and test tables.

train, test = (
    pd.read_csv(path)
    for path in ("./init_train.csv", "./init_test.csv")
)

In [59]:
# Convert the raw date/time strings into datetime objects.

# Three-letter upper-case month abbreviation -> month number (1..12).
months = dict(zip(
    ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
     'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'),
    range(1, 13),
))

def str2date(text):
    """Parse a 'DDMONYY' string (e.g. '21OCT17') into a datetime.date.

    Fields are read by position: the first two characters are the day,
    characters 2-4 are the month abbreviation looked up in ``months``, and
    the last two characters are the year, which is offset by 2000.

    Raises KeyError for an unknown month abbreviation and ValueError for
    non-numeric day/year fields or an impossible calendar date.
    """
    year = 2000 + int(text[-2:])
    month = months[text[2:5]]
    day = int(text[:2])
    return datetime.date(year, month, day)

def str2time(text):
    """Parse an 'HH:MM:SS' string into a datetime.time.

    Fields are read by fixed position: characters 0-1 are the hour,
    characters 3-4 the minute, and everything from index 6 onward the
    second. The characters at positions 2 and 5 are skipped, not checked.

    Raises ValueError for non-numeric fields or out-of-range values.
    """
    hour, minute, second = text[:2], text[3:5], text[6:]
    return datetime.time(int(hour), int(minute), int(second))

# Split 'TRDATETIME' into separate 'date' and 'time' columns: the first seven
# characters are handed to str2date, everything from index 8 on to str2time
# (the character at index 7 is skipped). The raw stamp and the 'PERIOD'
# column are then removed from both frames in place.
for df in (train, test):
    df['date'] = df['TRDATETIME'].map(lambda stamp: str2date(stamp[:7]))
    df['time'] = df['TRDATETIME'].map(lambda stamp: str2time(stamp[8:]))
    df.drop(columns=['TRDATETIME', 'PERIOD'], inplace=True)

# Every distinct date seen in either frame (order is arbitrary).
dates = list(set(train['date'].unique()) | set(test['date'].unique()))

In [9]:
# Save the class labels as a NumPy array: one value per client, in order of
# first appearance in train (sort=False). The per-client mean collapses all of
# a client's rows into one number -- presumably 'target_flag' is constant
# within a client; verify against the data.
labels = (
    train
    .groupby('cl_id', sort=False)['target_flag']
    .mean()
    .values
)
np.save('./labels.npy', labels)

In [60]:
# Every currency code that occurs in either frame.
currencies = set(train['currency'].unique()) | set(test['currency'].unique())

In [61]:
# convert numeric currency codes to their three-letter form
import iso4217parse

# Numeric code -> three-letter code. The first field of the first match
# returned by iso4217parse.parse is used. Code 810 is left out of the lookup
# and mapped to 'RUB' by hand (presumably the library does not resolve it --
# TODO confirm).
code2curr = {
    code: iso4217parse.parse(int(code))[0][0]
    for code in currencies
    if code != 810
}
code2curr[810] = 'RUB'

# Replace the numeric codes in both frames; a code missing from the table
# raises KeyError instead of silently turning into NaN.
for df in (train, test):
    df['currency'] = df['currency'].map(code2curr.__getitem__)

In [62]:
# convert every transaction amount to rubles
from currency_converter import CurrencyConverter

# Rate converter with both fallback options switched on, plus the set of
# currencies it knows about.
conv = CurrencyConverter(
    fallback_on_wrong_date=True,
    fallback_on_missing_rate=True,
)
converter_currencies = conv.currencies

# Rates for currencies the converter does not cover, looked up by hand with a
# web search at the time of writing (the central-bank and RBC rate sources
# were tried but did not work well).
# get_rubbles divides an amount by these values, so each one is used as
# "units of that currency per one ruble".
missing_currencies = dict(
    NAD=0.20033263,
    TWD=0.47458577,
    NPR=1.69867921,
    AED=0.05848635,
    LKR=2.50820435,
    TND=0.03879050,
    KES=1.59947001,
    ARS=0.32722322,
    UGX=59.24062151,
    COP=44.73694845,
    BSD=0.01592499,
    RSD=1.56611873,
    BHD=0.00599454,
    SCR=0.21460415,
    CRC=9.01095250,
    VND=362.20724787,
    MOP=0.12900691,
    MVR=0.24655653,
    BAM=0.02595629,
    GEL=0.03914975,
    DOP=0.78806470,
    PEN=0.05181999,
    MUR=0.54451015,
    MNT=38.29211235,
    MAD=0.14872383,
    QAR=0.05798403,
)

def get_rubbles(amount, currency, day):
    """Express ``amount`` of ``currency`` in rubles.

    Resolution order:
      1. 'RUB' amounts are returned unchanged.
      2. Currencies listed in ``converter_currencies`` are converted by the
         module-level ``conv`` converter using the rate for ``day``.
      3. Currencies listed in ``missing_currencies`` are divided by their
         fixed, hand-collected rate.
      4. Any other currency is returned unchanged, i.e. the amount is
         treated as if it already were rubles (best-effort fallback).

    Parameters:
        amount: transaction amount in ``currency``.
        currency: three-letter currency code.
        day: date passed to the converter as the rate date.

    Returns:
        The amount in rubles (or the untouched amount for case 4).
    """
    if currency == 'RUB':
        return amount

    if currency in converter_currencies:
        return conv.convert(amount, currency, 'RUB', date=day)

    # Test membership on the dict itself: this function runs once per row,
    # so building a fresh key list on every call is wasted work.
    if currency in missing_currencies:
        return amount / missing_currencies[currency]

    # NOTE(review): unknown currencies fall through silently and keep their
    # foreign-currency value; kept as-is to preserve existing results.
    return amount
    
# Rewrite 'amount' in both frames as its ruble equivalent, row by row, using
# each row's own currency and transaction date.
for df in (train, test):
    df['amount'] = df[['amount', 'currency', 'date']].apply(
        lambda row: get_rubbles(row['amount'], row['currency'], row['date']),
        axis=1,
    )

In [63]:
# Discount every cash flow back to the very first day in the data.
# delta is the continuously compounded rate equivalent to 9% per year
# (exp(delta) == 1.09). The reference day is the earliest date in train only;
# test dates are not consulted.
delta = np.log(1 + 0.09)
first_day = train['date'].values.min()

# A financial year is taken to be 360 days.
def discount(amount, day):
    """Discount ``amount`` dated ``day`` back to the module-level ``first_day``.

    The factor is exp(delta * (first_day - day) / 360) with the day difference
    in whole days, so an amount dated on ``first_day`` is returned unchanged
    and later amounts are scaled down (for positive ``delta``).
    """
    elapsed_days = (first_day - day).days
    factor = np.exp(delta * elapsed_days / 360)
    return amount * factor

# Replace 'amount' in both frames with its value discounted from the row's
# own date.
for df in (train, test):
    df['amount'] = df[['amount', 'date']].apply(
        lambda row: discount(row['amount'], row['date']),
        axis=1,
    )

In [64]:
# Fill NaN in the 'channel_type' column with a dedicated "zero" type.
# Only that column is touched: a frame-wide fillna would also write the
# string 'type0' into any other column that happens to contain NaN.
for df in [train, test]:
    df['channel_type'] = df['channel_type'].fillna('type0')

In [65]:
# Write the preprocessed frames to disk without the index column.
for df, path in ((train, "./train.csv"), (test, "./test.csv")):
    df.to_csv(path, index=False)