In [7]:
import pandas as pd
import pickle
import os

In [8]:
def transform(ds: str, suffix:str='', sep:str=';', ext:str='csv', drop:list=None, cat_names:list=None, to_disk:bool=True, target:str='', d_basepath='data'):
    """Load a delimited dataset, integer-encode its categorical columns and optionally save the result.

    The source file is read from ``{d_basepath}/{ds}/{ds}{suffix}.{ext}``. Columns
    listed in ``drop`` are removed, then ``cat_to_num`` replaces every
    object-typed column (plus any column named in ``cat_names``) with integer codes.

    Parameters
    ----------
    ds : str
        Dataset name; used both as the sub-directory and as the file stem.
    suffix : str
        Text appended to the file stem of the *source* file only.
    sep : str
        Field separator used when reading the source file.
    ext : str
        Extension of the source file (without the dot).
    drop : list, optional
        Columns to drop before encoding. ``None`` means drop nothing.
    cat_names : list, optional
        Extra columns to encode, forwarded to ``cat_to_num``.
    to_disk : bool
        When true, write the encoded features to ``{d_basepath}/{ds}/{ds}.csv``
        and, if a target is given, the labels to ``{d_basepath}/{ds}/{ds}_labels.csv``.
        Both files are written with ``;`` as separator and without the index.
    target : str
        Name of the label column. An empty string means "no label column".
    d_basepath : str
        Root directory that contains one sub-directory per dataset.

    Returns
    -------
    tuple
        ``(df_num, y, subs)`` when ``target`` is given, otherwise ``(df_num, subs)``.
        ``df_num`` is the encoded frame (without the target column), ``y`` the
        encoded target column and ``subs`` the lookup returned by ``cat_to_num``.
    """
    if drop is None: drop = []
    if cat_names is None: cat_names = []
    # An empty (or otherwise falsy) target means there is no label column to split off.
    has_target = bool(target)

    base_fname = f'{d_basepath}/{ds}/{ds}'
    source_fname = f'{base_fname}{suffix}.{ext}'
    print(f'Basepath: {base_fname}')
    print(f'Source file: {source_fname}')

    df = pd.read_csv(source_fname, sep=sep)
    df = df.drop(drop, axis=1)
    df_num, subs = cat_to_num(df, cat_names=cat_names)

    # Use a context manager so the file is flushed and closed even if pickling fails.
    # NOTE(review): this path is hard-coded to '../data' and ignores d_basepath, and the
    # file is written even when to_disk is False. Left unchanged because downstream
    # readers may depend on this location -- confirm and align with d_basepath.
    with open(f'../data/{ds}/subs.pkl', 'wb') as subs_file:
        pickle.dump(subs, subs_file)

    if has_target:
        y = df_num[target]
        df_num = df_num.drop([target], axis=1)
        print(f'y_dim: {len(y.unique())}')

    if to_disk:
        if has_target:
            target_fname_y = f'{base_fname}_labels.csv'
            print(f'Target file label: {target_fname_y}')
            y.to_csv(target_fname_y, sep=';', index=False)
        target_fname = f'{base_fname}.csv'
        print(f'Target file: {target_fname}')
        # With suffix='' and ext='csv' the output path equals the source path, so the
        # raw data is replaced by its encoded version. Make that visible to the user.
        if os.path.abspath(target_fname) == os.path.abspath(source_fname):
            print(f'WARNING: overwriting source file {source_fname}')
        df_num.to_csv(target_fname, sep=';', index=False)

    if has_target:
        return df_num, y, subs
    return df_num, subs

In [9]:
def cat_to_num(df, sep=',', cat_names=None):
    """Replace categorical columns of ``df`` with integer codes.

    Every object-typed column, plus every column named in ``cat_names``, is
    encoded with ``pd.factorize``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    sep : str
        Unused; kept so existing callers that pass it keep working.
    cat_names : iterable of str or str, optional
        Additional (e.g. numeric) columns to treat as categorical.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of ``df`` with the selected columns replaced by their codes, and a
        dict mapping each encoded column name to ``{'y': codes, 'label': uniques}``
        as returned by ``pd.factorize``. With pandas' default settings missing
        values receive the code ``-1``.

    Raises
    ------
    KeyError
        If ``cat_names`` contains a name that is not a column of ``df``.
    """
    if cat_names is None:
        cat_names = []
    elif isinstance(cat_names, str):
        # A bare string would otherwise be split into single characters.
        cat_names = [cat_names]

    object_cols = df.select_dtypes(include=['object']).columns.tolist()
    # De-duplicate to prevent double factorization. dict.fromkeys keeps first-seen
    # order, so the key order of `subs` is the same on every run (a set is not).
    columns = list(dict.fromkeys(object_cols + list(cat_names)))

    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f'cat_names not found in DataFrame columns: {missing}')

    subs = {}
    df_num = df.copy()
    for col in columns:
        codes, uniques = pd.factorize(df[col])
        subs[col] = {'y': codes, 'label': uniques}
        df_num[col] = codes
    return df_num, subs

In [10]:
# Encode the 'Ticket' dataset (source file suffix '_textual') using 'OpCarrierGroup'
# as the label column and write the results to disk.
# Because a target is given, transform returns three values:
#   a = encoded feature frame, b = encoded label column, c = factorization lookup (subs).
a, b, c = transform('Ticket', suffix='_textual', to_disk=True, target='OpCarrierGroup')
# Preview the first rows of the encoded features.
a.head()

Basepath: data/Ticket/Ticket
Source file: data/Ticket/Ticket_textual.csv
y_dim: 15
Target file label: data/Ticket/Ticket_labels.csv
Target file: data/Ticket/Ticket.csv


Unnamed: 0,ItinID,MktID,MktCoupons,Year,Quarter,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCountry,...,OpCarrier,BulkFare,Passengers,MktFare,MktDistance,MktDistanceGroup,MktMilesFlown,NonStopMiles,ItinGeoType,MktGeoType
0,201513772623,20151377262301,1,2015,1,12892,1289203,32575,0,0,...,0,0.0,12.0,120.0,1242.0,3,1242.0,1242.0,2,2
1,201513772624,20151377262401,1,2015,1,12892,1289203,32575,0,0,...,0,0.0,1.0,121.0,1242.0,3,1242.0,1242.0,2,2
2,201513772625,20151377262501,1,2015,1,12892,1289203,32575,0,0,...,0,0.0,1.0,126.0,1242.0,3,1242.0,1242.0,2,2
3,201513772626,20151377262601,1,2015,1,12892,1289203,32575,0,0,...,0,0.0,25.0,129.0,1242.0,3,1242.0,1242.0,2,2
4,201513772627,20151377262701,1,2015,1,12892,1289203,32575,0,0,...,0,0.0,3.0,131.0,1242.0,3,1242.0,1242.0,2,2
