In [1]:
import requests
import pandas as pd
from sklearn.utils import Bunch
import numpy as np
from scipy.io.arff import loadarff
# from sklearn.pipeline import Pipeline
# import datetime as dt
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [2]:
# def log(f):
#     def wrapper(df, *args, **kwargs):
#         t1 = dt.datetime.now()
#         result = f(df, *args, **kwargs)
#         t2 = dt.datetime.now()
#         print(f"{f.__name__} took {t2 - t1}, shape={len(result)}")
#         return result
#     return wrapper

In [3]:
def encode_data(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False,
                na_dict=None, preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Split a DataFrame into an encoded feature frame and an optional target array.

    When ``subset`` is not given the work is done on ``df.copy()``, so the
    caller's frame is left untouched. The caller's ``skip_flds`` list is never
    modified either.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    y_fld : column label, optional
        Target column. It is removed from the features and returned separately.
        A non-numeric target is replaced by its ``pd.Categorical`` integer codes.
    skip_flds : list of column labels, optional
        Columns dropped from the features.
    ignore_flds : list of column labels, optional
        Columns set aside before any processing and concatenated back,
        unprocessed, to the left of the encoded features.
    do_scale : bool, default False
        When true, ``scale_vars(df, mapper)`` is called and its return value is
        appended to the result.
    na_dict : optional
        Accepted but not used by this function.
    preproc_fn : callable, optional
        Called as ``preproc_fn(df)`` on the working frame; its return value is
        ignored, so it has to modify the frame in place.
    max_n_cat : int, optional
        Forwarded to ``numericalize``.
    subset : optional
        When truthy, the working frame is ``get_sample(df, subset)``.
    mapper : optional
        Forwarded to ``scale_vars`` when ``do_scale`` is true.

    Returns
    -------
    list
        ``[features, y]``, or ``[features, y, mapper]`` when ``do_scale`` is
        true. ``y`` is ``None`` when ``y_fld`` is ``None``.

    Notes
    -----
    ``get_sample`` and ``scale_vars`` are not defined in the visible part of
    this notebook; passing ``subset`` or ``do_scale=True`` requires them to be
    provided elsewhere.
    """
    if not ignore_flds: ignore_flds = []
    if not skip_flds: skip_flds = []
    if subset: df = get_sample(df, subset)
    else: df = df.copy()

    # Keep the ignored columns aside so that none of the steps below touch them.
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)

    if y_fld is None:
        y = None
    else:
        if not is_numeric_dtype(df[y_fld]):
            df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        # Build a new list instead of using `+=`, which would extend the list
        # object owned by the caller.
        skip_flds = list(skip_flds) + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if do_scale: mapper = scale_vars(df, mapper)
    for n, c in df.items(): numericalize(df, c, n, max_n_cat)
    # Any column that is still object/categorical is one-hot encoded, with an
    # extra indicator column for missing values.
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y]
    if do_scale: res = res + [mapper]
    return res

def numericalize(df, col, name, max_n_cat):
    """Replace a non-numeric column of ``df`` by integer category codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the encoded column.
    col : pandas.Series
        Current content of the column.
    name : column label
        Label under which the codes are written into ``df``.
    max_n_cat : int or None
        With ``None`` every non-numeric column is encoded. Otherwise only
        columns with more than ``max_n_cat`` categories are encoded; the others
        are left unchanged.

    Codes are shifted by one, so missing values (code ``-1``) become ``0``.
    Numeric columns are never modified. Nothing is returned.
    """
    if is_numeric_dtype(col):
        return
    # Go through pd.Categorical so that plain object/string columns work as well;
    # `col.cat` only exists on columns that already have a categorical dtype.
    categorical = pd.Categorical(col)
    if max_n_cat is None or len(categorical.categories) > max_n_cat:
        df[name] = categorical.codes + 1

def process_categoricals(df):
    """Convert every string-typed column of ``df`` to an ordered categorical.

    The frame is modified in place and also returned, so the function can be
    used with ``DataFrame.pipe``. Columns for which ``is_string_dtype`` is
    false are left as they are.
    """
    for name, column in df.items():
        if not is_string_dtype(column):
            continue
        as_category = column.astype('category')
        df[name] = as_category.cat.as_ordered()
    return df

In [4]:
def default_pipeline(df):
    """Run the standard preprocessing on ``df`` and return the encoded frame.

    String columns are first turned into ordered categoricals by
    ``process_categoricals`` (which modifies ``df`` in place); the result is
    then passed through ``encode_data`` with its default arguments. Only the
    feature frame, the first element of ``encode_data``'s result, is returned.
    """
    categorized = df.pipe(process_categoricals)
    encoded = categorized.pipe(encode_data)
    return encoded[0]

In [5]:
def read_crime():
    """Load the crime data set from ``raw_datasets/crime`` as a ``Bunch``.

    Column names come from rows 57-184 of ``communities.names``, after the
    tokens ``@attribute``, ``numeric`` and ``string`` and any surrounding
    whitespace have been removed. ``communities.data`` is read with those
    names and passed through ``default_pipeline``.

    Returns
    -------
    Bunch
        ``data``: all columns except the first five and ``ViolentCrimesPerPop``;
        ``target``: the ``ViolentCrimesPerPop`` column;
        ``feature_names``: the parsed names without the first five and the last one.
    """
    names_table = pd.read_csv(f"raw_datasets/crime/communities.names", sep="\t",
                              header=None, index_col=None)
    attribute_rows = names_table[57:185]
    # Strip the declaration keywords one after another, in the same order as before.
    for token in ('@attribute', 'numeric', 'string'):
        attribute_rows = attribute_rows.replace(token, '', regex=True)
    attribute_rows = attribute_rows.reset_index(drop=True)

    # Trim leading/trailing whitespace, then keep column 0 as a numpy array.
    attribute_rows = attribute_rows.apply(lambda column: column.str.strip())
    feature_names = np.array(list(attribute_rows[0]))

    raw = pd.read_csv('raw_datasets/crime/communities.data', sep=',',
                      header=None, names=feature_names)
    encoded = default_pipeline(raw)

    # The first five columns are debug columns and are not used as features.
    debug_columns = encoded.columns[:5]
    encoded = encoded.drop(columns=debug_columns)

    target_column = 'ViolentCrimesPerPop'
    return Bunch(
        data=encoded.drop(target_column, axis=1).to_numpy(),
        target=np.array(encoded[target_column]),
        feature_names=feature_names[5:-1],
    )

In [6]:
def read_ames_housing():
    """Load the Ames housing data from ``raw_datasets/ames_housing/ames.txt``.

    The ``Order`` and ``PID`` columns are dropped, ``SalePrice`` becomes the
    target and the remaining columns are encoded with ``default_pipeline``.

    NOTE: a later cell of this notebook defines ``read_ames_housing`` again.
    When the cells are run top to bottom, that later definition replaces
    this one.

    Returns
    -------
    Bunch
        ``data``: encoded features as an array; ``target``: ``SalePrice``;
        ``feature_names``: the feature column labels as read from the file.
    """
    raw = pd.read_csv('raw_datasets/ames_housing/ames.txt', sep='\t')
    raw = raw.drop(['Order', 'PID'], axis=1)

    sale_price = raw.SalePrice
    features = raw.drop('SalePrice', axis=1)
    # Names are captured before the pipeline runs.
    feature_names = np.array(features.columns)

    encoded = default_pipeline(features)

    return Bunch(
        data=np.array(encoded),
        target=np.array(sale_price),
        feature_names=feature_names,
    )

In [7]:
def read_phishing():
    """Load the phishing data set from ``raw_datasets/phishing/phishing.arff``.

    All ARFF fields are cast to ``int`` and passed through
    ``default_pipeline``. ``Result`` is the target; every other column is a
    feature.

    Returns
    -------
    Bunch
        ``data``: feature values shifted up by one;
        ``target``: ``Result`` shifted up by one, with the value 2 mapped to 1;
        ``feature_names``: the ARFF field names without the last one.
    """
    records, _meta = loadarff('raw_datasets/phishing/phishing.arff')
    feature_names = np.array(records.dtype.names)

    encoded = default_pipeline(pd.DataFrame(records).astype('int'))

    data = np.array(encoded.drop('Result', axis=1))
    target = np.array(encoded.Result)

    # xgboost expects labels in the range [0, inf) rather than [-1, inf),
    # so everything is shifted up by one.
    data += 1
    target += 1
    # xgboost also expects the class labels to be consecutive: map 2 -> 1.
    target[target == 2] = 1

    return Bunch(
        data=data,
        target=target,
        feature_names=feature_names[:-1],
    )

In [8]:
def read_mushroom():
    """Load the mushroom data set from ``raw_datasets/mushrooms/mushrooms.csv``.

    The whole table, target included, is encoded with ``default_pipeline``.
    The ``class`` column is then split off as the target.

    Returns
    -------
    Bunch
        ``data``: encoded features as an array;
        ``target``: the encoded ``class`` column minus one;
        ``feature_names``: labels of the feature columns.
    """
    encoded = default_pipeline(pd.read_csv('raw_datasets/mushrooms/mushrooms.csv', sep=','))

    features = encoded.drop('class', axis=1)
    target = np.array(encoded['class'])
    # xgboost expects class labels to start at zero, so shift them down by one.
    target -= 1

    return Bunch(
        data=np.array(features),
        target=target,
        feature_names=np.array(features.columns),
    )

In [9]:
def read_ames_housing():
    """Load the Ames housing data, without the very large houses, as a ``Bunch``.

    This definition reuses the name of an earlier ``read_ames_housing`` in the
    same notebook and replaces it once this cell has been run. Compared with
    the earlier one, it removes rows with ``Gr Liv Area`` of 4000 or more and
    fills the missing values left after encoding with -999.

    Returns
    -------
    Bunch
        ``data``: encoded features as an array; ``target``: ``SalePrice``;
        ``feature_names``: the feature column labels as read from the file.
    """
    raw = pd.read_csv('raw_datasets/ames_housing/ames.txt', sep='\t')
    raw = raw.drop(['Order', 'PID'], axis=1)

    # The data set documentation recommends removing any houses with more than
    # 4000 square feet (which eliminates 5 unusual observations).
    # source: http://jse.amstat.org/v19n3/decock/DataDocumentation.txt
    # section: SPECIAL NOTES
    is_regular_size = raw['Gr Liv Area'] < 4000
    raw = raw[is_regular_size]

    sale_price = raw.SalePrice
    features = raw.drop('SalePrice', axis=1)
    feature_names = np.array(features.columns)

    encoded = default_pipeline(features)
    # TODO: check if this is ok. It currently seems that the pipeline does not
    # handle float columns, so the remaining missing values are filled with
    # -999 (an out-of-distribution value).
    encoded = encoded.fillna(-999)

    return Bunch(
        data=np.array(encoded),
        target=np.array(sale_price),
        feature_names=feature_names,
    )

In [10]:
# # test 
# a = read_crime()
# b = read_ames_housing()
# c = read_mushroom()
# d = read_phishing()