In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from enum import Enum

In [2]:
ProblemType = Enum("ProblemType", "Regression Classification")

In [3]:
def check_problem_type(data, target_name, user_spec_type):
    if user_spec_type == 'regression':
        return ProblemType.Regression
    
    if user_spec_type == 'classification':
        return ProblemType.Classification
    
    if user_spec_type == 'auto':
        if data[target_name].dtype.kind == 'O':
            return ProblemType.Classification
        
        if len(data[target_name].unique()) <= 10:
            return ProblemType.Classification
        
        return ProblemType.Regression

In [4]:
def handle_unknowns_in_test_set(train_X, test_X, categoricals):
    for c in categoricals:
        cat_train = set(train_X[c].unique())
        cat_test = set(test_X[c].unique())
        unknowns = cat_test - cat_train
        print("unknowns before: ", unknowns)
        if unknowns:
            print("other in train: ", 'other' in cat_train)
            fill_val = 'other' if 'other' in cat_train else train_X[c].mode().values[0] 
            test_X.replace(list(unknowns), fill_val, inplace=True)
            
        cat_train = set(train_X[c].unique())
        cat_test = set(test_X[c].unique())
        unknowns = cat_test - cat_train
        print("unknowns after: ", unknowns)

In [5]:
def auto_ml(dataset_name, target_name, test_ratio=0.2, problem_type='auto'):
    # read the data
    
    data = pd.read_csv(dataset_name)
    data
    
    problem_type = check_problem_type(data, target_name, problem_type)
    print('problem_type: ', problem_type.name)
    
    # split into train and test sets
    train_X, test_X = train_test_split(data, test_size=0.2, random_state=42)

    train_Y = train_X[target_name].copy()
    train_X = train_X.drop(target_name, axis=1)

    test_Y = test_X[target_name].copy()
    test_X = test_X.drop(target_name, axis=1)
    
    data.drop(target_name, axis=1, inplace=True)
    
    # seperate numeric and categorical features
    numerics = data.select_dtypes(include=np.number).columns.tolist()
    categoricals = list(set(data.columns) - set(numerics))
    
    print("numericals:  ", numerics)
    print("categorcals: ", categoricals)
        
    # define pipelines for handling missing values
    num_pipeline = Pipeline([
                             ('imputer', SimpleImputer(strategy="median"))
                            ])

    cat_pipeline = Pipeline([
                             ('imputer', SimpleImputer(strategy="constant", fill_value='other')),
                             ('encoder', OrdinalEncoder())
                            ])

    full_pipeline = ColumnTransformer([
                                       ("num", num_pipeline, numerics),
                                       ("cat", cat_pipeline, categoricals)
                                      ])
    
    handle_unknowns_in_test_set(train_X, test_X, categoricals)
    train_X_prepared = full_pipeline.fit_transform(train_X)
    test_X_prepared = full_pipeline.transform(test_X)
    
    

In [5]:
#auto_ml(dataset_name=r"data\\santander_customer_transaction_prediction_target.csv", target_name='target')

In [4]:
#auto_ml(dataset_name=r"data\\restaurant_revenue_prediction_revenue.csv", target_name='revenue')

In [3]:
#auto_ml(dataset_name=r"data\\santander_customer_satisfication_TARGET.csv", target_name='TARGET')

In [2]:
#auto_ml(dataset_name=r"data\\sberbank_russian_housing_market_price_doc.csv", target_name='price_doc')

In [1]:
#auto_ml(dataset_name=r"data\\microsoft_malware_prediction_HasDetections.csv", target_name='HasDetections')

## tests

In [176]:
#def auto_ml(dataset_name, target_name, test_ratio=0.2):
dataset_name = r"data\\restaurant_revenue_prediction_revenue.csv"
target_name = "revenue"
data = pd.read_csv(dataset_name)
# data.drop("ID_code", axis=1, inplace=True)
print(data.columns)
train_X, test_X = train_test_split(data, test_size=0.2, random_state=42)

train_Y = train_X[target_name].copy()
train_X = train_X.drop(target_name, axis=1)

test_Y = test_X[target_name].copy()
test_X = test_X.drop(target_name, axis=1)


Index(['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
       'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25',
       'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35',
       'P36', 'P37', 'revenue'],
      dtype='object')


In [177]:
data.drop(target_name, axis=1, inplace=True)

In [24]:
train_X['ID_code'].dtype.kind == 'O'

In [79]:
len(train_X['var_1'].unique())

96574

In [97]:
data.target.dtype.kind

'i'

In [178]:
numerics = data.select_dtypes(include=np.number).columns.tolist()
categoricals = list(set(data.columns) - set(numerics))

In [179]:
categoricals

['Open Date', 'Type', 'City Group', 'City']

In [85]:
s1 = {1, 2, 3}
s2 = {1, 2}

set()

In [135]:
xz = pd.DataFrame({'a' : [1, 1, 3, 4],
                   'b' : [5, 6, 7 ,8],
                   'c': ['a1', 'a2', 'a2', 'a4']})

In [169]:
xz['c'].mode().values[0]

'a2'

In [44]:
xz['a'].replace([1, 2], np.nan, inplace=True)

In [116]:
xz.median()

a    2.5
b    6.5
c    2.5
dtype: float64

In [137]:
1 if 1 == 1 else 3

1

In [180]:
def handle_unknowns_in_test_set(train_X, test_X, categoricals):
    for c in categoricals:
        cat_train = set(train_X[c].unique())
        cat_test = set(test_X[c].unique())
        unknowns = cat_test - cat_train
        print("unknowns before: ", unknowns)
        if unknowns:
            print("other in train: ", 'other' in cat_train)
            fill_val = 'other' if 'other' in cat_train else train_X[c].mode().values[0] 
            test_X.replace(list(unknowns), fill_val, inplace=True)
            
        cat_train = set(train_X[c].unique())
        cat_test = set(test_X[c].unique())
        unknowns = cat_test - cat_train
        print("unknowns after: ", unknowns)


In [181]:
categoricals

['Open Date', 'Type', 'City Group', 'City']

In [60]:
#auto_ml(r"data\\santander_customer_transaction_prediction.csv", 1)

In [61]:
data.dtypes

var_0      float64
var_1      float64
var_2      float64
var_3      float64
var_4      float64
            ...   
var_195    float64
var_196    float64
var_197    float64
var_198    float64
var_199    float64
Length: 200, dtype: object

In [62]:
#[sum(data[i].isna()) for i in data]

In [53]:
#train_X['ID_code'][0:5] = np.nan

In [54]:
#train_X['ID_code'].iloc[0]

In [55]:
#train_X['ID_code'].isna()

In [182]:
num_pipeline = Pipeline([
                         ('imputer', SimpleImputer(strategy="median"))
                        ])

cat_pipeline = Pipeline([
                         ('imputer', SimpleImputer(strategy="constant", fill_value='other')),
                         ('encoder', OrdinalEncoder())
                        ])

full_pipeline = ColumnTransformer([
                                   ("num", num_pipeline, numerics),
                                   ("cat", cat_pipeline, categoricals)
                                  ])

handle_unknowns_in_test_set(train_X, test_X, categoricals)
train_X_prepared = full_pipeline.fit_transform(train_X)
test_X_prepared = full_pipeline.transform(test_X)

unknowns before:  {'01/22/2007', '01/09/2010', '08/16/2011', '01/07/2000', '09/21/2007', '03/15/2008', '08/25/2007', '12/31/2012', '11/05/2011', '10/09/2009', '07/13/1998', '04/21/2012', '05/09/2008', '03/16/2013', '09/01/2010', '05/11/2009', '05/04/2012', '06/25/2008', '03/01/2011', '10/12/2006', '09/27/2011', '08/30/2011', '10/09/1999', '02/28/2013', '05/09/2009'}
other in train:  False
unknowns after:  set()
unknowns before:  set()
unknowns after:  set()
unknowns before:  set()
unknowns after:  set()
unknowns before:  {'Konya', 'Trabzon', 'Bolu', 'Gaziantep'}
other in train:  False
unknowns after:  set()


In [97]:
np.where(train_X.values == np.nan)

(array([], dtype=int64), array([], dtype=int64))

In [70]:
np.where(train_X_prepared == 'other')

  """Entry point for launching an IPython kernel.


(array([], dtype=int64),)

In [None]:
X = pd.DataFrame(train_X_prapared, columns=train_X.columns, index=train_X.index)

In [159]:
X

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
153248,12.3039,-8.3899,9.1944,8.0649,9.0247,-1.9559,5.1565,21.1631,2.7437,8.5623,...,7.9504,0.9184,5.9945,11.0078,-1.0936,-2.3412,8.1712,12.9046,-1.9309,train_153248
67802,15.4069,2.782,9.2951,7.1997,8.5359,-4.5422,5.421,9.9651,4.0623,8.4689,...,5.6555,2.1527,1.3518,15.4728,0.2686,6.5523,8.4698,22.0454,1.4756,train_67802
148889,9.6427,-4.6261,6.961,5.4054,12.0859,-11.2917,4.529,13.8605,-0.8366,8.4388,...,9.1779,1.5004,1.9895,20.4072,-0.1118,0.5692,9.329,12.898,-9.4318,train_148889
103093,9.6881,-5.6696,11.2709,8.2812,13.9232,-16.1434,4.9664,20.1092,-5.9868,8.4514,...,9.2727,1.1371,3.7435,20.6906,1.3752,7.4442,9.2145,18.2777,-2.5865,train_103093
104681,7.1128,-2.083,11.4807,8.3033,10.618,-6.4743,5.0078,21.0212,-4.9779,9.126,...,8.1992,1.3436,8.8929,21.6711,-2.0557,6.4975,8.311,13.7728,-5.9028,train_104681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,11.4356,2.1235,9.6502,8.3973,12.7467,-21.6719,4.8715,11.5484,5.1568,9.2991,...,5.5809,3.3083,4.6437,18.9889,-1.1895,1.4323,8.9283,14.2385,6.9994,train_119879
103694,9.0806,-0.4575,12.2323,5.4625,9.9592,-15.0923,4.0602,13.8138,1.3653,7.4617,...,8.391,1.2103,5.0491,22.0054,-0.9007,4.7159,8.1368,14.0836,0.9827,train_103694
131932,16.776,1.0075,10.2312,8.424,9.3894,3.9643,5.2315,13.317,-3.1602,8.3381,...,8.9941,2.653,1.5596,17.4303,2.4686,5.8723,7.9628,13.4127,-10.6949,train_131932
146867,9.9775,7.448,12.9018,8.9272,9.7036,-11.1443,3.1534,20.6615,4.4957,8.5453,...,7.0073,0.9529,5.8906,19.7015,-1.3745,-3.8948,8.0624,21.3589,-2.6861,train_146867


In [164]:
np.where(X.var_199.values =='other') 

(array([], dtype=int64),)