In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from supervised.automl import AutoML # mljar-supervised
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw dataset from a CSV file in the current working directory.
train = pd.read_csv('./data.csv')

In [3]:
# Preview the first rows of the loaded frame.
train.head()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


In [4]:
# Count the missing values in each column before any imputation.
train.isna().sum()

id                         0
author                  5024
geometry                5500
pressure [MPa]          4452
mass_flux [kg/m2-s]     4791
x_e_out [-]            10415
D_e [mm]                5488
D_h [mm]                4589
length [mm]             4759
chf_exp [MW/m2]            0
dtype: int64

In [5]:
# Overwrite `geometry` for every row whose author is one of the names below.
# NOTE(review): this assumes each author only ever used a single geometry —
# confirm against the data before relying on it.
author_to_geometry = {
    'Thompson': 'tube',
    'Beus': 'annulus',
    'Peskov': 'tube',
    'Janssen': 'annulus',
    'Weatherhead': 'tube',
    'Inasaka': 'tube',
    'Williams': 'tube',
    'Mortimore': 'annulus',
    'Richenderfer': 'plate',
    'Kossolapov': 'plate',
}
for author_name, geometry_name in author_to_geometry.items():
    train.loc[train['author'] == author_name, 'geometry'] = geometry_name

In [6]:
# Show the rows whose geometry is still missing.
train.loc[train['geometry'].isna()]

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
5,5,,,17.24,3648.0,-0.0711,,1.9,696.0,3.6
72,72,,,13.79,,0.0656,4.7,,318.0,1.9
73,73,,,,3608.0,0.1261,4.7,4.7,318.0,2.7
80,80,,,6.89,2278.0,0.1946,8.5,22.3,2743.0,2.0
121,121,,,,4986.0,-0.0297,3.0,,400.0,5.5
...,...,...,...,...,...,...,...,...,...,...
31456,31456,,,13.79,1356.0,-0.0135,5.6,5.6,2134.0,1.6
31459,31459,,,13.79,2007.0,-0.1568,,7.7,457.0,4.6
31514,31514,,,11.03,4069.0,0.0498,11.5,11.5,1727.0,2.7
31636,31636,,,12.07,,-0.0195,,1.9,152.0,5.4


In [7]:
# Where only one of the two diameters is present, copy it into the other one.
# D_e is completed from D_h first, then D_h from D_e; rows missing both
# diameters are left as NaN by both statements.
train['D_e [mm]'] = train['D_e [mm]'].fillna(train['D_h [mm]'])
train['D_h [mm]'] = train['D_h [mm]'].fillna(train['D_e [mm]'])

In [8]:
def dummy(row):
    """Return the author for `row`, imputing a missing one where possible.

    When the row's author is NaN and both its geometry and pressure are present,
    the most frequent author among the rows of the global `train` frame that share
    the same geometry and pressure (and have an author) is returned. In every
    other case the row's own author value (possibly still NaN) is returned.
    """
    author = row['author']
    if author == author:  # NaN is the only value that is not equal to itself
        return author

    geometry, pressure = row[['geometry', 'pressure [MPa]']]
    if geometry != geometry or pressure != pressure:
        # Without both keys there is no group to look the author up in.
        return author

    same_group = (train['geometry'] == geometry) & (train['pressure [MPa]'] == pressure)
    candidates = train.loc[same_group & train['author'].notna(), 'author']
    if len(candidates) == 0:
        return author
    return candidates.mode()[0]


train['author'] = train.apply(dummy, axis=1)

In [9]:
# Replace missing pressures with the mean of the observed pressures.
pressure_col = 'pressure [MPa]'
pressure_mean = train[pressure_col].mean()
train[pressure_col] = train[pressure_col].fillna(pressure_mean)

In [10]:
# Any author/geometry that is still missing becomes the explicit category 'Unknown'.
categorical_cols = ['author', 'geometry']
train[categorical_cols] = train[categorical_cols].fillna('Unknown')

In [11]:
def dummy(row):
    """Return the mass flux for `row`, imputing a missing one from its group mean.

    If the row's 'mass_flux [kg/m2-s]' is NaN, the mean of the non-missing mass
    fluxes of all rows in the global `train` frame with the same author and
    geometry is returned. If that group has no observed value, or the row already
    has a value, the row's own value is returned unchanged.
    """
    flux = row['mass_flux [kg/m2-s]']
    if flux == flux:  # NaN is the only value that is not equal to itself
        return flux

    author, geometry = row[['author', 'geometry']]
    same_group = (train['author'] == author) & (train['geometry'] == geometry)
    observed = train.loc[same_group & train['mass_flux [kg/m2-s]'].notna(), 'mass_flux [kg/m2-s]']
    return observed.mean() if len(observed) > 0 else flux


train['mass_flux [kg/m2-s]'] = train.apply(dummy, axis=1)

In [12]:
def dummy(row):
    """Return the length for `row`, imputing a missing one from its group mean.

    If the row's 'length [mm]' is NaN, the mean of the non-missing lengths of all
    rows in the global `train` frame with the same author and geometry is
    returned. If that group has no observed length, or the row already has a
    length, the row's own value is returned unchanged.
    """
    if row['length [mm]'] != row['length [mm]']:  # NaN is the only value unequal to itself
        a, g = row[['author', 'geometry']]
        tmp = train[(train['author'] == a) & (train['geometry'] == g) & (~train['length [mm]'].isna())]['length [mm]']
        # Fixed: this used to be `len()`, which raises TypeError as soon as the
        # first row with a missing length is reached.
        if len(tmp) > 0:
            return tmp.mean()
    return row['length [mm]']

train['length [mm]'] = train.apply(dummy, axis=1)

In [13]:
def dummy(row):
    """Return the ('D_e [mm]', 'D_h [mm]') pair for `row`, imputing it if D_e is missing.

    If the row's 'D_e [mm]' is NaN, both diameters are set to the mean of the
    non-missing 'D_e [mm]' values of all rows in the global `train` frame with the
    same author and geometry. If that group has no observed D_e, or the row
    already has a D_e, the row's own two values are returned unchanged.

    A two-element Series indexed by the two column names is returned in every
    branch, so `DataFrame.apply` always expands the result into the same two
    columns.
    """
    diameter_cols = ['D_e [mm]', 'D_h [mm]']
    if row['D_e [mm]'] != row['D_e [mm]']:  # NaN is the only value unequal to itself
        a, g = row[['author', 'geometry']]
        # Fixed: the mask used to test 'length [mm]' for NaN instead of 'D_e [mm]',
        # so a non-empty selection did not guarantee any usable D_e value.
        tmp = train[(train['author'] == a) & (train['geometry'] == g) & (~train['D_e [mm]'].isna())]['D_e [mm]']
        if len(tmp) > 0:
            group_mean = tmp.mean()
            # NOTE(review): D_h is filled with the group's mean D_e, not its mean
            # D_h, exactly as the original did — confirm this is intended.
            return pd.Series([group_mean, group_mean], index=diameter_cols)
    return row[diameter_cols]

train[['D_e [mm]', 'D_h [mm]']] = train.apply(dummy, axis=1)

In [14]:
# Re-count the missing values in each column after the imputation cells.
train.isna().sum()

id                         0
author                     0
geometry                   0
pressure [MPa]             0
mass_flux [kg/m2-s]        0
x_e_out [-]            10415
D_e [mm]                   0
D_h [mm]                   0
length [mm]                0
chf_exp [MW/m2]            0
dtype: int64

In [15]:
# Rows without a target value become the prediction set (with the target column
# dropped); the rows that do have a target stay in `train`. Both get a fresh index.
target_missing = train['x_e_out [-]'].isna()
test = train[target_missing].drop(columns=['x_e_out [-]']).reset_index(drop=True)
train = train[~target_missing].reset_index(drop=True)

In [16]:
# Display the training frame (the rows that have a target value).
train

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,Thompson,tube,7.000000,3770.000000,0.1754,10.8,10.8,432.0,3.6
1,1,Thompson,tube,10.640747,6049.000000,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,tube,13.790000,2034.000000,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.790000,3679.000000,-0.0279,5.6,15.2,2134.0,3.0
4,5,Unknown,Unknown,17.240000,3648.000000,-0.0711,1.9,1.9,696.0,3.6
...,...,...,...,...,...,...,...,...,...,...
21224,31636,Unknown,Unknown,12.070000,3152.921596,-0.0195,1.9,1.9,152.0,5.4
21225,31638,Thompson,tube,10.640747,3648.000000,-0.0487,4.7,4.7,318.0,9.0
21226,31639,Thompson,tube,10.640747,1736.000000,0.0886,7.8,7.8,591.0,2.3
21227,31641,Thompson,tube,18.270000,658.000000,-0.1224,3.0,3.0,150.0,2.3


In [17]:
# Display the prediction frame (the rows whose target was missing).
test

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,4,Thompson,tube,13.790000,686.000000,11.1,11.1,457.000000,2.8
1,7,Peskov,tube,18.000000,750.000000,10.0,10.0,1650.000000,2.2
2,10,Thompson,tube,10.640747,3452.251435,1.9,1.9,152.000000,3.2
3,12,Thompson,tube,6.890000,7500.000000,12.8,12.8,1930.000000,4.8
4,23,Beus,annulus,15.510000,1355.000000,5.6,15.2,2134.000000,2.1
...,...,...,...,...,...,...,...,...,...
10410,31633,Thompson,tube,11.030000,3452.251435,11.5,11.5,610.916672,2.0
10411,31634,Richenderfer,plate,1.010000,2000.000000,15.0,120.0,10.000000,6.2
10412,31637,Weatherhead,tube,13.790000,688.000000,11.1,11.1,457.000000,2.3
10413,31640,Unknown,Unknown,13.790000,3152.921596,4.7,4.7,845.315934,3.9


In [18]:
ordinal_features = ['author', 'geometry']

for feature in ordinal_features:
    le = LabelEncoder().fit(train[feature])
    train[feature] = le.transform(train[feature])

    # Values that never appeared in the train data may still exist in the test data.
    # So the test column is not transformed right away: its unique values are
    # checked first, any unseen ones are appended to the encoder's classes, and
    # only then is the test column transformed.
    unseen_labels = [label for label in np.unique(test[feature]) if label not in le.classes_]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, unseen_labels)
    test[feature] = le.transform(test[feature])

In [19]:
# Display the training frame after 'author' and 'geometry' were label-encoded.
train

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,7,3,7.000000,3770.000000,0.1754,10.8,10.8,432.0,3.6
1,1,7,3,10.640747,6049.000000,-0.0416,10.3,10.3,762.0,6.2
2,2,7,3,13.790000,2034.000000,0.0335,7.7,7.7,457.0,2.5
3,3,0,1,13.790000,3679.000000,-0.0279,5.6,15.2,2134.0,3.0
4,5,8,0,17.240000,3648.000000,-0.0711,1.9,1.9,696.0,3.6
...,...,...,...,...,...,...,...,...,...,...
21224,31636,8,0,12.070000,3152.921596,-0.0195,1.9,1.9,152.0,5.4
21225,31638,7,3,10.640747,3648.000000,-0.0487,4.7,4.7,318.0,9.0
21226,31639,7,3,10.640747,1736.000000,0.0886,7.8,7.8,591.0,2.3
21227,31641,7,3,18.270000,658.000000,-0.1224,3.0,3.0,150.0,2.3


In [20]:
# Display the prediction frame after 'author' and 'geometry' were label-encoded.
test

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,4,7,3,13.790000,686.000000,11.1,11.1,457.000000,2.8
1,7,5,3,18.000000,750.000000,10.0,10.0,1650.000000,2.2
2,10,7,3,10.640747,3452.251435,1.9,1.9,152.000000,3.2
3,12,7,3,6.890000,7500.000000,12.8,12.8,1930.000000,4.8
4,23,0,1,15.510000,1355.000000,5.6,15.2,2134.000000,2.1
...,...,...,...,...,...,...,...,...,...
10410,31633,7,3,11.030000,3452.251435,11.5,11.5,610.916672,2.0
10411,31634,6,2,1.010000,2000.000000,15.0,120.0,10.000000,6.2
10412,31637,9,3,13.790000,688.000000,11.1,11.1,457.000000,2.3
10413,31640,8,0,13.790000,3152.921596,4.7,4.7,845.315934,3.9


In [21]:
# Summary statistics of every column in the training frame.
train.describe()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
count,21229.0,21229.0,21229.0,21229.0,21229.0,21229.0,21229.0,21229.0,21229.0,21229.0
mean,15836.304348,6.223562,2.560036,10.635865,3068.304983,-0.000453,9.496185,13.394819,832.270517,3.809129
std,9143.412737,2.369954,0.864276,4.016657,1661.547159,0.100911,9.524184,18.589655,648.43683,1.988009
min,0.0,0.0,0.0,0.1,0.0,-0.8667,1.0,1.0,10.0,0.8
25%,7896.0,7.0,3.0,6.89,1558.570439,-0.0466,5.0,5.6,432.0,2.4
50%,15861.0,7.0,3.0,10.640747,2862.0,0.0038,8.238353,10.0,610.916672,3.4
75%,23751.0,7.0,3.0,13.79,4028.0,0.0648,10.8,11.3,914.0,4.7
max,31643.0,10.0,3.0,20.68,7975.0,0.232,120.0,120.0,3048.0,19.3


In [22]:
# Train models with AutoML (mljar-supervised) on every column that `test` has.
# NOTE(review): `test.columns` includes 'id', so the row id is used as a
# feature — confirm this is intended.
feature_columns = test.columns
automl = AutoML(
    mode="Compete",
    ml_task='regression',
    eval_metric='rmse',
    total_time_limit=3600*1,
    n_jobs=-1,
    random_state=726,
)
automl.fit(train[feature_columns], train['x_e_out [-]'])

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 0.085318 trained in 0.21 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree rmse 0.083583 trained in 1.36 seconds
2_DecisionTree rmse 0.083583 trained in 1.27 seconds
3_DecisionTree rmse 0.086059 trained in 1

AutoML(eval_metric='rmse', ml_task='regression', mode='Compete',
       random_state=726)

In [23]:
# Predict the target for the rows in `test`, which has exactly the columns the model was fitted on.
predictions = automl.predict(test)

In [24]:
# Put the predictions into the sample submission's target column, save it, and display it.
# NOTE(review): this relies on sample_submission.csv listing its rows in the same
# order as `test` — confirm.
sub = pd.read_csv('./sample_submission.csv').assign(**{'x_e_out [-]': predictions})
sub.to_csv('mljar_sub.csv', index=False)
sub

Unnamed: 0,id,x_e_out [-]
0,4,0.000624
1,7,-0.087989
2,10,-0.050075
3,12,0.007324
4,23,0.045641
...,...,...
10410,31633,0.071996
10411,31634,-0.048425
10412,31637,0.031339
10413,31640,-0.068329
