In [1]:
import pandas as pd
from numpy import NaN
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

In [2]:
df = pd.read_csv('C:/Users/Kevin/Desktop/hybrid-data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   serial         275 non-null    int64  
 1   sample id      236 non-null    object 
 2   material type  274 non-null    object 
 3   SA             274 non-null    float64
 4   TPV            260 non-null    object 
 5   MPV            0 non-null      float64
 6   C              245 non-null    float64
 7   H              207 non-null    float64
 8   N              259 non-null    float64
 9   O              114 non-null    float64
 10  T              275 non-null    int64  
 11  P              275 non-null    float64
 12  CO2            275 non-null    float64
 13  ref            140 non-null    float64
dtypes: float64(9), int64(2), object(3)
memory usage: 30.2+ KB


In [4]:
df.head()

Unnamed: 0,serial,sample id,material type,SA,TPV,MPV,C,H,N,O,T,P,CO2,ref
0,1,HNM-1,Microporous,976.0,1.07,,40.18,4.46,42.16,5.25,0,1.0,4.295,1.0
1,2,HNM-1,Microporous,976.0,1.07,,40.18,4.46,42.16,5.25,25,1.0,2.795,1.0
2,3,HNM-2,Microporous,807.0,,,,,,,0,1.0,3.0,1.0
3,4,HNM-3,Microporous,513.0,,,,,,,0,1.0,2.159,1.0
4,5,CHNM-1M,Hierarchical,870.0,0.83,,64.38,4.17,9.79,,0,1.0,3.363,2.0


In [5]:
df_1 = df.drop(columns=['serial','sample id','ref','MPV'])
df_1 = df_1[df_1.N != None]
df_1.head()

Unnamed: 0,material type,SA,TPV,C,H,N,O,T,P,CO2
0,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295
1,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795
2,Microporous,807.0,,,,,,0,1.0,3.0
3,Microporous,513.0,,,,,,0,1.0,2.159
4,Hierarchical,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363


In [6]:
df_1.loc[2,'TPV'] = NaN

In [7]:
df_1.head()

Unnamed: 0,material type,SA,TPV,C,H,N,O,T,P,CO2
0,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295
1,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795
2,Microporous,807.0,,,,,,0,1.0,3.0
3,Microporous,513.0,,,,,,0,1.0,2.159
4,Hierarchical,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363


In [8]:
df_1['material type'].value_counts()

Microporous     174
Hierarchical     49
Mesoporous       45
Macroporous       6
Name: material type, dtype: int64

In [9]:
df_1['material type']=df_1['material type'].astype('category')

In [10]:
df_1['codes'] = df_1['material type'].cat.codes
df_1.head()

Unnamed: 0,material type,SA,TPV,C,H,N,O,T,P,CO2,codes
0,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295,3
1,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795,3
2,Microporous,807.0,,,,,,0,1.0,3.0,3
3,Microporous,513.0,,,,,,0,1.0,2.159,3
4,Hierarchical,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363,0


In [11]:
enc = OneHotEncoder()

In [12]:
enc_data = pd.DataFrame(enc.fit_transform(df_1[['codes']]).toarray())

In [13]:
new_df = df_1.join(enc_data)
new_df.head()

Unnamed: 0,material type,SA,TPV,C,H,N,O,T,P,CO2,codes,0,1,2,3,4
0,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295,3,0.0,0.0,0.0,0.0,1.0
1,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795,3,0.0,0.0,0.0,0.0,1.0
2,Microporous,807.0,,,,,,0,1.0,3.0,3,0.0,0.0,0.0,0.0,1.0
3,Microporous,513.0,,,,,,0,1.0,2.159,3,0.0,0.0,0.0,0.0,1.0
4,Hierarchical,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363,0,0.0,1.0,0.0,0.0,0.0


In [14]:
df_1.head()

Unnamed: 0,material type,SA,TPV,C,H,N,O,T,P,CO2,codes
0,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295,3
1,Microporous,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795,3
2,Microporous,807.0,,,,,,0,1.0,3.0,3
3,Microporous,513.0,,,,,,0,1.0,2.159,3
4,Hierarchical,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363,0


In [15]:
df_1 = df_1.drop(columns=['codes'])

In [16]:
one_hot_enc_data = pd.get_dummies(df_1,columns=['material type'])
one_hot_enc_data

Unnamed: 0,SA,TPV,C,H,N,O,T,P,CO2,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous
0,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295,0,0,0,1
1,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795,0,0,0,1
2,807.0,,,,,,0,1.0,3.000,0,0,0,1
3,513.0,,,,,,0,1.0,2.159,0,0,0,1
4,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,614.0,,65.10,,10.50,,25,1.0,4.040,0,0,0,1
271,1004.0,,68.80,,12.90,,25,0.1,1.030,0,0,0,1
272,1004.0,,68.80,,12.90,,25,1.0,4.030,0,0,0,1
273,726.0,,67.80,,17.40,,25,0.1,0.840,0,0,1,0


In [17]:
df_1.columns

Index(['material type', 'SA', 'TPV', 'C', 'H', 'N', 'O', 'T', 'P', 'CO2'], dtype='object')

In [18]:
one_hot_enc_data.columns

Index(['SA', 'TPV', 'C', 'H', 'N', 'O', 'T', 'P', 'CO2',
       'material type_Hierarchical', 'material type_Macroporous',
       'material type_Mesoporous', 'material type_Microporous'],
      dtype='object')

In [19]:
one_hot_enc_data = one_hot_enc_data[['material type_Hierarchical', 'material type_Macroporous',
       'material type_Mesoporous', 'material type_Microporous','SA', 'TPV', 'C', 'H', 'N', 'O', 'T', 'P', 'CO2']]

In [20]:
one_hot_enc_data

Unnamed: 0,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous,SA,TPV,C,H,N,O,T,P,CO2
0,0,0,0,1,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295
1,0,0,0,1,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795
2,0,0,0,1,807.0,,,,,,0,1.0,3.000
3,0,0,0,1,513.0,,,,,,0,1.0,2.159
4,1,0,0,0,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363
...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,0,0,0,1,614.0,,65.10,,10.50,,25,1.0,4.040
271,0,0,0,1,1004.0,,68.80,,12.90,,25,0.1,1.030
272,0,0,0,1,1004.0,,68.80,,12.90,,25,1.0,4.030
273,0,0,1,0,726.0,,67.80,,17.40,,25,0.1,0.840


In [21]:
one_hot_enc_data.dropna(subset=['N'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_hot_enc_data.dropna(subset=['N'],inplace=True)


In [22]:
one_hot_enc_data.reset_index(drop=True,inplace=True)

In [23]:
one_hot_enc_data.head()

Unnamed: 0,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous,SA,TPV,C,H,N,O,T,P,CO2
0,0,0,0,1,976.0,1.07,40.18,4.46,42.16,5.25,0,1.0,4.295
1,0,0,0,1,976.0,1.07,40.18,4.46,42.16,5.25,25,1.0,2.795
2,1,0,0,0,870.0,0.83,64.38,4.17,9.79,,0,1.0,3.363
3,1,0,0,0,870.0,0.83,64.38,4.17,9.79,,25,1.0,2.25
4,1,0,0,0,639.0,0.52,62.49,4.0,9.05,,0,1.0,2.977


In [24]:
#now the part left is preprocessing of the final dataset, so the steps are 
#splitting of dataset
#scale and transform
#handle missing values(knn imputer)
#modelling and testing


In [25]:
train,test = train_test_split(one_hot_enc_data,train_size=0.9, random_state=0)

In [26]:
scaler = MinMaxScaler()

In [27]:
train = pd.DataFrame(scaler.fit_transform(train),columns=train.columns)
train.head()

Unnamed: 0,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous,SA,TPV,C,H,N,O,T,P,CO2
0,0.0,0.0,0.0,1.0,0.650554,0.297107,0.731636,0.178909,0.142251,,0.0,1.0,0.633907
1,0.0,0.0,0.0,1.0,0.361945,0.261923,0.874041,0.141091,0.302813,,1.0,1.0,0.18059
2,0.0,0.0,0.0,1.0,0.187675,0.097733,0.530272,0.069818,0.113323,0.847027,1.0,0.055556,0.027027
3,0.0,0.0,0.0,1.0,1.0,0.586396,0.889755,0.035636,0.137208,0.178166,0.0,1.0,0.502457
4,0.0,0.0,0.0,1.0,0.417537,0.201564,0.754294,0.043636,0.151805,0.363141,1.0,1.0,0.308354


In [28]:
test = pd.DataFrame(scaler.transform(test),columns=test.columns)
test.head()

Unnamed: 0,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous,SA,TPV,C,H,N,O,T,P,CO2
0,0.0,0.0,1.0,0.0,0.200686,,0.71044,0.949091,0.005839,,1.0,1.0,0.255528
1,0.0,0.0,0.0,1.0,0.216457,0.101642,0.606408,0.094545,0.190021,,0.0,1.0,0.341032
2,1.0,0.0,0.0,0.0,0.324883,0.641126,0.672676,0.058182,0.53742,0.226282,1.0,1.0,0.201474
3,0.0,0.0,0.0,1.0,0.293341,0.172009,0.686929,0.085091,0.235403,,1.0,1.0,0.335381
4,0.0,0.0,0.0,1.0,0.607184,0.265833,0.745523,0.149091,0.107749,,1.0,1.0,0.415233


In [29]:
imputer = KNNImputer(n_neighbors=10)

In [30]:
train = pd.DataFrame(imputer.fit_transform(train),columns=train.columns)
train.head()

Unnamed: 0,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous,SA,TPV,C,H,N,O,T,P,CO2
0,0.0,0.0,0.0,1.0,0.650554,0.297107,0.731636,0.178909,0.142251,0.309873,0.0,1.0,0.633907
1,0.0,0.0,0.0,1.0,0.361945,0.261923,0.874041,0.141091,0.302813,0.181956,1.0,1.0,0.18059
2,0.0,0.0,0.0,1.0,0.187675,0.097733,0.530272,0.069818,0.113323,0.847027,1.0,0.055556,0.027027
3,0.0,0.0,0.0,1.0,1.0,0.586396,0.889755,0.035636,0.137208,0.178166,0.0,1.0,0.502457
4,0.0,0.0,0.0,1.0,0.417537,0.201564,0.754294,0.043636,0.151805,0.363141,1.0,1.0,0.308354


In [31]:
test = pd.DataFrame(imputer.transform(test),columns=test.columns)
test.head()

Unnamed: 0,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous,SA,TPV,C,H,N,O,T,P,CO2
0,0.0,0.0,1.0,0.0,0.200686,0.253831,0.71044,0.949091,0.005839,0.387653,1.0,1.0,0.255528
1,0.0,0.0,0.0,1.0,0.216457,0.101642,0.606408,0.094545,0.190021,0.517703,0.0,1.0,0.341032
2,1.0,0.0,0.0,0.0,0.324883,0.641126,0.672676,0.058182,0.53742,0.226282,1.0,1.0,0.201474
3,0.0,0.0,0.0,1.0,0.293341,0.172009,0.686929,0.085091,0.235403,0.327167,1.0,1.0,0.335381
4,0.0,0.0,0.0,1.0,0.607184,0.265833,0.745523,0.149091,0.107749,0.258034,1.0,1.0,0.415233


In [32]:
X_train,y_train = train.iloc[:,:-1],train.CO2
y_train

0      0.633907
1      0.180590
2      0.027027
3      0.502457
4      0.308354
         ...   
228    0.196560
229    0.562654
230    0.481572
231    0.375921
232    0.045455
Name: CO2, Length: 233, dtype: float64

In [33]:
X_test,y_test = test.iloc[:,:-1],test.CO2
X_test.head()

Unnamed: 0,material type_Hierarchical,material type_Macroporous,material type_Mesoporous,material type_Microporous,SA,TPV,C,H,N,O,T,P
0,0.0,0.0,1.0,0.0,0.200686,0.253831,0.71044,0.949091,0.005839,0.387653,1.0,1.0
1,0.0,0.0,0.0,1.0,0.216457,0.101642,0.606408,0.094545,0.190021,0.517703,0.0,1.0
2,1.0,0.0,0.0,0.0,0.324883,0.641126,0.672676,0.058182,0.53742,0.226282,1.0,1.0
3,0.0,0.0,0.0,1.0,0.293341,0.172009,0.686929,0.085091,0.235403,0.327167,1.0,1.0
4,0.0,0.0,0.0,1.0,0.607184,0.265833,0.745523,0.149091,0.107749,0.258034,1.0,1.0


In [34]:
X_train,X_test,y_train,y_test = train_test_split(one_hot_enc_data.iloc[:,:-1],one_hot_enc_data.CO2,train_size=0.9,random_state=0)

In [35]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233 entries, 96 to 172
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   material type_Hierarchical  233 non-null    uint8  
 1   material type_Macroporous   233 non-null    uint8  
 2   material type_Mesoporous    233 non-null    uint8  
 3   material type_Microporous   233 non-null    uint8  
 4   SA                          233 non-null    float64
 5   TPV                         225 non-null    object 
 6   C                           221 non-null    float64
 7   H                           185 non-null    float64
 8   N                           233 non-null    float64
 9   O                           108 non-null    float64
 10  T                           233 non-null    int64  
 11  P                           233 non-null    float64
dtypes: float64(6), int64(1), object(1), uint8(4)
memory usage: 17.3+ KB


In [36]:
scalerX = MinMaxScaler()
scalery = MinMaxScaler()

In [37]:
X_train = scalerX.fit_transform(X_train)
y_train = scalery.fit_transform(y_train)
X_test = scalerX.transform(X_test)
y_test = scalery.transform(y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[5.61  1.92  0.67  4.54  2.96  2.99  0.46  4.8   6.11  2.34  6.24  4.25
 1.59  1.522 4.159 2.96  1.788 1.878 4.4   5.45  2.89  0.54  3.29  1.5
 1.25  2.4   2.8   2.86  1.41  1.62  4.19  0.67  3.73  1.06  2.295 3.18
 1.068 1.67  0.97  4.19  2.07  2.88  3.05  3.78  3.33  3.93  3.55  1.86
 3.46  2.636 1.95  4.17  3.77  2.54  2.14  2.977 1.73  6.06  6.23  2.68
 3.46  1.67  2.7   2.06  2.09  1.17  0.93  2.25  4.95  3.727 1.6   1.81
 8.59  6.3   0.65  0.61  1.023 7.81  3.93  2.99  1.39  0.81  3.56  3.363
 1.36  2.86  4.02  5.01  1.84  1.82  2.66  3.68  7.44  2.114 5.74  2.29
 3.35  5.182 2.62  4.21  2.16  4.04  5.84  2.69  3.65  3.82  1.01  5.87
 5.9   1.23  2.73  3.16  1.52  2.47  3.64  2.02  1.21  3.33  5.32  3.64
 3.79  4.295 5.03  3.57  4.96  1.5   2.1   7.6   4.03  4.23  2.386 2.16
 3.92  1.82  2.74  3.99  3.86  0.84  2.204 7.219 1.29  5.5   1.75  5.61
 1.86  5.23  1.48  1.94  1.86  1.05  0.977 4.4   2.06  3.99  4.62  1.83
 4.05  2.795 3.61  2.84  5.63  4.75  4.28  5.33  2.25  1.318 4.32  4.24
 3.34  0.88  2.    2.3   5.88  0.86  3.96  2.87  0.45  3.74  1.09  5.17
 2.23  3.11  5.51  2.02  1.38  0.85  0.69  6.14  4.5   3.43  3.35  2.77
 2.9   2.55  5.36  1.03  1.93  4.26  1.48  1.68  3.69  4.8   2.43  0.78
 2.75  2.023 1.78  2.12  2.11  0.7   2.2   3.7   3.2   3.77  3.714 6.7
 4.57  2.91  1.37  1.38  8.43  4.5   3.204 5.98  1.136 2.72  2.55  2.48
 2.05  5.03  4.37  3.51  0.82 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [38]:
one_hot_enc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   material type_Hierarchical  259 non-null    uint8  
 1   material type_Macroporous   259 non-null    uint8  
 2   material type_Mesoporous    259 non-null    uint8  
 3   material type_Microporous   259 non-null    uint8  
 4   SA                          259 non-null    float64
 5   TPV                         250 non-null    object 
 6   C                           245 non-null    float64
 7   H                           207 non-null    float64
 8   N                           259 non-null    float64
 9   O                           114 non-null    float64
 10  T                           259 non-null    int64  
 11  P                           259 non-null    float64
 12  CO2                         259 non-null    float64
dtypes: float64(7), int64(1), object(1),

In [39]:
from sklearn.preprocessing import StandardScaler

In [40]:
scalerx = StandardScaler()
scalery = StandardScaler()


In [41]:
X_train,X_test,y_train,y_test = train_test_split(one_hot_enc_data.iloc[:,:-1],one_hot_enc_data.CO2,train_size=0.9,random_state=0)

In [42]:
scalerx.fit_transform(X_train)
scalerx.transform(X_test)

array([[-4.95973089e-01,  0.00000000e+00,  2.46182982e+00,
        -1.39619737e+00, -8.32430809e-01,             nan,
         1.82210277e-01,  8.50872547e+00, -1.35480257e+00,
                    nan,  8.82397843e-01,  4.48157268e-01],
       [-4.95973089e-01,  0.00000000e+00, -4.06201920e-01,
         7.16231117e-01, -7.59459669e-01, -9.07338486e-01,
        -3.87252280e-01, -1.39970617e-01, -2.37880126e-01,
                    nan, -1.13327566e+00,  4.48157268e-01],
       [ 2.01623843e+00,  0.00000000e+00, -4.06201920e-01,
        -1.39619737e+00, -2.57783080e-01,  2.22878460e+00,
        -2.45032979e-02, -5.08000237e-01,  1.86882230e+00,
        -5.19304648e-01,  8.82397843e-01,  4.48157268e-01],
       [-4.95973089e-01,  0.00000000e+00, -4.06201920e-01,
         7.16231117e-01, -4.03725360e-01, -4.98278952e-01,
         5.35144060e-02, -2.35658318e-01,  3.73269898e-02,
                    nan,  8.82397843e-01,  4.48157268e-01],
       [-4.95973089e-01,  0.00000000e+00, -4.0620192

In [43]:
from sklearn.impute import KNNImputer

In [44]:
imputer = KNNImputer(n_neighbors=10,weights='uniform')

In [45]:
X_train = pd.DataFrame(imputer.fit_transform(X_train),columns=X_train.columns)

In [46]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   material type_Hierarchical  233 non-null    float64
 1   material type_Macroporous   233 non-null    float64
 2   material type_Mesoporous    233 non-null    float64
 3   material type_Microporous   233 non-null    float64
 4   SA                          233 non-null    float64
 5   TPV                         233 non-null    float64
 6   C                           233 non-null    float64
 7   H                           233 non-null    float64
 8   N                           233 non-null    float64
 9   O                           233 non-null    float64
 10  T                           233 non-null    float64
 11  P                           233 non-null    float64
dtypes: float64(12)
memory usage: 22.0 KB


In [47]:
X_test = pd.DataFrame(imputer.transform(X_test),columns=X_test.columns)

In [48]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   material type_Hierarchical  26 non-null     float64
 1   material type_Macroporous   26 non-null     float64
 2   material type_Mesoporous    26 non-null     float64
 3   material type_Microporous   26 non-null     float64
 4   SA                          26 non-null     float64
 5   TPV                         26 non-null     float64
 6   C                           26 non-null     float64
 7   H                           26 non-null     float64
 8   N                           26 non-null     float64
 9   O                           26 non-null     float64
 10  T                           26 non-null     float64
 11  P                           26 non-null     float64
dtypes: float64(12)
memory usage: 2.6 KB


In [49]:
#adaboost

In [50]:
from sklearn.ensemble import AdaBoostRegressor

In [51]:
ada_boost = AdaBoostRegressor()

In [52]:
ada_boost.fit(X_train,y_train)

In [53]:
y_pred_ada = ada_boost.predict(X_test)

In [54]:
from sklearn.metrics import r2_score

In [55]:
train_accuracy_ada = r2_score(y_train,ada_boost.predict(X_train))
train_accuracy_ada

0.7954074683178284

In [56]:
test_accuracy_ada = r2_score(y_test,y_pred_ada)
test_accuracy_ada

0.5060181767556766

In [57]:
#xgboost

In [58]:
!pip install xgboost



In [59]:
from xgboost import XGBRegressor

In [60]:
xgb = XGBRegressor()

In [61]:
xgb.fit(X_train,y_train)

In [62]:
r2_score(y_train,xgb.predict(X_train))

0.9999967764955064

In [63]:
r2_score(y_test,xgb.predict(X_test))

0.7883733987531482

In [64]:
#catboost implementation

In [65]:
!pip install catboost



In [66]:
from catboost import CatBoostRegressor

In [67]:
cb = CatBoostRegressor(loss_function='RMSE')

In [68]:
cb.fit(X_train,y_train)

Learning rate set to 0.032525
0:	learn: 1.6470315	total: 144ms	remaining: 2m 24s
1:	learn: 1.6217236	total: 145ms	remaining: 1m 12s
2:	learn: 1.5969242	total: 146ms	remaining: 48.5s
3:	learn: 1.5732989	total: 147ms	remaining: 36.5s
4:	learn: 1.5512183	total: 147ms	remaining: 29.3s
5:	learn: 1.5287645	total: 148ms	remaining: 24.5s
6:	learn: 1.5048256	total: 149ms	remaining: 21.1s
7:	learn: 1.4870277	total: 150ms	remaining: 18.6s
8:	learn: 1.4675246	total: 151ms	remaining: 16.6s
9:	learn: 1.4439931	total: 151ms	remaining: 15s
10:	learn: 1.4230721	total: 152ms	remaining: 13.7s
11:	learn: 1.4075392	total: 153ms	remaining: 12.6s
12:	learn: 1.3886248	total: 154ms	remaining: 11.7s
13:	learn: 1.3712771	total: 155ms	remaining: 10.9s
14:	learn: 1.3541266	total: 155ms	remaining: 10.2s
15:	learn: 1.3382696	total: 156ms	remaining: 9.6s
16:	learn: 1.3227051	total: 157ms	remaining: 9.06s
17:	learn: 1.3075648	total: 157ms	remaining: 8.58s
18:	learn: 1.2911885	total: 158ms	remaining: 8.16s
19:	learn: 1

307:	learn: 0.3688651	total: 358ms	remaining: 805ms
308:	learn: 0.3679294	total: 359ms	remaining: 803ms
309:	learn: 0.3669795	total: 361ms	remaining: 803ms
310:	learn: 0.3656143	total: 361ms	remaining: 801ms
311:	learn: 0.3649610	total: 362ms	remaining: 798ms
312:	learn: 0.3646665	total: 363ms	remaining: 796ms
313:	learn: 0.3636343	total: 363ms	remaining: 794ms
314:	learn: 0.3626365	total: 364ms	remaining: 792ms
315:	learn: 0.3616274	total: 365ms	remaining: 790ms
316:	learn: 0.3609652	total: 365ms	remaining: 787ms
317:	learn: 0.3598460	total: 366ms	remaining: 785ms
318:	learn: 0.3596061	total: 367ms	remaining: 783ms
319:	learn: 0.3583145	total: 368ms	remaining: 781ms
320:	learn: 0.3572217	total: 368ms	remaining: 779ms
321:	learn: 0.3557652	total: 369ms	remaining: 777ms
322:	learn: 0.3555770	total: 370ms	remaining: 775ms
323:	learn: 0.3552223	total: 370ms	remaining: 773ms
324:	learn: 0.3550231	total: 371ms	remaining: 771ms
325:	learn: 0.3539533	total: 372ms	remaining: 769ms
326:	learn: 

532:	learn: 0.2213336	total: 532ms	remaining: 466ms
533:	learn: 0.2207579	total: 533ms	remaining: 465ms
534:	learn: 0.2204417	total: 534ms	remaining: 464ms
535:	learn: 0.2200800	total: 535ms	remaining: 463ms
536:	learn: 0.2196849	total: 535ms	remaining: 462ms
537:	learn: 0.2192210	total: 536ms	remaining: 460ms
538:	learn: 0.2190916	total: 537ms	remaining: 459ms
539:	learn: 0.2182846	total: 538ms	remaining: 458ms
540:	learn: 0.2178944	total: 538ms	remaining: 457ms
541:	learn: 0.2173168	total: 539ms	remaining: 455ms
542:	learn: 0.2167387	total: 540ms	remaining: 454ms
543:	learn: 0.2163626	total: 541ms	remaining: 453ms
544:	learn: 0.2159962	total: 542ms	remaining: 452ms
545:	learn: 0.2156759	total: 542ms	remaining: 451ms
546:	learn: 0.2149630	total: 543ms	remaining: 450ms
547:	learn: 0.2144494	total: 544ms	remaining: 449ms
548:	learn: 0.2142752	total: 545ms	remaining: 448ms
549:	learn: 0.2136716	total: 546ms	remaining: 447ms
550:	learn: 0.2132596	total: 547ms	remaining: 446ms
551:	learn: 

767:	learn: 0.1456449	total: 706ms	remaining: 213ms
768:	learn: 0.1454410	total: 707ms	remaining: 212ms
769:	learn: 0.1452388	total: 708ms	remaining: 211ms
770:	learn: 0.1448755	total: 709ms	remaining: 210ms
771:	learn: 0.1443291	total: 709ms	remaining: 210ms
772:	learn: 0.1441942	total: 710ms	remaining: 209ms
773:	learn: 0.1439713	total: 711ms	remaining: 208ms
774:	learn: 0.1437888	total: 712ms	remaining: 207ms
775:	learn: 0.1434570	total: 712ms	remaining: 206ms
776:	learn: 0.1428924	total: 713ms	remaining: 205ms
777:	learn: 0.1424795	total: 714ms	remaining: 204ms
778:	learn: 0.1421163	total: 715ms	remaining: 203ms
779:	learn: 0.1419260	total: 715ms	remaining: 202ms
780:	learn: 0.1417716	total: 716ms	remaining: 201ms
781:	learn: 0.1415897	total: 717ms	remaining: 200ms
782:	learn: 0.1415006	total: 718ms	remaining: 199ms
783:	learn: 0.1412159	total: 718ms	remaining: 198ms
784:	learn: 0.1408033	total: 719ms	remaining: 197ms
785:	learn: 0.1407067	total: 720ms	remaining: 196ms
786:	learn: 

992:	learn: 0.0997454	total: 879ms	remaining: 6.19ms
993:	learn: 0.0997042	total: 880ms	remaining: 5.31ms
994:	learn: 0.0995127	total: 881ms	remaining: 4.42ms
995:	learn: 0.0993822	total: 881ms	remaining: 3.54ms
996:	learn: 0.0992509	total: 882ms	remaining: 2.65ms
997:	learn: 0.0991776	total: 883ms	remaining: 1.77ms
998:	learn: 0.0990921	total: 884ms	remaining: 884us
999:	learn: 0.0987271	total: 884ms	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1fe3a7adad0>

In [69]:
r2_score(y_train,cb.predict(X_train))

0.9965150948379474

In [70]:
r2_score(y_test,cb.predict(X_test))

0.8871738967924186

In [71]:
#random forest

In [72]:
from sklearn.ensemble import RandomForestRegressor

In [73]:
rf = RandomForestRegressor()

In [74]:
rf.fit(X_train,y_train)

In [75]:
r2_score(y_train,rf.predict(X_train))

0.9602887431203702

In [76]:
r2_score(y_test,rf.predict(X_test))

0.7353819890588039

In [77]:
#lightgbm model

In [78]:
!pip install lightgbm



In [79]:
from lightgbm import LGBMRegressor

In [80]:
lgb = LGBMRegressor()

In [81]:
lgb.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 365
[LightGBM] [Info] Number of data points in the train set: 233, number of used features: 11
[LightGBM] [Info] Start training from score 3.099541


In [82]:
r2_score(y_train,lgb.predict(X_train))

0.9421310395533242

In [83]:
r2_score(y_test,lgb.predict(X_test))

0.7942480967405875

In [84]:
#GBDT gradient boosting decision trees

In [85]:
from sklearn.ensemble import GradientBoostingRegressor

In [86]:
gbdt = GradientBoostingRegressor()

In [87]:
gbdt.fit(X_train,y_train)

In [88]:
r2_score(y_train,gbdt.predict(X_train))

0.967769787789666

In [89]:
r2_score(y_test,gbdt.predict(X_test))

0.8753896158583337