In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Method for calculating MAE in percents

In [3]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

### Reading and splitting data

In [4]:
df = pd.read_csv('DATA/forProcessing.csv')
df_a = df.dropna()
X = df_a.drop('receive_pressure', axis = 1)
y = df_a['receive_pressure']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

### Feature scaling

In [6]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Converting into LGBM dataset

In [7]:
d_train = lgb.Dataset(x_train, label=y_train)

### Setting up parameters

In [8]:
params = {}
params['learning_rate'] = 0.01
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = {'l2', 'l1'}
params['sub_feature'] = 0.8
params['num_leaves'] = 20
params['min_data'] = 3
params['max_depth'] = 15

params['feature_fraction'] = .99
params['bagging_fraction'] = .99
params['bagging_freq'] = 3

### Training

In [9]:
gbm = lgb.train(params, d_train, 10000)

### Getting predictions

In [10]:
y_pred=gbm.predict(x_test, num_iteration=gbm.best_iteration)

In [11]:
print('The RMSE is:', mean_squared_error(y_test, y_pred), " and the MAE is: ", mean_absolute_error(y_test, y_pred))
print("The the MAPE is: ", mean_absolute_percentage_error(y_test, y_pred))

The RMSE is: 31.27152978854876  and the MAE is:  2.660755195243423
The the MAPE is:  10.124580207549286


### Saving model and predictions

In [12]:
gbm.save_model('model.txt')
subm = pd.DataFrame(y_pred)
subm.to_csv('output.csv', index=False)

### Excluding outliers

In [13]:
df_b = df_a[ (df_a.receive_pressure < df_a.receive_pressure.quantile(.99)) 
            & (df_a.receive_pressure > df_a.receive_pressure.quantile(.01)) ]

In [14]:
low = .01
high = .99
df_extreme = df_a.quantile([low, high])
df_b = df_a.apply(lambda x: x[(x>df_extreme.loc[low,x.name]) & 
                                    (x < df_extreme.loc[high,x.name])], axis=0)
df_b.dropna(inplace=True)
df_b.describe()

Unnamed: 0,diameter,depth,buffer_pressure,line_pressure,dynamic_height,annulus_pressure,water_cut,receive_pressure
count,526.0,526.0,526.0,526.0,526.0,526.0,526.0,526.0
mean,153.305703,2496.230608,12.329087,10.824905,2392.705323,12.346198,49.645475,20.269202
std,3.910865,196.191778,6.630105,6.50509,236.816486,6.579059,20.171019,11.487239
min,147.0,2229.54,6.1,5.1,1608.0,4.0,12.0,8.0
25%,150.2,2349.11,7.0,6.0,2248.5,7.2,34.0,11.0
50%,150.2,2435.0,7.5,6.3,2347.0,8.0,48.5,17.0
75%,157.2,2662.25,19.725,17.95,2591.75,19.0,63.75,26.0
max,157.2,3012.21,27.0,26.0,3007.0,27.7,93.0,86.0


In [15]:
X1 = df_b.drop('receive_pressure', axis = 1)
y1 = df_b['receive_pressure']
x_train1, x_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 0.25, random_state = 0)

sc1 = StandardScaler()
x_train1 = sc1.fit_transform(x_train1)
x_test1 = sc1.transform(x_test1)

d_train1 = lgb.Dataset(x_train1, label=y_train1)

gbm1 = lgb.train(params, d_train1, 10000)

y_pred1=gbm1.predict(x_test1, num_iteration=gbm1.best_iteration)

print('The RMSE is:', mean_squared_error(y_test1, y_pred1), " and the MAE is: ", mean_absolute_error(y_test1, y_pred1))
print("The the MAPE is: ", mean_absolute_percentage_error(y_test1, y_pred1))

The RMSE is: 33.55834759930384  and the MAE is:  2.8830068126021366
The the MAPE is:  13.320255276317585


### Excluding low corelated features

In [16]:
df_c = df_a.drop(['water_cut'], axis = 1)
df_c.head()

Unnamed: 0,diameter,depth,buffer_pressure,line_pressure,dynamic_height,annulus_pressure,receive_pressure
0,146.8,2349.0,20.2,19.2,2332.0,20.5,22.0
1,146.8,2349.0,20.4,19.4,2342.0,20.0,25.0
2,146.8,2349.0,20.6,19.6,2344.0,21.1,24.0
3,146.8,2349.0,20.0,19.3,2320.0,20.8,24.0
4,146.8,2349.0,20.7,19.7,2339.0,21.2,24.0


In [17]:
X2 = df_c.drop('receive_pressure', axis = 1)
y2 = df_c['receive_pressure']
x_train2, x_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.25, random_state = 0)

sc2 = StandardScaler()
x_train2 = sc2.fit_transform(x_train2)
x_test2 = sc2.transform(x_test2)

d_train2 = lgb.Dataset(x_train2, label=y_train2)

gbm2 = lgb.train(params, d_train2, 10000)

y_pred2=gbm2.predict(x_test2, num_iteration=gbm2.best_iteration)

print('The RMSE is:', mean_squared_error(y_test2, y_pred2), " and the MAE is: ", mean_absolute_error(y_test2, y_pred2))
print("The the MAPE is: ", mean_absolute_percentage_error(y_test2, y_pred2))

The RMSE is: 33.43471582623008  and the MAE is:  2.7651360664742746
The the MAPE is:  10.554677547580175
