In [157]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import AdaBoostRegressor

In [158]:
## Load the training and test samples

# Column names applied to both files (their own header row is skipped).
names = [
    'API', 'Surf_X', 'Surf_Y',
    'Date_Drilling', 'Date_Completion', 'Date_Production',
    'Lateral_Length', 'Depth_TVD_PPLS', 'Erosion_PPLS', 'Pressure_PPLS',
    'TOC_PPLS', 'Vcarb_PPLS', 'Vsand_PPLS', 'Vclay_PPLS', 'PR_PPLS',
    'YM_PPLS', 'RHOB_PPLS', 'Res_PPLS', 'GR_PPLS', 'DT_PPLS', 'DTs_PPLS',
    'Temperature', 'Temp_Anomaly', 'S3Tect_PPLS', 'S3_contrast_PPLS',
    'Heat_Flow', 'Zone', 'Nbr_Stages', 'Frac_Gradient',
    'Proppant_Designed', 'Proppant_in_Formation',
    'Avg_Breakdown_Pressure', 'Avg_Treating_Pressure',
    'Max_Treating_pressure', 'Min_Treating_Pressure',
    'Avg_Rate_Slurry', 'Max_Rate_Slurry', 'Min_Rate_Slurry',
    'ShutInPressure_Fil', 'ShutInPressure_Initial', 'ISIP',
    'Shot_Density', 'Shot_Total', 'Proppant_per_ft', 'Stage_Spacing',
    'GasCum360', 'OilCum360',
]


def _read_sample(csv_path):
    """Read one sample file into a DataFrame indexed by its first column (API).

    The file is parsed as semicolon-separated with ',' as decimal mark; its
    first row is skipped and the columns are labelled with `names` instead.
    The three date columns are passed to `parse_dates` and a float32 dtype is
    requested for every name in `names`.
    """
    return pd.read_csv(
        csv_path,
        header=None,
        sep=';',
        decimal=',',
        names=names,
        skiprows=1,
        index_col=0,
        parse_dates=['Date_Drilling', 'Date_Completion', 'Date_Production'],
        dtype={col: np.float32 for col in names},
    )


df_data = _read_sample('./TrainSample.csv')
df_test = _read_sample('./TestSample.csv')

In [159]:
## Outliers

# (index label, column) pairs whose value is replaced by NaN, so the cell is
# handled as a missing value from here on.
outlier_cells = [
    (746, 'Frac_Gradient'),
    (553, 'ISIP'),
    (681, 'Min_Rate_Slurry'),
    (424, 'Res_PPLS'),
    (456, 'ShutInPressure_Fil'),
    (723, 'Min_Treating_Pressure'),
    (205, 'Min_Treating_Pressure'),
]

for row_label, col_label in outlier_cells:
    df_data.loc[row_label, col_label] = np.nan

In [160]:
## Define the labels (regression targets)

# One single-entry mapping per target, keyed by the target's column name.
y_gas = dict(GasCum360=df_data['GasCum360'])
y_oil = dict(OilCum360=df_data['OilCum360'])

# Each target becomes a one-column DataFrame sharing df_data's index.
df_target_gas = pd.DataFrame(data=y_gas)
df_target_oil = pd.DataFrame(data=y_oil)

In [161]:
## Drop the columns that are not used as features

# The three date columns plus the two target columns.
cols_date = ['Date_Drilling','Date_Completion','Date_Production', 'GasCum360', 'OilCum360']

# `axis` is passed by keyword: the positional form drop(labels, 1) was
# deprecated in pandas 1.x and raises a TypeError from pandas 2.0 onwards.
df_data.drop(cols_date, axis=1, inplace=True)
df_test.drop(cols_date, axis=1, inplace=True)

In [162]:
## Impute missing values

# Column means are computed once, from the training sample only, and used to
# fill the gaps of both the training and the test sample.
# (The former bare `df_data.dropna()` / `df_test.dropna()` statements were
# removed: their return values were discarded, so they changed nothing.)
train_means = df_data.mean()

df_data = df_data.fillna(train_means)
df_test = df_test.fillna(train_means)

In [163]:
## Train / hold-out split

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split lives in sklearn.model_selection, the module this
# notebook already uses for GridSearchCV.
from sklearn.model_selection import train_test_split

# Both calls use the same features, test_size and random_state, so the gas and
# oil targets are split on the same rows (15% held out).
X_train_gas, X_test_gas, y_train_gas, y_test_gas = train_test_split(df_data, df_target_gas, test_size=0.15, random_state=42)
X_train_oil, X_test_oil, y_train_oil, y_test_oil = train_test_split(df_data, df_target_oil, test_size=0.15, random_state=42)

In [164]:
from sklearn.ensemble import GradientBoostingRegressor

# Base estimator handed to the grid searches. random_state is fixed (same seed
# as the train/hold-out split) so that re-running the notebook rebuilds the
# same trees and reproduces the reported metrics.
regr = GradientBoostingRegressor(random_state=42)

In [165]:
## Hyper-parameter definitions

# Integers 20..39 inclusive.
K = list(range(20, 40))

# Grid with a single candidate: every hyper-parameter lists exactly one value.
param_grid = [
    dict(
        loss=['huber'],
        n_estimators=[500],
        learning_rate=[0.025],
        max_depth=[4],
        criterion=['friedman_mse'],
        min_samples_split=[2],
        min_samples_leaf=[2],
        alpha=[0.909],
    )
]

In [166]:
## Model selection for the oil target

from sklearn.model_selection import GridSearchCV

# NOTE: despite its name, n_iter is only ever passed as n_jobs (number of
# parallel workers). The name is kept because the gas grid search reuses it.
n_iter = 50

# 3-fold cross-validated search over param_grid.
clf_oil = GridSearchCV(regr, param_grid=param_grid, n_jobs=n_iter, cv=3)

# y_train_oil is a one-column DataFrame, so .values is an (n, 1) column vector;
# ravel() passes the 1-D target scikit-learn expects and avoids its
# DataConversionWarning.
clf_oil.fit(X_train_oil.values, y_train_oil.values.ravel())
y_predict_oil = clf_oil.predict(X_test_oil.values)

In [167]:
## Hold-out metrics for the oil model

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# One report line per value; the '{}' placeholders are filled in this order.
report_lines = [
    'Erreur Moyenne absolue : {}',
    'Erreur Moyenne carré : {}',
    'R2 : {}',
    'Best params : {}',
]
report_values = (
    mean_absolute_error(y_test_oil, y_predict_oil),
    mean_squared_error(y_test_oil, y_predict_oil),
    r2_score(y_test_oil, y_predict_oil),
    clf_oil.best_params_,
)

retour = '\n'.join(report_lines).format(*report_values)
print(retour)

Erreur Moyenne absolue : 0.26029141854
Erreur Moyenne carré : 0.202361334938
R2 : 0.839807728498
Best params : {'loss': 'huber', 'learning_rate': 0.025, 'min_samples_leaf': 2, 'n_estimators': 500, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'alpha': 0.909, 'max_depth': 4}


In [168]:
## Model selection for the gas target

# Same 3-fold search as for oil; n_iter is the worker count passed as n_jobs.
clf_gas = GridSearchCV(regr, param_grid=param_grid, n_jobs=n_iter, cv=3)

# y_train_gas is a one-column DataFrame, so .values is an (n, 1) column vector;
# ravel() passes the 1-D target scikit-learn expects and avoids its
# DataConversionWarning.
clf_gas.fit(X_train_gas.values, y_train_gas.values.ravel())
y_predict_gas = clf_gas.predict(X_test_gas.values)

In [169]:
## Hold-out metrics for the gas model

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# One report line per value; the '{}' placeholders are filled in this order.
report_lines = [
    'Erreur Moyenne absolue : {}',
    'Erreur Moyenne carré : {}',
    'R2 : {}',
]
report_values = (
    mean_absolute_error(y_test_gas, y_predict_gas),
    mean_squared_error(y_test_gas, y_predict_gas),
    r2_score(y_test_gas, y_predict_gas),
)

retour = '\n'.join(report_lines).format(*report_values)

print(retour)

Erreur Moyenne absolue : 0.358683756197
Erreur Moyenne carré : 0.271159031151
R2 : 0.730065812347


In [170]:
## Refit on the whole training sample, then predict the test sample

# The targets are one-column DataFrames; .values.ravel() passes fit() the 1-D
# target scikit-learn expects instead of an (n, 1) column vector.
clf_gas.fit(df_data, df_target_gas.values.ravel())
clf_oil.fit(df_data, df_target_oil.values.ravel())

y_predictFin_gas = clf_gas.predict(df_test)
y_predictFin_oil = clf_oil.predict(df_test)

In [171]:
## Lower / upper bounds around the test-sample predictions

# Test-sample predictions as plain lists (the dead `= []` initialisations that
# were immediately overwritten have been dropped).
gascumpred = list(y_predictFin_gas)
oilcumpred = list(y_predictFin_oil)

# Despite the *_mean names, these hold the mean absolute error measured on the
# hold-out split.
gascumpred_mean = mean_absolute_error(y_test_gas, y_predict_gas)
oilcumpred_mean = mean_absolute_error(y_test_oil, y_predict_oil)

# Half-width of each interval: 2.3 times the hold-out mean absolute error.
gas_margin = 2.3 * abs(gascumpred_mean)
oil_margin = 2.3 * abs(oilcumpred_mean)

# Convert to arrays explicitly: subtracting a scalar from a plain list only
# worked through NumPy's reflected operators on the scalar.
GasCum360_inf = np.asarray(gascumpred) - gas_margin
GasCum360_sup = np.asarray(gascumpred) + gas_margin

OilCum360_inf = np.asarray(oilcumpred) - oil_margin
OilCum360_sup = np.asarray(oilcumpred) + oil_margin

# The trailing bare `.tolist` attribute accesses were removed: the method was
# never called, so they converted nothing. The bounds stay NumPy arrays, as
# they effectively were before.

<function tolist>

In [172]:
# Display the first 10 gas predictions made on the hold-out split.
y_predict_gas[:10]

array([-0.8644711 ,  0.06851813,  0.83448679, -0.03398048, -0.1326212 ,
        0.82534931, -0.42144984,  0.55126857, -0.52032883,  0.23283801])

In [173]:
# Display the first 10 true gas values of the hold-out split, for comparison
# with the predictions shown above.
y_test_gas.head(10)

Unnamed: 0_level_0,GasCum360
API,Unnamed: 1_level_1
517.0,-1.089723
694.0,-0.093905
557.0,2.583899
510.0,-0.020998
369.0,-0.796393
529.0,0.689649
709.0,-0.419608
184.0,0.456733
120.0,-0.398933
744.0,-0.26862


In [174]:
# Display the first 15 oil predictions made on the hold-out split.
y_predict_oil[:15]

array([-0.23822072, -0.45101096, -0.63022033,  0.77081612,  0.72972447,
       -0.65202991, -0.3609976 , -0.7474481 ,  0.55341461,  0.50377493,
       -0.73685542, -0.69112591,  0.00536128,  3.1746376 , -0.18284157])

In [175]:
# Display the first 15 true oil values of the hold-out split, for comparison
# with the predictions shown above.
y_test_oil.head(15)

Unnamed: 0_level_0,OilCum360
API,Unnamed: 1_level_1
517.0,-0.84618
694.0,-0.573829
557.0,-0.451239
510.0,1.103248
369.0,1.567699
529.0,-0.661664
709.0,-0.319803
184.0,-0.738756
120.0,0.584454
744.0,0.589509


In [176]:
## Output

# Index labels of the test sample (the API column, read as the index).
id_test = df_test.index.tolist()

# Column order is pinned explicitly instead of relying on dict ordering.
output_columns = ['API',
                  'GasCum360_INF', 'GasCum360_SUP',
                  'OilCum360_INF', 'OilCum360_SUP']

output = pd.DataFrame({'API': id_test,
                       'GasCum360_INF': GasCum360_inf,
                       'GasCum360_SUP': GasCum360_sup,
                       'OilCum360_INF': OilCum360_inf,
                       'OilCum360_SUP': OilCum360_sup},
                      index=id_test,
                      columns=output_columns
                     )

# (A mid-cell `output.head()` was removed: its result was neither displayed
# nor stored.)
# NOTE(review): API values are floats here because the loader requests float32
# for every column - confirm the expected file format accepts them as such.
# index=False: the API is already written as a regular column.
output.to_csv('coche-julien-challenge-total.csv', index=False, sep= ';', decimal=',')

In [177]:
# Print the first 5 rows of the table that was written to the CSV file.
print(output.head(5))

         API  GasCum360_INF  GasCum360_SUP  OilCum360_INF  OilCum360_SUP
633.0  633.0      -1.511467       0.640635      -0.457045       1.104703
587.0  587.0      -0.782862       1.369240      -1.607697      -0.045948
264.0  264.0      -0.350802       1.801300      -1.418743       0.143005
680.0  680.0      -0.472444       1.679658      -1.505283       0.056466
443.0  443.0      -0.662858       1.489245      -0.697455       0.864293
