In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
%matplotlib inline

In [77]:
names = ['API','Surf_X','Surf_Y','Date_Drilling','Date_Completion','Date_Production','Lateral_Length','Depth_TVD_PPLS','Erosion_PPLS','Pressure_PPLS','TOC_PPLS','Vcarb_PPLS','Vsand_PPLS','Vclay_PPLS','PR_PPLS','YM_PPLS','RHOB_PPLS','Res_PPLS','GR_PPLS','DT_PPLS','DTs_PPLS','Temperature','Temp_Anomaly','S3Tect_PPLS','S3_contrast_PPLS','Heat_Flow','Zone','Nbr_Stages','Frac_Gradient','Proppant_Designed','Proppant_in_Formation','Avg_Breakdown_Pressure','Avg_Treating_Pressure','Max_Treating_pressure','Min_Treating_Pressure','Avg_Rate_Slurry','Max_Rate_Slurry','Min_Rate_Slurry','ShutInPressure_Fil','ShutInPressure_Initial','ISIP','Shot_Density','Shot_Total','Proppant_per_ft','Stage_Spacing','GasCum360','OilCum360']

df_data = pd.read_csv('./TrainSample.csv', 
                        header = None, 
                        sep = ';',
                        decimal = ',',
                        names = names,
                        skiprows = 1,
                        na_filter = True,
                        parse_dates = ['Date_Drilling','Date_Completion','Date_Production'],
                        dtype = {col: np.float32 for col in names}
                       )

df_test = pd.read_csv('./TestSample.csv',
                      header = None, 
                      sep = ';', 
                      decimal = ',', 
                      names = names,
                      skiprows = 1,
                      na_filter = True,
                      parse_dates = ['Date_Drilling','Date_Completion','Date_Production'],
                      dtype = {col: np.float32 for col in names}
                     )

#df_data.set_index('API')

df_test.head(5)

Unnamed: 0,API,Surf_X,Surf_Y,Date_Drilling,Date_Completion,Date_Production,Lateral_Length,Depth_TVD_PPLS,Erosion_PPLS,Pressure_PPLS,...,Min_Rate_Slurry,ShutInPressure_Fil,ShutInPressure_Initial,ISIP,Shot_Density,Shot_Total,Proppant_per_ft,Stage_Spacing,GasCum360,OilCum360
0,633.0,0.201275,-0.577329,2013-06-23,2013-10-26,2014-10-01,-0.694826,0.641631,-0.356227,1.410748,...,0.667402,,-0.740473,0.131374,0.187082,2.409201,0.16014,-0.710739,,
1,587.0,0.222609,0.670307,2012-02-06,2012-03-07,2013-05-08,-0.479516,-1.377997,-1.812483,1.605136,...,0.285738,-0.9196,-1.103957,-1.282464,0.667339,-0.56075,-1.149666,2.085649,,
2,264.0,0.022105,0.404387,2013-05-19,2013-07-24,2013-06-10,0.310782,-0.775745,0.243462,-0.136014,...,0.459046,0.614126,1.029028,1.103339,-0.773433,0.924226,-0.585869,-0.137427,,
3,680.0,0.384359,0.372793,2012-08-18,2012-12-19,2013-12-06,-0.624508,-1.082974,-1.922374,1.44363,...,0.3321,-0.782232,0.450368,-0.136076,0.667339,-0.56075,0.595622,1.88726,,
4,443.0,-0.138513,0.408803,2013-11-09,2014-03-29,2014-11-05,0.338785,-0.616106,0.556826,-1.011425,...,0.32349,,0.743256,0.835687,-1.25369,0.181738,0.151633,-0.216512,,


In [78]:
##Définition des labels

y = {'GasCum360' : df_data['GasCum360'],
     'OilCum360' : df_data['OilCum360']}

df_targetgas = pd.DataFrame(y)

In [79]:
cols_date = ['Date_Drilling','Date_Completion','Date_Production', 'GasCum360', 'OilCum360']

df_data.drop(cols_date, 1, inplace=True)
df_test.drop(cols_date, 1, inplace=True)

In [80]:
cols_with_nan = ['Pressure_PPLS','Nbr_Stages','Frac_Gradient','Proppant_Designed','Proppant_in_Formation','Avg_Breakdown_Pressure','Avg_Treating_Pressure','Max_Treating_pressure','Min_Treating_Pressure','Avg_Rate_Slurry','Max_Rate_Slurry','Min_Rate_Slurry','ShutInPressure_Fil','ShutInPressure_Initial','ISIP','Shot_Density','Shot_Total','Proppant_per_ft','Stage_Spacing']

df_data.drop(cols_with_nan, axis=1, inplace=True)
df_test.drop(cols_with_nan, axis=1, inplace=True)

In [81]:
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(df_data, df_target, test_size=0.2, random_state=0)

In [82]:
#x_testFixed = x_test.fillna(x_test.median())
#x_trainFixed = x_train.fillna(x_train.median())

In [83]:
## Fit regression model

regr_1 = MultiOutputRegressor(SVR(kernel="rbf"))
regr_1.fit(X_train, y_train)

## Predict

y_predictSVR = regr_1.predict(X_test)

In [84]:
## Metrics training

from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_test, y_predictSVR),
      regr_1.score(X_train, y_train))



0.765906711684 0.658412214434


In [85]:
## Fit regression model

regr_2 = MultiOutputRegressor(SVR(kernel="rbf"))
regr_2.fit(df_data, df_target)

## Predict

y_predictFin = regr_2.predict(df_test)

regr_2.score(df_data, df_target)

0.66974809186773188

In [86]:
gascumpred = []
oilcumpred = []

gascumpred = [element[0] for element in y_predictFin]
oilcumpred = [element[1] for element in y_predictFin]

In [87]:
gascumpred_max = max(gascumpred)
gascumpred_min = min(gascumpred)
gascumpred_mean = np.mean(gascumpred)

oilcumpred_max = max(oilcumpred)
oilcumpred_min = min(oilcumpred)
oilcumpred_mean = np.mean(oilcumpred)

In [97]:
print(gascumpred_mean, oilcumpred_mean)

-0.166587304404 -0.149951432711


In [89]:
(gascumpred_max - gascumpred_min) / 4

0.36646252385990608

In [90]:
GasCum360_inf = gascumpred - abs(gascumpred_mean)
GasCum360_sup = gascumpred + abs(gascumpred_mean)

OilCum360_inf = oilcumpred - abs(oilcumpred_mean)
OilCum360_sup = oilcumpred + abs(oilcumpred_mean)


In [91]:
print(GasCum360_inf[:10], GasCum360_sup[:10])

[-0.87308519 -0.64517337 -0.2154728  -0.41082498 -0.02020559 -0.40168907
 -0.57212441 -0.42795879 -0.29498124 -0.13096261] [-0.53991058 -0.31199876  0.11770181 -0.07765037  0.31296902 -0.06851446
 -0.2389498  -0.09478418  0.03819337  0.202212  ]


In [92]:
GasCum360_inf.tolist
GasCum360_sup.tolist

OilCum360_inf.tolist
OilCum360_sup.tolist

<function ndarray.tolist>

In [93]:
## Output

id_test = df_test['API'].values.tolist()

output = pd.DataFrame({'API': id_test,
                       'GasCum360_INF': GasCum360_inf,
                       'GasCum360_SUP': GasCum360_sup,
                       'OilCum360_INF': OilCum360_inf,
                       'OilCum360_SUP': OilCum360_sup},
                      index=id_test
                     )

output.head()

output.to_csv('coche-julien-challenge-total.csv', index=False, sep= ';', decimal=',')

In [94]:
"""
Rappport 1 :

Importation correcte des données
Début du travail exploratoire, affichage du head et description des données
Beaucoup de colonnes sont incomplètes => identification et élimination de ces colonnes
Observation des composantes corrélées.

Rapport 2 :

Elimination des colonnes incomplètes
Exploration des données de sorties
Choix de l'algorithme - DecisienTreeRegressor / AdaBoost

Rapport 3 :

Fin elimination des colonnes incompletes
Fin du formatage des données
Debut mise en place de la cross_validation
Debut mise en place des metrics
"""

"\nRappport 1 :\n\nImportation correcte des données\nDébut du travail exploratoire, affichage du head et description des données\nBeaucoup de colonnes sont incomplètes => identification et élimination de ces colonnes\nObservation des composantes corrélées.\n\nRapport 2 :\n\nElimination des colonnes incomplètes\nExploration des données de sorties\nChoix de l'algorithme - DecisienTreeRegressor / AdaBoost\n\nRapport 3 :\n\nFin elimination des colonnes incompletes\nFin du formatage des données\nDebut mise en place de la cross_validation\nDebut mise en place des metrics\n"

In [95]:
output.describe()

Unnamed: 0,API,GasCum360_INF,GasCum360_SUP,OilCum360_INF,OilCum360_SUP
count,235.0,235.0,235.0,235.0,235.0
mean,392.697872,-0.333175,-3.7794830000000004e-18,-0.299903,-1.889741e-18
std,231.284979,0.245196,0.2451958,0.235955,0.2359553
min,3.0,-0.914609,-0.5814339,-0.79623,-0.4963267
25%,183.5,-0.492394,-0.1592194,-0.443999,-0.1440958
50%,394.0,-0.376774,-0.04359909,-0.31835,-0.01844703
75%,590.5,-0.203591,0.1295833,-0.189777,0.1101256
max,786.0,0.551242,0.8844162,0.619771,0.9196739
