In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from datetime import timedelta
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore', message='X does not have valid feature names')

In [2]:
data = pd.read_csv('volve_field_data.csv', parse_dates=['date'])

  data = pd.read_csv('volve_field_data.csv', parse_dates=['date'])


In [3]:
data.head()

Unnamed: 0,date,down_hole_presure,down_hole_temperature,production_pipe_pressure,choke_size_pct,well_head_presure,well_head_temperature,choke_size_pressure,oil
0,2008-07-13,260.867,103.293,160.152,21.053,100.715,52.382,69.539,1735.26
1,2008-07-14,256.248,104.018,156.65,27.223,99.598,71.115,68.016,592.36
2,2008-07-15,254.983,103.967,156.333,27.278,98.65,67.591,66.949,2532.21
3,2008-07-16,251.476,104.211,155.817,30.651,95.659,73.782,63.664,3030.71
4,2008-07-17,249.346,104.339,155.761,32.563,93.585,78.035,61.338,3288.08


In [4]:
data.isna().sum()

date                        0
down_hole_presure           0
down_hole_temperature       0
production_pipe_pressure    0
choke_size_pct              0
well_head_presure           0
well_head_temperature       0
choke_size_pressure         0
oil                         0
dtype: int64

In [5]:
data = data.sort_values(by='date')

In [6]:
def duplicate_feature_oil(dt, dup=5):
    for i in range(1, dup+1):
        dt[f'oil_duplicate{i}'] = dt['oil'].shift(i)
    return dt
data = duplicate_feature_oil(data)
data

Unnamed: 0,date,down_hole_presure,down_hole_temperature,production_pipe_pressure,choke_size_pct,well_head_presure,well_head_temperature,choke_size_pressure,oil,oil_duplicate1,oil_duplicate2,oil_duplicate3,oil_duplicate4,oil_duplicate5
0,2008-07-13,260.867,103.293,160.152,21.053,100.715,52.382,69.539,1735.26,,,,,
1,2008-07-14,256.248,104.018,156.650,27.223,99.598,71.115,68.016,592.36,1735.26,,,,
2,2008-07-15,254.983,103.967,156.333,27.278,98.650,67.591,66.949,2532.21,592.36,1735.26,,,
3,2008-07-16,251.476,104.211,155.817,30.651,95.659,73.782,63.664,3030.71,2532.21,592.36,1735.26,,
4,2008-07-17,249.346,104.339,155.761,32.563,93.585,78.035,61.338,3288.08,3030.71,2532.21,592.36,1735.26,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2834,2016-07-09,268.813,100.295,239.081,82.191,29.732,87.864,4.114,102.09,106.30,106.19,98.30,101.88,100.67
2835,2016-07-10,265.923,100.338,238.398,100.000,27.524,89.154,1.959,113.38,102.09,106.30,106.19,98.30,101.88
2836,2016-07-11,267.769,100.321,238.639,91.162,29.130,89.075,3.408,108.84,113.38,102.09,106.30,106.19,98.30
2837,2016-07-12,266.002,100.346,238.274,100.000,27.729,89.504,1.938,113.84,108.84,113.38,102.09,106.30,106.19


In [7]:
data = data.dropna()
data

Unnamed: 0,date,down_hole_presure,down_hole_temperature,production_pipe_pressure,choke_size_pct,well_head_presure,well_head_temperature,choke_size_pressure,oil,oil_duplicate1,oil_duplicate2,oil_duplicate3,oil_duplicate4,oil_duplicate5
5,2008-07-18,251.502,104.334,155.279,30.052,96.223,75.750,64.305,2780.06,3288.08,3030.71,2532.21,592.36,1735.26
6,2008-07-19,251.509,104.354,155.360,30.084,96.149,75.335,64.164,2774.38,2780.06,3288.08,3030.71,2532.21,592.36
7,2008-07-20,251.458,104.378,155.352,30.118,96.106,75.617,64.128,2791.01,2774.38,2780.06,3288.08,3030.71,2532.21
8,2008-07-21,251.724,104.399,155.234,29.831,96.490,76.078,64.560,2743.10,2791.01,2774.38,2780.06,3288.08,3030.71
9,2008-07-22,251.858,104.407,155.172,29.748,96.686,77.017,64.770,2717.44,2743.10,2791.01,2774.38,2780.06,3288.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2834,2016-07-09,268.813,100.295,239.081,82.191,29.732,87.864,4.114,102.09,106.30,106.19,98.30,101.88,100.67
2835,2016-07-10,265.923,100.338,238.398,100.000,27.524,89.154,1.959,113.38,102.09,106.30,106.19,98.30,101.88
2836,2016-07-11,267.769,100.321,238.639,91.162,29.130,89.075,3.408,108.84,113.38,102.09,106.30,106.19,98.30
2837,2016-07-12,266.002,100.346,238.274,100.000,27.729,89.504,1.938,113.84,108.84,113.38,102.09,106.30,106.19


In [8]:
scaler = MinMaxScaler()
col = ['down_hole_presure', 'down_hole_temperature', 'production_pipe_pressure', 'choke_size_pct', 'well_head_presure', 
           'well_head_temperature', 'choke_size_pressure', 'oil_duplicate1', 'oil_duplicate2', 'oil_duplicate3', 'oil_duplicate4', 
           'oil_duplicate5']
data[col] = scaler.fit_transform(data[col])
data['oil'] = scaler.fit_transform(data[['oil']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = scaler.fit_transform(data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['oil'] = scaler.fit_transform(data[['oil']])


In [9]:
x = data[col]
y = data['oil']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [11]:
model = RandomForestRegressor(n_estimators=100, random_state=30)
model.fit(x_train, y_train)

In [12]:
y_pred = model.predict(x_test)
y_pred

array([0.05858348, 0.79825093, 0.05888221, 0.09061442, 0.10542925,
       0.33475097, 0.16791788, 0.1081274 , 0.20570528, 0.33131587,
       0.15520793, 0.01728766, 0.15779414, 0.30063778, 0.13632911,
       0.20543298, 0.09554101, 0.41034134, 0.47123011, 0.05765304,
       0.59951722, 0.35547302, 0.52232564, 0.22970841, 0.50631235,
       0.3011271 , 0.07759941, 0.19539376, 0.07711436, 0.20923257,
       0.07451523, 0.42168372, 0.06637104, 0.72711213, 0.16251101,
       0.06268012, 0.48080339, 0.01728941, 0.01421341, 0.0251783 ,
       0.04642164, 0.05779173, 0.55244254, 0.23389311, 0.80997475,
       0.05518015, 0.30621808, 0.52029469, 0.05923575, 0.14490473,
       0.05050673, 0.28138057, 0.02951862, 0.42206063, 0.22023583,
       0.04571715, 0.50055721, 0.04361948, 0.03221765, 0.74023881,
       0.80746141, 0.5308212 , 0.20265757, 0.0587131 , 0.05676495,
       0.05723461, 0.04689788, 0.44411064, 0.28260179, 0.49004489,
       0.17031196, 0.31755554, 0.42290963, 0.45860752, 0.54873

In [13]:
print("Model score:",model.score(x_train, y_train))

Model score: 0.9916589285434373


In [14]:
print("Model score:",model.score(x_test, y_test))

Model score: 0.9548606303568953


In [15]:
mean_squared_error = mean_squared_error(y_test, y_pred)
mean_absolute_error = mean_absolute_error(y_test, y_pred)
r2_score = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mean_squared_error:.3f}")
print(f"Mean Absolute Error: {mean_absolute_error:.3f}")
print(f"R^2 Score: {r2_score:.3f}")

Mean Squared Error: 0.002
Mean Absolute Error: 0.018
R^2 Score: 0.955


In [16]:
pred_table = pd.DataFrame({'true y':y_test,'predicted y':y_pred})
pred_table

Unnamed: 0,true y,predicted y
2134,0.057896,0.058583
189,0.818555,0.798251
2543,0.016851,0.058882
2038,0.095758,0.090614
1858,0.099446,0.105429
...,...,...
1422,0.186494,0.186643
98,0.784519,0.813244
2444,0.034742,0.034731
2269,0.053377,0.055991


In [17]:
n=int(input('enter number of days to be predicted: '))
print("future prediction of ",n," days")
def predict_future_oil_production(data, model, start_date='14-07-2016', days=n):
    pred = []
    j=-1
    try:
        start_date = pd.to_datetime(start_date, format='%d-%m-%Y')
    except ValueError as e:
        print(f"Error: {e}")
        return predictions
    
    for i in range(days):
        next_date = start_date + pd.Timedelta(days=i)
        last_row = data.iloc[j]
        input_data = last_row[['down_hole_presure', 'down_hole_temperature', 'production_pipe_pressure', 
                               'choke_size_pct', 'well_head_presure', 'well_head_temperature', 
                               'choke_size_pressure', 'oil_duplicate1', 'oil_duplicate2', 'oil_duplicate3',
                               'oil_duplicate4', 'oil_duplicate5']].values.reshape(1, -1)
        prediction = model.predict(input_data)
        prediction = scaler.inverse_transform(prediction.reshape(1, -1))
        pred.append((next_date.strftime('%d-%m-%Y'), prediction)) 
        j=j-1    
    return pred

future_pred = predict_future_oil_production(data, model, start_date='14-07-2016', days=n)
for date, prediction in future_pred:
    print(f"Date: {date}, oil_Prediction: {prediction}")

future prediction of  14  days
Date: 14-07-2016, oil_Prediction: [[113.5448]]
Date: 15-07-2016, oil_Prediction: [[114.0864]]
Date: 16-07-2016, oil_Prediction: [[107.5457]]
Date: 17-07-2016, oil_Prediction: [[112.7037]]
Date: 18-07-2016, oil_Prediction: [[102.6694]]
Date: 19-07-2016, oil_Prediction: [[113.5687]]
Date: 20-07-2016, oil_Prediction: [[112.3328]]
Date: 21-07-2016, oil_Prediction: [[99.9865]]
Date: 22-07-2016, oil_Prediction: [[100.9578]]
Date: 23-07-2016, oil_Prediction: [[100.9357]]
Date: 24-07-2016, oil_Prediction: [[101.9182]]
Date: 25-07-2016, oil_Prediction: [[101.8913]]
Date: 26-07-2016, oil_Prediction: [[101.2398]]
Date: 27-07-2016, oil_Prediction: [[101.7356]]
