In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy import stats as st

In [24]:
def file_path(x):
    y = pd.read_csv(x)
    return y

region_0 = file_path(r"C:/Users/kevin/datasets/geo_data_0.csv")
region_1 = file_path(r"C:/Users/kevin/datasets/geo_data_1.csv")
region_2 = file_path(r"C:/Users/kevin/datasets/geo_data_2.csv")

In [25]:
print(region_0.info())
print(region_1.info())
print(region_2.info())
print(region_0)
print(region_1)
print(region_2)
print(region_0.shape)
print(region_1.shape)
print(region_2.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column  

We can imediately assume from generating the tables information, and table view there is no missing values in either dataframe.

The data in the dataframes are also already scaled out. We do not need to do any standard scaling to make sure the data is similar to one another (mean of 0 and variance of 1)

In [26]:
print(region_0.duplicated().sum())
print(region_1.duplicated().sum())
print(region_2.duplicated().sum())

0
0
0


There is no duplicated values in our dataets

# Train and Test Model for Each Region

In [27]:
def train_linear_regression(data):
    features = data.drop(['id', 'product'], axis=1)
    target = data['product']
    features_train, features_valid, target_train, target_valid = train_test_split(features, target, test_size=0.25, random_state=12345)
    model = LinearRegression()
    model.fit(features_train, target_train)
    predicted_valid = model.predict(features_valid)
    mse = mean_squared_error(target_valid, predicted_valid)
    rmse = np.sqrt(mse)
    r2 = r2_score(target_valid, predicted_valid)
    predictions = pd.Series(predicted_valid, index=target_valid.index)
    answers = target_valid.reset_index(drop=True).mean()
    return rmse, r2, answers, predicted_valid, target_valid

rmse_0, r2_0, answers_0, predicted_valid_0, target_valid_0 = train_linear_regression(region_0)
rmse_1, r2_1, answers_1, predicted_valid_1, target_valid_1 = train_linear_regression(region_1)
rmse_2, r2_2, answers_2, predicted_valid_2, target_valid_2 = train_linear_regression(region_2)


In [28]:
print('Region_0')
print("RMSE:", rmse_0)
print("Average volume of predicted reserves:", answers_0)
print("R2:", r2_0)

Region_0
RMSE: 37.5794217150813
Average volume of predicted reserves: 92.07859674082941
R2: 0.27994321524487786


In [29]:
print('Region_1')
print("RMSE:", rmse_1)
print("Average volume of predicted reserves:", answers_1)
print("R2:", r2_1)

Region_1
RMSE: 0.8930992867756166
Average volume of predicted reserves: 68.72313602437494
R2: 0.9996233978805127


In [30]:
print('Region_2')
print("RMSE:", rmse_2)
print("Average volume of predicted reserves:", answers_2)
print("R2:", r2_2)

Region_2
RMSE: 40.02970873393434
Average volume of predicted reserves: 94.88423280885489
R2: 0.20524758386040443


# Profit Calculations

We must adhere to the following conditions:

When exploring the region, a study of 500 points is carried with picking the best 200 points for the profit calculation.

The budget for development of 200 oil wells is 100 USD million.

One barrel of raw materials brings 4.5 USD of revenue The revenue from one unit of product is 4,500 dollars (volume of reserves is in thousand barrels).

After the risk evaluation, keep only the regions with the risk of losses lower than 2.5%. From the ones that fit the criteria, the region with the highest average profit should be selected.

In [31]:
budget = 100_000_000
all_wells = 500
wells = 200
risk = 2.5 # Represented as a %

budget_per_well = budget / wells
print("Budget per well:", budget_per_well, "USD")

revenue_per_unit = 4_500
print("Revenue per unit of product:", revenue_per_unit, "USD")

reserves_for_profit = budget_per_well / revenue_per_unit
print("Volume of reserves sufficient for developing a new well without losses:", reserves_for_profit, "thousand barrels")

Budget per well: 500000.0 USD
Revenue per unit of product: 4500 USD
Volume of reserves sufficient for developing a new well without losses: 111.11111111111111 thousand barrels


In [32]:
def profit(target, predictions):
    target = pd.Series(target).reset_index(drop=True)
    predictions = pd.Series(predictions)
    pred_sorted = predictions.sort_values(ascending=False)
    selected = target[pred_sorted.index][:wells]
    return (selected.sum()* (revenue_per_unit)) - budget

print('Profit for Region_0:','$',profit(target_valid_0, predicted_valid_0))
print('Profit for Region_1:','$',profit(target_valid_1, predicted_valid_1))
print('Profit for Region_2:','$',profit(target_valid_2, predicted_valid_2))

Profit for Region_0: $ 33208260.43139851
Profit for Region_1: $ 24150866.966815114
Profit for Region_2: $ 27103499.635998324


In [33]:
def bootstrap(target, predictions):
    target = pd.Series(target).reset_index(drop=True)
    state = np.random.RandomState(12345)
    values = []
    for i in range(1000):
        target_subsample = target.sample(n=all_wells, replace=True,random_state=state)
        predictions_subsample = predictions[target_subsample.index]
        values.append(profit(target_subsample, predictions_subsample))
        
    values = pd.Series(values)
        
    print('Wells that make a profit:',values.gt(0).sum())
    print('Proportion of wells that make a profit:', values.gt(0).sum()/len(values))
    

    return values
        


In [34]:
print('Region_0')
profit_bs_0 = bootstrap(target_valid_0,predicted_valid_0)
print('Average Profit of Region_0:', profit_bs_0.mean())

Region_0
Wells that make a profit: 931
Proportion of wells that make a profit: 0.931
Average Profit of Region_0: 3961649.84802371


In [35]:
print('Region_1')
profit_bs_1 = bootstrap(target_valid_1,predicted_valid_1)
print('Average Profit of Region_1:', profit_bs_1.mean())

Region_1
Wells that make a profit: 985
Proportion of wells that make a profit: 0.985
Average Profit of Region_1: 4560451.057866613


In [36]:
print('Region_2')
profit_bs_2 = bootstrap(target_valid_2,predicted_valid_2)
print('Average Profit of Region_2:', profit_bs_2.mean())

Region_2
Wells that make a profit: 924
Proportion of wells that make a profit: 0.924
Average Profit of Region_2: 4044038.665683569


In [37]:
def conf_interval(regions):
    confidence_interval = st.t.interval(.95,len(regions)-1,regions.mean(),regions.sem())
    risk_evaluation = (regions < .025).sum() / len(regions)
    print('95% Confidence Interval is:', confidence_interval)
    print('Risk of Losses:', risk_evaluation*100,'%')
    return 


In [38]:
print('Region_0')
conf_interval(profit_bs_0)

Region_0
95% Confidence Interval is: (3796203.151479724, 4127096.544567696)
Risk of Losses: 6.9 %


In [39]:
print('Region_1')
conf_interval(profit_bs_1)

Region_1
95% Confidence Interval is: (4431472.486639011, 4689429.629094216)
Risk of Losses: 1.5 %


In [40]:
print('Region_2')
conf_interval(profit_bs_2)

Region_2
95% Confidence Interval is: (3874457.9747128054, 4213619.356654333)
Risk of Losses: 7.6 %


# Conclusion

All regions in the model showcase high levels of profit. Although, based on our conditions, validation testing, and calculations, it loooks like Region_1 has the highest profit and has a risk of loss less than 2.5%. 

#### Region_1 dataset is recommended 