In [210]:
# %load_ext autoreload
# %autoreload 2


In [1]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from libraries — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# from utils import evaluate_binary, log_mlflow

In [5]:
from typing import Optional
from pathlib import Path

def find_project_root(start: Optional[Path] = None, marker: str = ".git") -> Optional[Path]:
    """Locate the project root by walking up the directory tree.

    Starting at ``start`` (the current working directory when omitted), the
    directory itself and then each of its ancestors is checked for an entry
    named ``marker``. The first directory that contains it is returned.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.
        marker: Name of the file or directory that identifies the project
            root. Defaults to ``".git"``.

    Returns:
        The nearest directory (``start`` or one of its ancestors) containing
        ``marker``. When no such directory exists a warning is printed and the
        top-most ancestor (the filesystem root / drive anchor) is returned, so
        the result is never ``None`` in practice.
    """
    origin = Path("." if start is None else start).resolve()
    # Nearest-first: the starting directory, then every ancestor up to the root.
    candidates = [origin, *origin.parents]

    for candidate in candidates:
        if (candidate / marker).exists():
            return candidate

    # print() rather than warnings.warn(): this notebook silences the warnings
    # module, so a printed message is the only one that stays visible.
    print(f"WARNING: No {marker} dir found")
    return candidates[-1]
        

# Resolve the repository root once; the load and save cells build their paths from it.
PROJECT_ROOT = find_project_root()
PROJECT_ROOT

WindowsPath('C:/Users/Harsha/Documents/ISB_AMPBA/Term5/FP2/GroupAssignment/project')

In [6]:
# Tag this cell as 'parameters'
# (presumably so a runner such as papermill can override these values — confirm)
# BASE
# Dataset code: used as the input parquet file stem and as the output sub-folder / file prefix.
COUNTRY = 'US'

# Sub-directories appended to PROJECT_ROOT when building paths.
# NOTE(review): the values use Windows-style "\\" separators.
dataset_dir  = "datadir\\datasets"  # location of the raw <COUNTRY>.parquet input
model_dir  = "model\\linear"  # not referenced by any cell visible in this notebook section
transformed_data_dir = "datadir\\transformed"  # destination of the engineered feature/target files

### Import data

In [15]:
# Build the input path with pathlib instead of hard-coded "\\" separators so the
# notebook also runs on non-Windows hosts. Backslashes inside the configured
# sub-directory are normalised to "/" (accepted by pathlib on every platform).
dataset_path = Path(PROJECT_ROOT) / dataset_dir.replace("\\", "/") / f"{COUNTRY}.parquet"
df = pd.read_parquet(dataset_path)
df.head()

Unnamed: 0,Date,production
0,2002-01-01,5848
1,2002-02-01,5871
2,2002-03-01,5883
3,2002-04-01,5859
4,2002-05-01,5924


### Data prep

In [16]:
# Oldest-first order is required: the shift/rolling features below look at preceding rows.
df.sort_values('Date', inplace=True)

In [17]:
# Convert the 'Date' column to pandas datetime values.
df['Date'] =  pd.to_datetime(df['Date'])

### Create basic features

In [18]:
# 'prev' holds the production value of the preceding row (NaN on the first row).
# All moving-average features below are derived from this lagged series.
df['prev'] = df['production'].shift(1)
df.head()

Unnamed: 0,Date,production,prev
0,2002-01-01,5848,
1,2002-02-01,5871,5848.0
2,2002-03-01,5883,5871.0
3,2002-04-01,5859,5883.0
4,2002-05-01,5924,5859.0


In [19]:
# Simple moving averages of the lagged series over 2-, 5-, 10- and 20-row windows.
# They are built from 'prev', so each average only covers rows before the current one.
for window in (2, 5, 10, 20):
    df[f'sma{window}'] = df['prev'].rolling(window=window).mean()
df.head()

Unnamed: 0,Date,production,prev,sma2,sma5,sma10,sma20
0,2002-01-01,5848,,,,,
1,2002-02-01,5871,5848.0,,,,
2,2002-03-01,5883,5871.0,5859.5,,,
3,2002-04-01,5859,5883.0,5877.0,,,
4,2002-05-01,5924,5859.0,5871.0,,,


In [20]:
# Sanity check: the last rolling value must equal the plain mean of the last n lagged values.
for n in (5, 10, 20):
    expected = round(df['prev'].tail(n).mean(), 4)
    actual = round(df[f'sma{n}'].tail(1).item(), 4)
    assert expected == actual, f'Expected sma{n} to be same as mean of past {n} items'

In [21]:
# Exponential moving averages of the lagged series for spans 12 and 26.
# min_periods equals the span, so a value only appears once that many observations exist.
for span in (12, 26):
    df[f'ema{span}'] = df['prev'].ewm(span=span, min_periods=span, adjust=False).mean()

In [22]:
def difference(prev_price, moving_average):
    """Return the gap between ``prev_price`` and ``moving_average`` relative to ``prev_price``.

    Evaluates ``(prev_price - moving_average) / prev_price``. The result is
    positive when ``prev_price`` is above the average and negative when below.
    Works element-wise for any operands that support ``-`` and ``/``.
    """
    gap = prev_price - moving_average
    return gap / prev_price

In [23]:
# For every moving average, add a '<name>_diff' column holding the relative gap to 'prev'.
for col in ('sma2', 'sma5', 'sma10', 'sma20', 'ema12', 'ema26'):
    df[f'{col}_diff'] = difference(df['prev'], df[col])

In [24]:
# Drop every row that has a NaN in any column, i.e. the leading rows for which
# the lag / rolling / ewm features could not yet be computed.
df.dropna(inplace=True)

In [25]:
# (rows, columns) remaining after the NaN rows were dropped.
df.shape

(226, 15)

In [26]:
# df.drop(columns=['date', 'open'], inplace=True)

### Create label

In [27]:
# Binary target: 1 when this row's production is strictly above the previous row's, otherwise 0.
df['label'] = df['production'].gt(df['prev']).astype(int)
# Move the current index into an 'index' column and restart the index at 0.
df.reset_index(inplace=True)


In [29]:
# Discard the helper 'index' column created by reset_index().
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns=['index'], errors='ignore')
df

Unnamed: 0,Date,production,prev,sma2,sma5,sma10,sma20,ema12,ema26,sma2_diff,sma5_diff,sma10_diff,sma20_diff,ema12_diff,ema26_diff,label
0,2004-03-01,5607,5556.0,5563.0,5580.0,5613.8,5647.80,5623.582048,5682.746116,-0.001260,-0.004320,-0.010403,-0.016523,-0.012164,-0.022812,1
1,2004-04-01,5526,5607.0,5581.5,5574.4,5601.2,5639.65,5621.030964,5677.135292,0.004548,0.005814,0.001034,-0.005823,-0.002502,-0.012509,0
2,2004-05-01,5548,5526.0,5566.5,5567.6,5583.7,5625.40,5606.410815,5665.940086,-0.007329,-0.007528,-0.010442,-0.017988,-0.014551,-0.025324,1
3,2004-06-01,5398,5548.0,5537.0,5561.4,5585.9,5632.25,5597.424536,5657.203783,0.001983,-0.002415,-0.006831,-0.015186,-0.008909,-0.019683,0
4,2004-07-01,5458,5398.0,5473.0,5527.0,5566.2,5634.00,5566.743838,5638.003503,-0.013894,-0.023898,-0.031160,-0.043720,-0.031260,-0.044462,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,2022-08-01,12002,11844.0,11820.5,11727.8,11631.7,11370.95,11575.431144,11455.845343,0.001984,0.009811,0.017925,0.039940,0.022676,0.032772,1
222,2022-09-01,12312,12002.0,11923.0,11788.0,11675.0,11412.65,11641.057122,11496.301243,0.006582,0.017830,0.027245,0.049104,0.030074,0.042135,1
223,2022-10-01,12381,12312.0,12157.0,11916.8,11727.2,11472.05,11744.279103,11556.723374,0.012589,0.032099,0.047498,0.068222,0.046111,0.061345,1
224,2022-11-01,12375,12381.0,12346.5,12067.2,11801.9,11594.85,11842.236164,11617.780901,0.002787,0.025345,0.046773,0.063496,0.043515,0.061644,0


### Prepare dataset to store in parquet format

In [30]:
# Inspect column dtypes before the frame is split and written to parquet.
df.dtypes

Date          datetime64[ns]
production             int64
prev                 float64
sma2                 float64
sma5                 float64
sma10                float64
sma20                float64
ema12                float64
ema26                float64
sma2_diff            float64
sma5_diff            float64
sma10_diff           float64
sma20_diff           float64
ema12_diff           float64
ema26_diff           float64
label                  int32
dtype: object

In [32]:

# Row identifier shared by the predictor and target tables: 0..n-1 in current row order.
datalen = len(df)
idslist = list(range(datalen))

# record_id is attached positionally (assigning a plain list does not align on
# the index), so the ids stay correct and no NaN rows appear even if `df` does
# not carry a default 0..n-1 index at this point.
#
# NOTE(review): the predictor table keeps the same-row production value
# ('oil_production'), from which 'label' is derived (production > prev).
# Confirm downstream feature selection excludes it to avoid target leakage.
predictors_df = (
    df.drop(columns="label")
      .assign(record_id=idslist)
      .rename(columns={'Date': 'event_timestamp', 'production': 'oil_production'})
)

# Target table: the label plus the timestamp and record_id needed to join it back.
target_df = (
    df[["label", "Date"]]
      .assign(record_id=idslist)
      .rename(columns={'Date': 'event_timestamp'})
)

In [33]:
# Only the last expression of a cell is rendered, so the head() result is not
# shown here; the displayed output is the dtypes listing.
target_df.head()
target_df.dtypes

label                       int32
event_timestamp    datetime64[ns]
record_id                   int64
dtype: object

In [34]:
# Save FE
# The output location is built with pathlib instead of hard-coded "\\"
# separators so the notebook also runs on non-Windows hosts. Backslashes in the
# configured sub-directory are normalised to "/" (accepted on every platform).
output_dir = Path(PROJECT_ROOT) / transformed_data_dir.replace("\\", "/") / COUNTRY
output_dir.mkdir(parents=True, exist_ok=True)

# Predictors and targets go to separate parquet files; the DataFrame index is not stored.
predictors_df.to_parquet(output_dir / f"{COUNTRY}_features.parquet", index=False)
target_df.to_parquet(output_dir / f"{COUNTRY}_target.parquet", index=False)
