In [3]:
from airflow import DAG
from datetime import datetime,timedelta
from airflow.operators.python import PythonOperator

## Projects
import pandas as pd
import numpy as np
import joblib
import os
import time
import datetime
from datetime import timedelta,date,datetime
import configparser as cp
from sqlalchemy import create_engine
import mysql.connector
import joblib
from pytz import timezone
import pytz

# Stats
import math
import statsmodels.formula.api as smf
import statsmodels.api as sm

  import pandas.util.testing as tm


# Environment Set-up

In [None]:
## Airflow ##
# Default task arguments for the linear-analysis DAG.
# `datetime` is the class here (the last `from datetime import ... datetime`
# import wins over `import datetime`), so it is called directly;
# `datetime.datetime(...)` would raise AttributeError.
linear_analysis_arg={'owner':'airflow',
                     'depends_on_past':False,
                     'start_date':datetime(2022,3,1),
                     'retries':1,
                     'retry_delay':timedelta(minutes=10)
                    }

## Database ##
# update to MySQL hook in Airflow
# Credentials come from the [ivan_db] section of a local login file.
config=cp.ConfigParser()
config.read('/home/ubuntu/certi/db_login.txt')
db_config=config['ivan_db']

# Connection URL is user:password@host/schema; the schema is fixed to STOCK_PRED.
engine=create_engine(
    'mysql+mysqlconnector://{0:s}:{1:s}@{2:s}/{3:s}'.format(
        *[db_config[key] for key in ('userid','pwd','hostname')],
        'STOCK_PRED'
    )
)

# Ticker reference table, loaded once and reused by the functions below.
stock_mapping=pd.read_sql("""SELECT * 
                             FROM STOCK_PRED.NYSE_NASDAQ_TICKERS
                          """,con=engine)
print(stock_mapping.shape)
print(stock_mapping['Symbol'].nunique())


## Regression ## 
def linear_reg_analysis_for(df):
    """Fit an OLS trend line ``Close ~ DATE_ORDER`` and summarise the fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to fit; must contain the columns ``Close``, ``DATE_ORDER`` and
        ``Date``. Intended to be called once per stock via ``groupby.apply``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the R-squared, the slope on ``DATE_ORDER``
        (``Coef``), its p-value, the first/last ``Date`` and the total and
        distinct-date record counts.
    """
    lr_model=smf.ols('Close ~ DATE_ORDER',data=df).fit()

    # Fitted params/pvalues are indexed by term name ('Intercept', 'DATE_ORDER').
    # Select the slope by label: integer keys on a label-indexed Series are
    # only honoured through a deprecated positional fallback in pandas.
    slope_term='DATE_ORDER'

    model_result={'R_squared':[lr_model.rsquared],
                  'Coef':[lr_model.params[slope_term]],
                  'P_values':[lr_model.pvalues[slope_term]],

                  'Start_Date':[df['Date'].min()],
                  'End_Date':[df['Date'].max()],
                  'Num_records':[df.shape[0]],
                  'Num_records_dist':[df.Date.nunique()]
                 }

    return pd.DataFrame(model_result)

# Data Loading & Cleaning
* Save the final result as a pickle (`.pkl`) file

In [1]:
def stock_data_cleaning(file_path='',min_latest_rows=5000):
    """Load the last 60 days of stock history, clean it and dump it to disk.

    The cleaned frame is written with ``joblib.dump`` only when the most
    recent date in the data has more than ``min_latest_rows`` rows; otherwise
    a data-issue message is printed and nothing is saved.

    Parameters
    ----------
    file_path : str
        Destination passed to ``joblib.dump``.
    min_latest_rows : int, default 5000
        Row count the latest date must exceed for the load to be treated as
        complete.

    Returns
    -------
    None
    """
    ## 1 Data Loading ##
    # 1.1 Stock data - last 60 days
    df=pd.read_sql("""SELECT * 
                      FROM STOCK_PRED.ALL_STOCK_HIST
                      WHERE DATE>=CURDATE()-INTERVAL 60 DAY
                             """,
                   con=engine)
    print(df.shape)
    print(df['Date'].max(),df['Date'].min())

    latest_date=df['Date'].max()

    # With no (dated) rows max() is NaN/NaT, which has no usable strftime for
    # the row-count message below; report the problem and stop instead.
    if pd.isna(latest_date):
        print('Data issue: no dated rows returned for the last 60 days')
        return

    latest_rows=df.loc[df.Date==latest_date,:].shape[0]

    if latest_rows<=min_latest_rows:
        print('Data issue: latest data {:s} only as {:,.0f} rows'.format(latest_date.strftime('%Y-%m-%d'),
                                                                         latest_rows
                                                                        ))
        return

    ## 2 Data Cleaning ##
    # 2.1 Remove NAs
    df_1=df.dropna(axis=0,how='any')

    # 2.2 Remove accounts with Negative Stock price
    # (a stock is dropped entirely if any of its rows has Close < 0)
    negative_stocks=df_1.loc[df_1.Close<0,'Stock'].unique()
    df_1=df_1.loc[~df_1.Stock.isin(negative_stocks),:]

    # 2.3 Keep active stocks (those with a row on the latest remaining date)
    active_stocks=df_1.loc[df_1.Date==df_1.Date.max(),'Stock'].to_list()
    df_2=df_1.loc[df_1.Stock.isin(active_stocks),:].reset_index(drop=True)

    # 2.4 Add DATE_ORDER: 1..n rank of each row's Date within its stock
    df_2.loc[:,'DATE_ORDER']=df_2.groupby('Stock').Date.transform(lambda x:x.rank(method='first',ascending=True))

    # 2.5 Merging with the ticker reference table
    df_3=pd.merge(df_2,
                  stock_mapping.loc[:,['Symbol','Name','Country','IPOYear','Sector','Industry']],
                  how='left',
                  left_on='Stock',
                  right_on='Symbol'
                 )
    # NOTE(review): the rename only has an effect if the source table carries
    # a DAY_ORDER column; DATE_ORDER itself is already created in step 2.4.
    df_3=(df_3.drop(columns=['Symbol'])
              .rename(columns={'DAY_ORDER':'DATE_ORDER'})
              .sort_values(by=['Stock','Date'],ascending=True))

    ## 3. Saving ##
    joblib.dump(df_3,file_path)

    

# Model Implementation

In [None]:
def impl_linear_reg(file_path=''):
    """Fit a per-stock linear trend on the cleaned data and build a summary.

    Loads the frame stored at ``file_path``, runs ``linear_reg_analysis_for``
    on each stock, and enriches the result with a weighted coefficient, the
    model timestamp (US/Pacific), start/end prices, growth rate and ticker
    reference columns.

    Parameters
    ----------
    file_path : str
        Path of the joblib dump to load.

    Returns
    -------
    None
        NOTE(review): the summary frame is built but neither returned nor
        persisted here - confirm where it is meant to be written.
    """
    ## 1 Data Loading & Preprocessing ##
    # joblib.load unpickles the file: only point this at trusted dumps.
    df_3=joblib.load(file_path)
    print(df_3.shape)
    print(df_3.Date.max())

    ## 2 Modeling Implementing ##
    # apply() yields a (Stock, row) MultiIndex; reset_index exposes it as the
    # 'Stock' and 'level_1' columns, the latter being dropped further down.
    linear_reg_sum=df_3.groupby(['Stock']).apply(linear_reg_analysis_for).reset_index(drop=False)
    print(linear_reg_sum.shape)

    ## 3. Processing ##
    # 3.1 Add new columns
    linear_reg_sum.loc[:,'WT_Coef']=linear_reg_sum.R_squared*linear_reg_sum.Coef
    linear_reg_sum.loc[:,'Model_date']=datetime.now(tz=pytz.utc).astimezone(timezone('US/Pacific'))

    # 3.2 Additional tables
    # 'first'/'last' take the first and last Close in row order, so this
    # assumes rows are ordered by Date within each stock, as they are when the
    # frame is written by stock_data_cleaning. ('end' is not an aggregation.)
    stock_strt_end_price=df_3.groupby('Stock').agg(start_price=('Close','first'),
                                                   end_price=('Close','last')
                                                  ).reset_index(drop=False)
    linear_reg_sum_2=pd.merge(linear_reg_sum,
                              stock_strt_end_price,
                              how='left',
                              on='Stock'
                             ).assign(growth_rate=lambda x:(x.end_price-x.start_price)/x.start_price)

    linear_reg_sum_2=pd.merge(linear_reg_sum_2,
                              stock_mapping.loc[:,['Symbol','Name','Industry','SE']],
                              how='left',
                              left_on='Stock',
                              right_on='Symbol')

    linear_reg_sum_2=linear_reg_sum_2.drop(columns='level_1')
    
    
    
    
    

In [5]:
# Scratch check: take the current UTC time and view it in US/Pacific.
curr_date=datetime.now(pytz.utc)
print(curr_date)
curr_date.astimezone(pytz.timezone('US/Pacific'))

2022-03-02 06:12:27.783721+00:00


datetime.datetime(2022, 3, 1, 22, 12, 27, 783721, tzinfo=<DstTzInfo 'US/Pacific' PST-1 day, 16:00:00 STD>)