In [1]:
import pandas as pd
import numpy as np
import configparser as cp
from plotnine import *
import mysql.connector
from sqlalchemy import create_engine

In [2]:
## DB connection
# Credentials are kept outside the notebook in an INI-style file; the
# [ivan_db] section must provide `userid`, `pwd` and `hostname`.
config_path='/home/ubuntu/certi/db_login.txt'
config=cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so a missing/unreadable file fails with
# a clear message instead of an opaque KeyError on the section lookup below.
if not config.read(config_path):
    raise FileNotFoundError('Could not read DB config file: {0:s}'.format(config_path))
db_config=config['ivan_db']


## Engine for the STOCK_PRED schema (MySQL via mysql-connector)
# NOTE(review): userid/pwd are interpolated into the URL verbatim, so any
# special characters (e.g. '@', '/') must already be URL-encoded in the
# config file -- confirm against the stored credentials.
engine=create_engine('mysql+mysqlconnector://{0:s}:{1:s}@{2:s}/{3:s}'.format(db_config['userid'],
                                                                             db_config['pwd'],
                                                                             db_config['hostname'],
                                                                             'STOCK_PRED'
                                                                            ))
# Last expression: show the engine repr.
engine

Engine(mysql+mysqlconnector://ilong:***@ivandb.cnwzhaenwyah.us-east-2.rds.amazonaws.com:3306/STOCK_PRED)

# Data Loading

In [3]:
# Pull the full analysis table into memory and report its (rows, columns) shape.
query="""SELECT * FROM STOCK_PRED.STOCK_RECOMMENDS_ANALYSIS_1"""
df=pd.read_sql(sql=query,con=engine)
print(df.shape)

(3633064, 16)


## Overview 

In [4]:
# Inspect the dtype of every column in the loaded frame.
df.dtypes

DATE              datetime64[ns]
OPEN                     float64
HIGH                     float64
LOW                      float64
CLOSE                    float64
VOLUME                   float64
SE                        object
STOCK                     object
REFRESH_DATE      datetime64[ns]
RECOM_DATE        datetime64[ns]
RECOM_END_DATE    datetime64[ns]
FIRM                      object
TO_GRADE                  object
FROM_GRADE                object
ACTION                    object
RECOM_STOCK               object
dtype: object

### NAs & Negative Values

In [5]:
## NAs
# Null count per column, keeping only the columns that have at least one null.
df.isnull().sum().loc[lambda counts:counts>0].to_frame('vals')

Unnamed: 0,vals
OPEN,24
HIGH,24
LOW,24
CLOSE,24
VOLUME,24


In [12]:
# Peek at the first rows whose OPEN is missing.
df[df['OPEN'].isna()].head(5)

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOLUME,SE,STOCK,REFRESH_DATE,RECOM_DATE,RECOM_END_DATE,FIRM,TO_GRADE,FROM_GRADE,ACTION,RECOM_STOCK
80462,2021-08-02,,,,,,NYSE,CAG,2021-08-15,2021-07-14 10:32:40,2021-08-14 10:32:40,B of A Securities,Neutral,Buy,down,CAG
282398,2012-10-29,,,,,,NASDAQ,TXN,2021-08-15,2012-10-10 12:52:00,2012-11-10 12:52:00,Barclays,Equal-Weight,,main,TXN
627376,2018-12-05,,,,,,NASDAQ,GOGL,2021-08-15,2018-11-16 12:29:49,2018-12-16 12:29:49,BTIG Research,Buy,,init,GOGL
655879,2012-10-29,,,,,,NASDAQ,TXN,2021-08-15,2012-10-23 08:02:00,2012-11-23 08:02:00,Canaccord Genuity,Hold,,main,TXN
762408,2012-10-29,,,,,,NYSE,MS,2021-08-15,2012-10-19 08:28:00,2012-11-19 08:28:00,Citigroup,Neutral,,main,MS


In [7]:
## Negative
# Number of strictly negative entries in each float64 column (reported as floats).
df.select_dtypes(include=['float64']).lt(0).sum().astype(float).to_frame('vals')

Unnamed: 0,vals
OPEN,0.0
HIGH,0.0
LOW,0.0
CLOSE,0.0
VOLUME,0.0


In [8]:
## Remove NAs
# Drop every row that is missing any price/volume field. The null check above
# reports the same number of nulls in all five columns, but filtering on OPEN
# alone would let a row with a missing CLOSE (which the later price
# calculations read) slip through if the nulls ever stop lining up.
price_cols=['OPEN','HIGH','LOW','CLOSE','VOLUME']
df_1=df.dropna(subset=price_cols).reset_index(drop=True)
# Before/after shapes make the number of dropped rows visible.
print(df.shape)
print(df_1.shape)
df_1.head(5)

(3633064, 16)
(3633040, 16)


Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOLUME,SE,STOCK,REFRESH_DATE,RECOM_DATE,RECOM_END_DATE,FIRM,TO_GRADE,FROM_GRADE,ACTION,RECOM_STOCK
0,2012-07-18,287.41272,290.755188,286.989288,289.295654,3107900.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG
1,2012-07-19,291.975616,298.122559,291.905853,295.422668,9384300.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG
2,2012-07-20,303.243378,305.325562,297.973114,304.269531,12975800.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG
3,2012-07-23,299.118835,308.020477,298.007996,306.605774,7150000.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG
4,2012-07-24,306.351715,307.811249,301.041626,302.650604,4033800.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG


# Preprocessing

## Window Function

In [9]:
## Recommendation period: first and last DATE observed for each recommendation
# A recommendation is identified by (STOCK, RECOM_DATE, RECOM_END_DATE, FIRM, TO_GRADE).
recom_keys=['STOCK','RECOM_DATE','RECOM_END_DATE','FIRM','TO_GRADE']
# Build the grouping once and use the built-in 'min'/'max' reducers; they give
# the same result as `lambda x: x.min()` / `x.max()` without calling Python
# code once per group.
recom_dates=df_1.groupby(recom_keys)['DATE']
df_1['RECOM_PERIOD_STRT_DT']=recom_dates.transform('min')
df_1['RECOM_PERIOD_END_DT']=recom_dates.transform('max')


In [10]:
# Display df_1, which now carries the RECOM_PERIOD_STRT_DT / RECOM_PERIOD_END_DT
# columns (the rendered table below is abbreviated with a "..." row).
df_1

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOLUME,SE,STOCK,REFRESH_DATE,RECOM_DATE,RECOM_END_DATE,FIRM,TO_GRADE,FROM_GRADE,ACTION,RECOM_STOCK,RECOM_PERIOD_STRT_DT,RECOM_PERIOD_END_DT
0,2012-07-18,287.412720,290.755188,286.989288,289.295654,3107900.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG,2012-07-18,2012-08-17
1,2012-07-19,291.975616,298.122559,291.905853,295.422668,9384300.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG,2012-07-18,2012-08-17
2,2012-07-20,303.243378,305.325562,297.973114,304.269531,12975800.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG,2012-07-18,2012-08-17
3,2012-07-23,299.118835,308.020477,298.007996,306.605774,7150000.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG,2012-07-18,2012-08-17
4,2012-07-24,306.351715,307.811249,301.041626,302.650604,4033800.0,NASDAQ,GOOG,2021-08-15,2012-07-17 09:33:00,2012-08-17 09:33:00,,Outperform,,main,GOOG,2012-07-18,2012-08-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3633035,2019-05-16,28.850000,31.150000,28.450001,30.860001,17612000.0,NYSE,PINS,2021-08-15,2019-04-22 13:28:54,2019-05-22 13:28:54,Zephrin Group,Sell,,init,PINS,2019-04-23,2019-05-22
3633036,2019-05-17,25.870001,28.120001,25.700001,26.700001,26271800.0,NYSE,PINS,2021-08-15,2019-04-22 13:28:54,2019-05-22 13:28:54,Zephrin Group,Sell,,init,PINS,2019-04-23,2019-05-22
3633037,2019-05-20,27.240000,27.240000,25.709999,25.850000,7269200.0,NYSE,PINS,2021-08-15,2019-04-22 13:28:54,2019-05-22 13:28:54,Zephrin Group,Sell,,init,PINS,2019-04-23,2019-05-22
3633038,2019-05-21,26.299999,26.549999,25.299999,25.410000,6481700.0,NYSE,PINS,2021-08-15,2019-04-22 13:28:54,2019-05-22 13:28:54,Zephrin Group,Sell,,init,PINS,2019-04-23,2019-05-22


## Aggregate

In [11]:
## One row per recommendation: observed date range plus the CLOSE on its
## first and last observed DATE.
recom_keys=['STOCK','RECOM_DATE','RECOM_END_DATE','FIRM','TO_GRADE']
recom_dates=df_1.groupby(recom_keys)['DATE']

# First / last DATE seen for each recommendation.
period=recom_dates.agg(['min','max']).rename(columns={'min':'RECOM_PERIOD_STRT_DT',
                                                      'max':'RECOM_PERIOD_END_DT'})

# CLOSE on the first / last DATE of each recommendation. `max` is kept as the
# tie-breaker in case a group has several rows for the same DATE. Selecting
# the matching rows first removes the need for a 0 placeholder on the
# non-matching rows, so a bogus 0 can never be reported as a price.
start_price=(df_1.loc[df_1['DATE']==recom_dates.transform('min')]
                 .groupby(recom_keys)['CLOSE'].max().rename('START_PRICE'))
end_price=(df_1.loc[df_1['DATE']==recom_dates.transform('max')]
               .groupby(recom_keys)['CLOSE'].max().rename('END_PRICE'))

# All three pieces share the same group index, so an index join lines them up.
df_summary=period.join(start_price).join(end_price).reset_index(drop=False)
df_summary.shape
## The aggregation above is fully vectorised, so a parallel groupby-apply is
## not needed; the approach previously considered is kept for reference:
# https://stackoverflow.com/questions/26187759/parallelize-apply-after-pandas-groupby

In [22]:
# df_summary.loc[(df_summary.STOCK=='CRM')&(df_summary.FIRM=='Atlantic Equities'),:]

Unnamed: 0,STOCK,RECOM_DATE,RECOM_END_DATE,FIRM,TO_GRADE,RECOM_PERIOD_STRT_DT,RECOM_PERIOD_END_DT,START_PRICE,END_PRICE
41592,CRM,2013-11-04 07:35:20,2013-12-04 07:35:20,Atlantic Equities,Overweight,2013-11-05,2013-12-04,55.689999,52.27
41773,CRM,2019-01-09 13:39:18,2019-02-09 13:39:18,Atlantic Equities,Overweight,2019-01-10,2019-02-08,147.039993,156.669998


In [26]:
# Relative CLOSE change over the recommendation period:
# (END_PRICE - START_PRICE) / START_PRICE
price_change=df_summary['END_PRICE'].sub(df_summary['START_PRICE'])
df_summary['GROWTH_RATE']=price_change.div(df_summary['START_PRICE'])
df_summary.head(5)

Unnamed: 0,STOCK,RECOM_DATE,RECOM_END_DATE,FIRM,TO_GRADE,RECOM_PERIOD_STRT_DT,RECOM_PERIOD_END_DT,START_PRICE,END_PRICE,GROWTH_RATE
0,A,2012-02-16,2012-03-16,Deutsche Bank,Buy,2012-02-16,2012-03-16,28.688797,29.860443,0.04084
1,A,2012-03-06,2012-04-06,Morgan Stanley,Overweight,2012-03-06,2012-04-05,27.674252,29.370132,0.06128
2,A,2012-04-19,2012-05-19,Stifel Nicolaus,Buy,2012-04-19,2012-05-18,27.500467,25.407755,-0.076097
3,A,2012-05-15,2012-06-15,Morgan Stanley,Overweight,2012-05-15,2012-06-15,26.575478,26.247463,-0.012343
4,A,2012-05-15,2012-06-15,Stifel Nicolaus,Buy,2012-05-15,2012-06-15,26.575478,26.247463,-0.012343


### Loading 

In [None]:
#df_summary=pd.read_sql("""SELECT * FROM STOCK_PRED.STOCK_RECOMMENDS_ANALYSIS_AGG""",con=engine)
#print(df_summary.shape)

# Analysis

## Overview 

In [33]:
# GROWTH_RATE summary statistics per TO_GRADE, restricted to grades with more
# than 100 recommendations and ordered from most to least frequent.
grade_stats=df_summary.groupby('TO_GRADE')['GROWTH_RATE'].describe().reset_index()
grade_stats[grade_stats['count']>100].sort_values(by='count',ascending=False)


Unnamed: 0,TO_GRADE,count,mean,std,min,25%,50%,75%,max
7,Buy,47228.0,0.020186,0.148896,-0.857305,-0.044564,0.013829,0.071658,4.80292
27,Neutral,28832.0,0.026373,1.23358,-0.793866,-0.039856,0.01338,0.066493,207.81772
29,Outperform,22802.0,0.017357,0.135182,-0.848404,-0.043079,0.013645,0.070286,3.938557
31,Overweight,21292.0,0.021846,0.128754,-0.758903,-0.039161,0.016452,0.073957,1.652968
15,Hold,14437.0,0.017097,0.130365,-0.809561,-0.038611,0.013114,0.065223,3.197183
10,Equal-Weight,11891.0,0.026038,0.1328,-0.813608,-0.036151,0.018094,0.07818,2.263352
22,Market Perform,6976.0,0.049141,2.662928,-0.775025,-0.036853,0.011902,0.062999,222.155105
53,Underweight,3928.0,0.027734,0.145003,-0.655517,-0.03788,0.017583,0.081338,2.614767
51,Underperform,3631.0,0.015662,0.137996,-0.854631,-0.045525,0.011151,0.070372,1.774986
43,Sell,2993.0,0.0171,0.137732,-0.82,-0.046501,0.01433,0.072685,1.201991


In [None]:
## ANOVA testing