In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import sqlalchemy
pd.options.display.max_rows=999

### Add `reported_binary` Column

In [2]:
fcc = pd.read_csv("fcc_values.csv", parse_dates=["report_date"])
fcc = fcc.sort_values(["ticker", "report_date"])

In [3]:
def reported_binary(x):
    if x["reported"] > x["consensus"]:
        return 1
    elif x["reported"] < x["consensus"]:
        return -1
    elif x["reported"] == x["consensus"]:
        return 0
    else:
        return None

In [4]:
fcc["reported_binary"] = fcc.apply(lambda x: reported_binary(x), axis=1)

### Get Overtime Consensus

In [5]:
# Create database connection
database = 'archive'
username = 'quant'
password = '8ad7a2be0e3ec3fbc3a16fab306f08d8'
host     = '159.65.252.27'
connection_string = 'postgresql://' + username + ':' + password + '@' + host + '/' + database
engine   = sqlalchemy.create_engine(connection_string)

In [6]:
# find the minimum insert_date
query = """select min(insert_date) as start_date from scraping.raw_cm"""
start_date = pd.read_sql_query(query, engine)
start_date = pd.to_datetime(start_date["start_date"].values[0])

In [7]:
# get the data after we start scraping
uncovered_data = fcc[fcc["report_date"] < start_date]
covered_data = fcc[(fcc["report_date"] >= start_date) & (fcc["report_date"] <= pd.Timestamp.today())]
tickers = list(covered_data.ticker.unique())

In [8]:
# get the target_name_map
query = """select * from configs.target_name_map where ticker in """ + str(tickers).replace("[", "(").replace("]", ")")
target_name_map = pd.read_sql_query(query, engine)
target_name_map = target_name_map[["ticker", "cm_target_name", "cm_segment_name"]]

In [9]:
# get the fiscal year and quarter
query = """
select ticker, year, quarter, report_date 
from configs.quarter_name_map 
where insert_date = '2019-03-07'
and ticker in """ + str(tickers).replace("[", "(").replace("]", ")") + """
and report_date::date > '""" + str(start_date.date()) + "' order by 1, 2"
quarter_name_map = pd.read_sql_query(query, engine)

In [10]:
# merge covered_data with target_name_map and quarter_name_map
data = covered_data.merge(target_name_map, how='left').merge(quarter_name_map, how='left')
data["start_date"] = data.groupby("ticker")["report_date"].shift(1) + dt.timedelta(days=1)
data["stop_date"] = data["report_date"]

def update_start_date(x):
    if (pd.isnull(x["start_date"])) and (x["stop_date"] > start_date):
        return start_date
    else:
        return x["start_date"]

data["updated_start_date"] = data.apply(lambda x: update_start_date(x), axis=1)
del data["start_date"]

In [11]:
# query the data base and append the data
consensus_overtime = pd.DataFrame()
for i in range(len(data)):
    data_i = data.iloc[i]
    if pd.isnull(data_i["updated_start_date"]):
        continue
    else:
        query = """
         select * from scraping.raw_cm 
         where insert_date::date >= '""" + str(data_i["updated_start_date"].date()) + """' 
         and insert_date::date <= '""" + str(data_i["stop_date"].date()) + """'
         and ticker = '""" + data_i["ticker"] + """' 
         and metric_name = '""" + data_i["cm_target_name"] + """' 
         and segment_name = '""" + data_i["cm_segment_name"] + """' 
         and year = """ + str(int(data_i["year"])) + """ 
         and quarter = """ + str(int(data_i["quarter"])) + """ 
         order by 1"""
        try:
            consensus_i = pd.read_sql_query(query, engine)
            consensus_i.rename(columns={"reported": "consensus"}, inplace=True)
            consensus_i["report_date"] = data_i["report_date"]
            consensus_i["reported"] = data_i["reported"]
            consensus_i["reported_binary"] = data_i["reported_binary"]
            consensus_overtime = consensus_overtime.append(consensus_i)
        except:
            pass

In [12]:
consensus_overtime.rename(columns={"insert_date": "consensus_date"}, inplace=True)
consensus_overtime = consensus_overtime[["ticker", "report_date", "consensus_date", 
                                         "consensus", "reported", "reported_binary"]]

In [14]:
uncovered_data["consensus_date"] = np.nan
final_data = uncovered_data.append(consensus_overtime).reset_index(drop=True)
final_data = final_data[["ticker", "report_date", "consensus_date", 
                         "consensus", "reported", "reported_binary"]].sort_values(["ticker", "report_date"]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [16]:
final_data.to_csv("fcc_with_consensus_overtime.csv", index=False)