In [1]:
import pandas as pd
import numpy as np
import os
import time
from random import randint
import re
import time
import requests

## Acquire Data: Get financial data for S&P 500 stocks for the past 10 years.

In [2]:
# Load the S&P 500 constituents workbook and keep the ticker symbols as a plain list
sp500 = pd.read_excel('./S&P500_Stocks.xlsx')
tickers = sp500['Ticker'].tolist()

**Download the Financial data from http://stockrow.com**

In [None]:
# Download the five quarterly (MRQ) financial workbooks of each stock from
# https://stockrow.com and save them as one multi-sheet Excel file per ticker.

STOCKROW_URL = ('https://stockrow.com/api/companies/{ticker}/financials.xlsx'
                '?dimension=MRQ&section={section}')

# (sheet name in the saved workbook, URL-encoded "section" query value)
STOCKROW_SECTIONS = [
    ('income', 'Income%20Statement'),
    ('balance_sheet', 'Balance%20Sheet'),
    ('cash_flow', 'Cash%20Flow'),
    ('metrics', 'Metrics'),
    ('growth', 'Growth'),
]

# The consolidation cell further down reads the per-stock files from ./Data,
# so the workbooks are written there.
DATA_DIR = './Data'
os.makedirs(DATA_DIR, exist_ok=True)

# `tickers[0]` is a single string, so looping over it walks its characters
# (eg. 'MMM' -> 'M', 'M', 'M') instead of stocks. Slice the list instead:
# first ticker only as a trial run; use `tickers` to download every stock.
stocks_to_download = tickers[:1]

for position, stock in enumerate(stocks_to_download):

    # Get the data from https://stockrow.com
    statements = [
        (sheet, pd.read_excel(STOCKROW_URL.format(ticker=str(stock), section=section)))
        for sheet, section in STOCKROW_SECTIONS
    ]

    # Write to Excel. The context manager saves and closes the workbook;
    # without it the file is never flushed to disk.
    fname = os.path.join(DATA_DIR, str(stock) + ".xlsx")
    with pd.ExcelWriter(fname) as writer:
        for sheet, frame in statements:
            frame.to_excel(writer, sheet_name=sheet)

    # Prevent DDOS: pause a random 5-15 secs between stocks (not after the last one)
    if position < len(stocks_to_download) - 1:
        time.sleep(randint(5, 15))


print('~~~Download Complete~~~')

**Create consolidated lists of the Income Statement, Balance Sheet, Cash Flow, Metrics and Growth data**

In [None]:
# Build one consolidated 'MasterList' DataFrame per category by stacking the
# matching sheet of every per-stock workbook found in ./Data.

data_dir = os.path.join(os.getcwd(), 'Data')  # directory with the stock files (portable path)
sheet_names = ['income', 'balance_sheet', 'cash_flow', 'metrics', 'growth']

# Only real workbooks: skip stray files and Excel's '~$' lock files.
# Sorted so the row order of the master lists is reproducible.
files = sorted(
    f for f in os.listdir(data_dir)
    if f.lower().endswith('.xlsx') and not f.startswith('~$')
)

# Collect the per-stock frames first and concatenate once per category;
# growing a DataFrame with concat inside the loop copies it on every pass.
frames_by_sheet = {name: [] for name in sheet_names}

for f in files:

    # Stock ticker from filename (eg. AAPL.xlsx). splitext keeps dotted
    # tickers whole (eg. BRK.B.xlsx -> BRK.B) where split('.')[0] would cut them.
    ticker = os.path.splitext(f)[0]
    filepath = os.path.join(data_dir, f)  # file path for each stock file

    # For each file, read every tab (eg. income, balance_sheet etc), transpose it
    # and tag the rows with the ticker.
    for name in sheet_names:
        # `sheet_name` is the current keyword; the old `sheetname` spelling was removed from pandas.
        # NOTE(review): newer pandas may also need index_col=0 here so the row
        # labels become the index before transposing -- confirm against the saved files.
        temp_df = pd.read_excel(filepath, sheet_name=name).transpose()
        temp_df['Ticker'] = ticker
        frames_by_sheet[name].append(temp_df)

# One DataFrame per category, in the same order as sheet_names
# (an empty DataFrame when no workbook was found).
datasets = [
    pd.concat(frames_by_sheet[name]) if frames_by_sheet[name] else pd.DataFrame()
    for name in sheet_names
]

# Assign values into the named master lists
income_df, balanceSheet_df, cashFlow_df, metrics_df, growth_df = datasets

**Get historical prices for S&P500 stocks and the S&P index**

Define a helper function that scrapes Yahoo Finance for the historical stock data and returns the results in a DataFrame.

In [None]:
def get_yahoo_ticker_data(ticker, period_start=1199163600, interval='1d'):
    '''
    Scrape historical price data for one symbol from Yahoo Finance.

    Source credit for scraping Yahoo Finance:
    https://github.com/bradlucas/get-yahoo-quotes-python/blob/master/get-yahoo-quotes.py

    Parameters
    ----------
    ticker : str
        Yahoo Finance symbol (eg. 'AAPL', 'AMZN', '^GSPC').
    period_start : int, optional
        Start of the history as a Unix timestamp. The default, 1199163600
        (2008-01-01 05:00 UTC), is the value that used to be hard-coded in the URL.
        Use 0 for the maximum available history.
    interval : str, optional
        '1d' for daily (default), '1wk' for weekly or '1mo' for monthly rows.

    Returns
    -------
    pandas.DataFrame
        One row per CSV data line with the columns Date, Open, High, Low, Close,
        Adj_Close and Volume. Values are kept as the strings that were downloaded.
        The history ends at the time of the call.

    Raises
    ------
    KeyError
        If the history page does not set the 'B' cookie.
    ValueError
        If no crumb can be found in the history page.
    requests.HTTPError
        If the CSV download returns an error status.
    '''
    # Scrape Yahoo Finance: the download endpoint needs the 'B' cookie and the
    # "crumb" token embedded in the history page.
    #---------------------------------------------------
    history_page = requests.get('https://finance.yahoo.com/quote/' + ticker + '/history',
                                timeout=30)
    yahoo_cookie = history_page.cookies['B']

    # Raw string so that '\{' is a regex escape rather than an invalid string escape
    crumb_match = re.search(r'"CrumbStore":\{"crumb":"(?P<crumb>[^"]+)"\}', history_page.text)
    if crumb_match is None:
        # Fail here instead of sending a request with crumb=None
        raise ValueError('Could not find the Yahoo Finance crumb for ' + ticker)
    # The page embeds the crumb JSON-escaped, so a '/' may arrive as the literal
    # text \u002F (the credited script applies the same replacement).
    yahoo_crumb = crumb_match.group('crumb').replace('\\u002F', '/')

    current_date = int(time.time()) # Can set your own custom end time.
    url_price = ('https://query1.finance.yahoo.com/v7/finance/download/'
                 '{symbol}?period1={period_start}&period2={timestamp_end}'
                 '&interval={interval}&events=history'
                 '&crumb={crumb}').format(symbol=ticker,
                                          period_start=period_start,
                                          timestamp_end=current_date,
                                          interval=interval,
                                          crumb=yahoo_crumb)

    time.sleep(1)
    response = requests.get(url_price, cookies={'B': yahoo_cookie}, timeout=30)
    # Surface HTTP errors instead of parsing an error page as price data
    response.raise_for_status()

    # Create pandas data frame from the downloaded CSV text
    #---------------------------------------------------
    csv_text = response.content.decode('utf-8')
    header_row = ['Date','Open','High','Low','Close','Adj_Close','Volume']
    # Skip the CSV header line and any blank line; the trailing newline used to
    # produce one all-null row per ticker.
    data_rows = [line.split(',') for line in csv_text.splitlines()[1:] if line.strip()]

    return pd.DataFrame(data_rows, columns=header_row)

Create the `stock_prices` DataFrame, a consolidated dataset of the historical stock prices for the S&P500 stocks (from Jan 1, 2008 to Jan 31, 2018).

In [None]:
# Download the price history of each ticker and stack the results into `stock_prices`.
missed_stocks = [] # Capture any stocks missed due to an error while scraping Yahoo Finance
price_frames = []  # One DataFrame per successfully downloaded ticker

for stock in tickers[0:10]:
    print ('Stock:', stock)
    try: #if we can successfully get the stock price data
        current_stock = get_yahoo_ticker_data(stock)
        current_stock['Ticker'] = stock
        price_frames.append(current_stock)

    except Exception as ex:# generate list of stocks for which we were unable to obtain stock data
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print (message)

        missed_stocks.append(stock)

# Concatenate once at the end instead of inside the loop. Starting from
# pd.DataFrame({'Tickers': []}) also added an always-empty 'Tickers' column
# next to the real 'Ticker' column.
stock_prices = pd.concat(price_frames) if price_frames else pd.DataFrame()

print (missed_stocks) # Note: re-run the above script with these missed stocks as required

Add the S&P 500 index (ticker: ^GSPC)

In [None]:
# Fetch the index history (Yahoo symbol '^GSPC'), label its rows 'S&P500'
# and append them to stock_prices. Any failure is reported, not raised.
try:
    sp500 = get_yahoo_ticker_data('^GSPC')
    sp500['Ticker'] = 'S&P500'
    stock_prices = pd.concat([stock_prices, sp500])
except Exception as ex:
    # Report the exception type and its arguments
    print ("An exception of type {0} occurred. Arguments:\n{1!r}".format(type(ex).__name__, ex.args))

*(Optional) Save to excel and to pickle*

In [None]:
# Persist the five consolidated statement tables, first as Excel and then as pickle.
# stock_prices is only pickled: the Excel export would be a large file (100+MB).
statement_exports = [
    (income_df, 'income.xlsx', 'income.pkl'),
    (balanceSheet_df, 'balanceSheet.xlsx', 'balanceSheet.pkl'),
    (cashFlow_df, 'cashFlow.xlsx', 'cashFlow.pkl'),
    (metrics_df, 'metrics.xlsx', 'metrics.pkl'),
    (growth_df, 'growth.xlsx', 'growth.pkl'),
]

for frame, excel_path, _ in statement_exports:
    frame.to_excel(excel_path)

for frame, _, pickle_path in statement_exports:
    frame.to_pickle(pickle_path)

# NOTE(review): the "Read dataframes from pickle" cell loads 'stock_prices.pkl',
# not 'stock_prices_v5.pkl' -- confirm which file name is intended.
stock_prices.to_pickle('stock_prices_v5.pkl')

---------
## Data Wrangling: Building the Dataset for Analysis

To create our stock buy/sell prediction model, we will look at the following financial metrics (features):
https://www.investopedia.com/articles/fundamental-analysis/09/five-must-have-metrics-value-investors.asp
1. `Price-to-Earnings Ratio (P/E)` : calculated by Market Price per share / Earnings per Share (annual)
2. `Price-to-Book Ratio (P/B)` : calculated by Market Price per share / Book Value per Share  
3. `Debt-Equity Ratio (D/E)` : available in the metrics dataframe
4. `Free Cash Flow` : available in the cashFlow dataframe
5. `Price/Earnings to Growth Ratio (PEG)` : calculated as P/E ratio / EPS growth in the period
    
    

*(Optional) Read dataframes from pickle*

In [73]:
# Restore the six working DataFrames from their pickles in one pass.
# NOTE(review): the save cell above writes 'stock_prices_v5.pkl', while this
# cell reads 'stock_prices.pkl' -- confirm which file name is intended.
pickle_paths = ('income.pkl', 'balanceSheet.pkl', 'cashFlow.pkl',
                'metrics.pkl', 'growth.pkl', 'stock_prices.pkl')

(income_df, balanceSheet_df, cashFlow_df,
 metrics_df, growth_df, stock_prices) = [pd.read_pickle(path) for path in pickle_paths]

Set the name for index. This will be useful later on when we reset index to extract date as a feature.

In [74]:
# Label the index of every statement table 'Date'
for statement_df in (income_df, balanceSheet_df, cashFlow_df, metrics_df, growth_df):
    statement_df.index.name = 'Date'

A quick examination of the dtypes of the indexes reveals that `income_df`, `metrics_df`, and `growth_df` contain some non-date index values. A few entries are labelled "Unnamed: 19" or "Unnamed: 20" instead of a date; these rows can be removed.

In [75]:
# Report the dtype of each statement table's index
index_reports = [
    ('Dtype of income_df index: ', income_df),
    ('Dtype of balanceSheet_df index: ', balanceSheet_df),
    ('Dtype of cashFlow_df index: ', cashFlow_df),
    ('Dtype of metrics_df index: ', metrics_df),
    ('Dtype of growth_df index: ', growth_df),
]
for caption, frame in index_reports:
    print (caption, frame.index.dtype)

Dtype of income_df index:  object
Dtype of balanceSheet_df index:  datetime64[ns]
Dtype of cashFlow_df index:  datetime64[ns]
Dtype of metrics_df index:  object
Dtype of growth_df index:  object


In [76]:
# Optional: list and count the index labels of growth_df that are strings.
# Swap in another DataFrame name to inspect it instead.
counter = 0
for i, label in enumerate(growth_df.index):
    # a string label is not a date
    if type(label) is str:
        print ('i: ', i, 'index: ', label) # debug print
        counter += 1
print ('Total # of rows with "unnamed" index: ', counter)

i:  741 index:  Unnamed: 15
i:  1108 index:  Unnamed: 15
i:  1835 index:  Unnamed: 23
i:  2537 index:  Unnamed: 1
i:  3194 index:  Unnamed: 31
i:  3627 index:  Unnamed: 29
i:  4350 index:  Unnamed: 17
i:  4351 index:  Unnamed: 18
i:  4479 index:  Unnamed: 9
i:  4986 index:  Unnamed: 36
i:  5653 index:  Unnamed: 2
i:  5654 index:  Unnamed: 3
i:  5655 index:  Unnamed: 4
i:  6710 index:  Unnamed: 23
i:  7286 index:  Unnamed: 2
i:  7287 index:  Unnamed: 3
i:  7411 index:  Unnamed: 38
i:  7675 index:  Unnamed: 38
i:  8055 index:  Unnamed: 27
i:  8294 index:  Unnamed: 15
i:  8295 index:  Unnamed: 16
i:  8460 index:  Unnamed: 8
i:  8829 index:  Unnamed: 15
i:  8992 index:  Unnamed: 6
i:  8993 index:  Unnamed: 7
i:  9481 index:  Unnamed: 38
i:  10009 index:  Unnamed: 18
i:  10010 index:  Unnamed: 19
i:  10744 index:  Unnamed: 25
i:  10745 index:  Unnamed: 26
i:  12078 index:  Unnamed: 15
i:  12137 index:  Unnamed: 19
i:  12439 index:  Unnamed: 27
i:  12440 index:  Unnamed: 28
i:  12809 index: 

Change index to type DateTime. Passing in the argument `errors='coerce'` will change all non-date values (eg. 'Unnamed: 20') to 'NaT'

In [77]:
# For each table with a mixed index: parse the index as datetimes
# (errors='coerce' turns every non-date label into NaT), then keep only the
# rows whose index is a real date.
income_df.index = pd.to_datetime(income_df.index, errors='coerce')
income_df = income_df[pd.notnull(income_df.index)]

metrics_df.index = pd.to_datetime(metrics_df.index, errors='coerce')
metrics_df = metrics_df[pd.notnull(metrics_df.index)]

growth_df.index = pd.to_datetime(growth_df.index, errors='coerce')
growth_df = growth_df[pd.notnull(growth_df.index)]

Reset index to bring date as a Feature column

In [78]:
# Move the index of every statement table into a regular column (modifies each table in place)
for statement_df in (income_df, balanceSheet_df, cashFlow_df, metrics_df, growth_df):
    statement_df.reset_index(inplace=True)

Review shape of DataFrames

In [79]:
# Print (rows, columns) of each statement table
for statement_df in (income_df, balanceSheet_df, cashFlow_df, metrics_df, growth_df):
    print (statement_df.shape)

(18771, 31)
(18679, 28)
(18752, 16)
(18649, 23)
(18585, 15)


Let's create a consolidated DataFrame, called `all_df`, that combines the data from the 5 datasets above. Later on, we will also bring in the S&P500 price info.

In [80]:
# Outer-join the five statement tables on (Date, Ticker), starting from a copy of income_df
datasets = [income_df, balanceSheet_df, cashFlow_df, metrics_df, growth_df]
all_df = datasets[0].copy()
for df in datasets[1:]:
    all_df = all_df.merge(df, how='outer', on=['Date', 'Ticker'])

# column names: spaces become underscores
all_df.columns = all_df.columns.str.replace(' ', '_')

print(all_df.shape)
all_df.head()

(18817, 105)


Unnamed: 0,Date,Consolidated_Income,Cost_of_Revenue,Dividends_per_Basic_Common_Share,EBIT,EBIT_Margin,EBITDA,EBITDA_Margin,EPS,EPS_Diluted,...,Dividends_per_Basic_Common_Share_Growth,EBIT_Growth,EPS_Diluted_Growth,EPS_Growth,Gross_Profit_Growth,Inventory_Growth,Net_Income_Growth,Operating_Cash_Flow_Growth,Trade_and_Non-Trade_Receivables_Growth,Weighted_Average_Shares_Diluted_Growth
0,2017-10-31,177000000.0,542000000.0,0.13,246000000.0,0.2069,298000000.0,0.251,0.55,0.54,...,0.0833,0.4471,0.4211,0.4103,0.1003,0.0788,0.4048,0.2308,0.1474,0.0
1,2017-07-31,175000000.0,518000000.0,0.13,212000000.0,0.1903,263000000.0,0.236,0.55,0.54,...,0.0833,0.404,0.4211,0.4474,0.0996,0.0424,0.4113,0.1753,0.1492,-0.0061
2,2017-04-30,164000000.0,510000000.0,0.13,211000000.0,0.1915,265000000.0,0.24,0.51,0.5,...,0.0833,0.563,0.7857,0.8214,0.117,-0.0126,0.8022,0.0118,0.1246,-0.0091
3,2017-01-31,168000000.0,493000000.0,0.13,213000000.0,0.1996,268000000.0,0.251,0.52,0.52,...,0.0833,0.3313,0.4444,0.4054,0.0689,-0.0054,0.3884,0.045,0.0583,-0.0181
4,2016-10-31,126000000.0,523000000.0,0.12,170000000.0,0.153,226000000.0,0.203,0.39,0.38,...,0.2,0.0692,-0.0952,-0.0714,0.0991,-0.0148,-0.1,-0.029,0.0413,-0.009


Let's add in the S&P 500 stock price data.

We will have to pay particular attention to the `Date` column while doing the lookup. The financial datasets (now combined in `all_df`) contain some dates that fall on a weekend or a market holiday, for which there are no prices. We will have to move those dates back to the previous business day before doing the merge.

In [81]:
# Parse stock_prices' Date column as datetimes; unparseable values become NaT
stock_prices['Date'] = pd.to_datetime(stock_prices['Date'], errors='coerce')
stock_prices.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1229684 entries, 0 to 2540
Data columns (total 8 columns):
Adj_Close    1229178 non-null object
Close        1229178 non-null object
Date         1229178 non-null datetime64[ns]
High         1229178 non-null object
Low          1229178 non-null object
Open         1229178 non-null object
Ticker       1229684 non-null object
Volume       1229178 non-null object
dtypes: datetime64[ns](1), object(7)
memory usage: 84.4+ MB


We will replace the Date column with a modified version that accounts for Weekends and Holidays.

In [82]:
#Create custom calendar of the holidays that NYSE observes

from datetime import datetime
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday, nearest_workday, \
    sunday_to_monday, USMartinLutherKingJr, USPresidentsDay, GoodFriday, USMemorialDay, \
    USLaborDay, USThanksgivingDay
from pandas.tseries.offsets import BDay

class USTradingCalendar(AbstractHolidayCalendar):
    '''
    Holiday calendar approximating the regular NYSE market closures.

    Independence Day and Christmas move to the nearest weekday when they fall on
    a weekend. New Year's Day only moves from Sunday to Monday: when 1 Jan is a
    Saturday the exchange still trades on the preceding Friday (eg. 2010-12-31),
    so that Friday must not be marked as a closure -- it is also a quarter-end date.
    One-off closures (eg. weather related) are not covered.
    '''
    rules = [
        Holiday('NewYearsDay', month=1, day=1, observance=sunday_to_monday),
        USMartinLutherKingJr,
        USPresidentsDay,
        GoodFriday,
        USMemorialDay,
        Holiday('USIndependenceDay', month=7, day=4, observance=nearest_workday),
        USLaborDay,
        USThanksgivingDay,
        Holiday('Christmas', month=12, day=25, observance=nearest_workday)
    ]

cal = USTradingCalendar()
NYSE_closures = cal.holidays(datetime(2008, 1, 1), datetime(2018, 1, 31)) #list of all NYSE closures

In [83]:
# Keep the reported date as 'Date_old' and derive an adjusted 'Date' from it
all_df.rename(columns={'Date':'Date_old'}, inplace = True)

def _previous_bday_if_weekend(day):
    # Monday-Friday (dayofweek 0-4) stay as they are; anything else steps back one business day
    if day.dayofweek in (0, 1, 2, 3, 4):
        return day
    return day - BDay(1)

def _previous_bday_if_closure(day):
    # Dates listed in NYSE_closures step back one business day
    if day in NYSE_closures:
        return day - BDay(1)
    return day

# Weekend adjustment first, then the holiday adjustment on its result
all_df['Date'] = all_df['Date_old'].apply(_previous_bday_if_weekend)
all_df['Date'] = all_df['Date'].apply(_previous_bday_if_closure)

# Review our changes
all_df[['Date_old','Date']].head()

Unnamed: 0,Date_old,Date
0,2017-10-31,2017-10-31
1,2017-07-31,2017-07-31
2,2017-04-30,2017-04-28
3,2017-01-31,2017-01-31
4,2016-10-31,2016-10-31
5,2016-07-31,2016-07-29
6,2016-04-30,2016-04-29
7,2016-01-31,2016-01-29
8,2015-10-31,2015-10-30
9,2015-07-31,2015-07-31


Notice that in the 3rd entry, the formula changed the date 

    From: 2017-04-30 (Sun)    
    To:   2017-04-28 (Fri)

Next, let's merge the historical stock price data from `stock_prices` into `all_df`, matching on `Ticker` and `Date`. After that, we will also add a column with the S&P 500 index price to `all_df`.

In [84]:
# Left-join the price columns onto all_df by (Ticker, Date); rows without a price match are kept
all_df = all_df.merge(stock_prices, how='left', on=['Ticker', 'Date'])
print(all_df.shape)

(18817, 112)


Finally, let's add an `SP500` column that holds the S&P500 index closing price for each date.

In [85]:
# Take the Date and Close of the rows tagged 'S&P500' and rename Close to SP500
is_index_row = stock_prices['Ticker'] == 'S&P500'
sp500 = stock_prices.loc[is_index_row, ['Date', 'Close']].rename(columns={'Close': 'SP500'})

# Left-join the index price onto all_df by Date
all_df = all_df.merge(sp500, how='left', on=['Date'])
print(all_df.shape)

(18817, 113)


In [86]:
# Columns: print the array of column labels of all_df
print(all_df.columns.values)

['Date_old' 'Consolidated_Income' 'Cost_of_Revenue'
 'Dividends_per_Basic_Common_Share' 'EBIT' 'EBIT_Margin' 'EBITDA'
 'EBITDA_Margin' 'EPS' 'EPS_Diluted' 'Earnings_before_Tax'
 'Free_Cash_Flow_Margin' 'Gross_Margin' 'Gross_Profit' 'Income_Tax_Expense'
 'Interest_Expense' 'Net_Income' 'Net_Income_Common_Stock'
 'Net_Income_from_Discontinued_Operations'
 'Net_Income_to_Non-Controlling_Interests' 'Operating_Expenses'
 'Operating_Income' 'Preferred_Dividends_Income_Statement_Impact'
 'Profit_Margin' 'Research_and_Development_Expense' 'Revenue_Growth'
 'Revenues' 'Selling,_General_and_Administrative_Expense' 'Ticker'
 'Weighted_Average_Shares' 'Weighted_Average_Shares_Diluted'
 'Accumulated_Other_Comprehensive_Income'
 'Accumulated_Retained_Earnings_(Deficit)' 'Assets_Non-Current'
 'Cash_and_Equivalents' 'Cash_and_Short_Term_Investments' 'Current_Assets'
 'Current_Liabilities' 'Debt_Current' 'Debt_Non-Current' 'Deferred_Revenue'
 'Deposit_Liabilities' 'Goodwill_and_Intangible_Assets' 'Inve

In [87]:
# Preview the first rows of all_df (shown as this cell's output)
all_df.head()

Unnamed: 0,Date_old,Consolidated_Income,Cost_of_Revenue,Dividends_per_Basic_Common_Share,EBIT,EBIT_Margin,EBITDA,EBITDA_Margin,EPS,EPS_Diluted,...,Trade_and_Non-Trade_Receivables_Growth,Weighted_Average_Shares_Diluted_Growth,Date,Adj_Close,Close,High,Low,Open,Volume,SP500
0,2017-10-31,177000000.0,542000000.0,0.13,246000000.0,0.2069,298000000.0,0.251,0.55,0.54,...,0.1474,0.0,2017-10-31,67.879715,68.029999,68.169998,67.540001,67.739998,1614400,2575.26001
1,2017-07-31,175000000.0,518000000.0,0.13,212000000.0,0.1903,263000000.0,0.236,0.55,0.54,...,0.1492,-0.0061,2017-07-31,59.535259,59.790001,60.619999,59.759998,60.32,1038700,2470.300049
2,2017-04-30,164000000.0,510000000.0,0.13,211000000.0,0.1915,265000000.0,0.24,0.51,0.5,...,0.1246,-0.0091,2017-04-28,54.693642,55.049999,55.349998,54.84,55.310001,3076600,2384.199951
3,2017-01-31,168000000.0,493000000.0,0.13,213000000.0,0.1996,268000000.0,0.251,0.52,0.52,...,0.0583,-0.0181,2017-01-31,48.532555,48.970001,49.150002,48.009998,48.040001,2552600,2278.870117
4,2016-10-31,126000000.0,523000000.0,0.12,170000000.0,0.153,226000000.0,0.203,0.39,0.38,...,0.0413,-0.009,2016-10-31,43.056118,43.57,43.700001,43.18,43.330002,1556800,2126.149902


*(Optional) Save all our changes to pickle*

In [88]:
# Checkpoint every working DataFrame under its "_v2" pickle name
v2_checkpoints = [
    (income_df, 'income_v2.pkl'),
    (balanceSheet_df, 'balanceSheet_v2.pkl'),
    (cashFlow_df, 'cashFlow_v2.pkl'),
    (metrics_df, 'metrics_v2.pkl'),
    (growth_df, 'growth_v2.pkl'),
    (stock_prices, 'stock_prices_v2.pkl'),
    (all_df, 'all_df_v2.pkl'),
]
for frame, pickle_path in v2_checkpoints:
    frame.to_pickle(pickle_path)

-------------
## Build Model

Let's build a `ML_df` which has the following features

Features:
- Ticker
- Sector
- Industry
- Date 
- Stock Price close 
- %Change Stock 
- SP500 Price 
- %Change SP500 
- 5 ratios (listed above)
    1. `Price-to-Earnings Ratio (P/E)` : calculated by Market Price per share / Earnings per Share (annual)
    2. `Price-to-Book Ratio (P/B)` : calculated by Market Price per share / Book Value per Share  
    3. `Debt-Equity Ratio (D/E)` : available in the metrics dataframe
    4. `Free Cash Flow` : available in the cashFlow dataframe
    5. `Price/Earnings to Growth Ratio (PEG)` : calculated as P/E ratio / EPS growth in the period


Response:
- Buy/Sell label: based on whether the stock price outperformed/underperformed the S&P 500 index in the quarter

*(Optional) Read dataframes from pickle*

In [89]:
# Reload the "_v2" checkpoints so the model-building section can start from a fresh kernel
income_df = pd.read_pickle('income_v2.pkl')
balanceSheet_df = pd.read_pickle('balanceSheet_v2.pkl')
cashFlow_df = pd.read_pickle('cashFlow_v2.pkl')
metrics_df = pd.read_pickle('metrics_v2.pkl')
growth_df = pd.read_pickle('growth_v2.pkl')
stock_prices = pd.read_pickle('stock_prices_v2.pkl')
# Read all_df back as well. This line used to call all_df.to_pickle(...), which
# wrote the file again and needed all_df to exist already.
all_df = pd.read_pickle('all_df_v2.pkl')

In [90]:
# Columns carried over from all_df into the modelling table
ml_columns = ['Ticker','Date','Close','SP500','EPS','Book_Value_per_Share','Debt_to_Equity_Ratio','Free_Cash_Flow']

# .copy() gives ML_df its own data, so columns added to it later are not
# written into a slice of all_df (which pandas flags with SettingWithCopyWarning).
ML_df = all_df[ml_columns].copy()
ML_df.head()

Unnamed: 0,Ticker,Date,Close,SP500,EPS,Book_Value_per_Share,Debt_to_Equity_Ratio,Free_Cash_Flow
0,A,2017-10-31,68.029999,2575.26001,0.55,15.003,0.4163,230000000.0
1,A,2017-07-31,59.790001,2470.300049,0.55,14.355,0.4516,185000000.0
2,A,2017-04-28,55.049999,2384.199951,0.51,13.62,0.4673,214000000.0
3,A,2017-01-31,48.970001,2278.870117,0.52,13.345,0.4636,84000000.0
4,A,2016-10-31,43.57,2126.149902,0.39,13.015,0.4487,182000000.0


In [91]:
# Summarise ML_df: per-column non-null counts and dtypes
ML_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18817 entries, 0 to 18816
Data columns (total 8 columns):
Ticker                  18817 non-null object
Date                    18817 non-null datetime64[ns]
Close                   18522 non-null object
SP500                   18817 non-null object
EPS                     18717 non-null float64
Book_Value_per_Share    18649 non-null float64
Debt_to_Equity_Ratio    18649 non-null float64
Free_Cash_Flow          18752 non-null float64
dtypes: datetime64[ns](1), float64(4), object(3)
memory usage: 1.3+ MB


In [93]:
# Show the rows of ML_df whose Close value is missing
ML_df[ML_df.Close.isnull()]

Unnamed: 0,Ticker,Date,Close,SP500,EPS,Book_Value_per_Share,Debt_to_Equity_Ratio,Free_Cash_Flow
176,ABBV,2012-12-31,,1426.189941,0.98,2.133,4.6601,8.460000e+08
177,ABBV,2012-09-28,,1440.670044,1.01,,,2.267000e+09
178,ABBV,2012-06-29,,1362.160034,0.80,,,1.467000e+09
179,ABBV,2012-03-30,,1408.469971,0.56,,,1.432000e+09
1116,ALLE,2013-09-30,,1681.550049,-0.81,,,9.510000e+07
1117,ALLE,2013-06-28,,1606.280029,0.63,,,5.540000e+07
1118,ALLE,2013-03-28,,1569.189941,0.41,,,-4.900000e+06
1846,APTV,2011-09-30,,1131.420044,0.79,,,2.480000e+08
1847,APTV,2011-06-30,,1320.640015,0.88,,,2.320000e+08
1848,APTV,2011-03-31,,1325.829956,0.42,,,-2.500000e+07
