# DataProcessing

The DataMiner module provided us with relatively clean datasets. However, because these datasets come from multiple websites, some additional cleaning still needs to be done.

This module generates 4 new files (clear_timeseries.json, clear_financials.json, clear_overview.json, clear_time_series_monthly_adjusted.json) which are sufficient for data interpretation.

In [1]:
import json
import copy

from datetime import datetime

First we reconstruct the historical constituents of each index (SP500, NASDAQ, DJI).

In [5]:
# Current members of every index.
with open('constituents.json', 'r') as f:
    constituents = json.load(f)

# Log of additions/removals for every index.
with open('hist_constituents.json', 'r') as f_hist:
    hist_changes = json.load(f_hist)

def remove_blacklist_symbols(timeseries, blacklist):
    """Drop every blacklisted symbol from each year's constituent list.

    Each list is rebuilt and written back with slice assignment instead of
    calling ``list.remove`` while iterating: removing during iteration shifts
    the remaining items and makes the loop skip the element that follows each
    removed one, so adjacent blacklisted symbols would survive.

    Args:
        timeseries: dict mapping a year to a list of symbols. The lists are
            filtered in place, so other references to them see the result.
        blacklist: container of symbols to remove.

    Returns:
        The same ``timeseries`` dict that was passed in.
    """
    for symbols in timeseries.values():
        symbols[:] = [symbol for symbol in symbols if symbol not in blacklist]
    return timeseries
        
def get_historical_constituents(index, constituents, hist_changes, min_year=2000):
    """Rebuild the yearly membership of one index by undoing its change log.

    The current constituent list is filed under the year of the first change
    record. Every change is then undone in the order given: a symbol whose
    action is 'add' is taken out again, a symbol with any other action is put
    back. The list stored for a year is therefore the membership with all of
    that year's recorded changes undone. A year seen for the first time
    starts from a copy of the nearest later year.

    Processing stops at the first change dated before ``min_year``, so the
    change log is expected to be ordered newest first.

    Args:
        index: index symbol, matched against the 'symbol' key of the entries
            in ``constituents`` and ``hist_changes``.
        constituents: iterable of dicts with 'symbol' and 'constituents'
            (list of member symbols) keys. It is not modified.
        hist_changes: iterable of dicts with 'symbol' and
            'historicalConstituents' keys; the latter is a list of dicts with
            'date' ('%Y-%m-%d'), 'symbol' and 'action' keys.
        min_year: changes dated before this year end the reconstruction.

    Returns:
        dict mapping a year (int) to the list of member symbols.

    Raises:
        ValueError: if ``index`` is missing from either input, or if a change
            is newer than every year reconstructed so far (log not ordered
            newest first).
        IndexError: if the change log of ``index`` is empty.
    """
    constit = None
    for i in constituents:
        if i['symbol'] == index:
            constit = i['constituents']

    changes = None
    for i in hist_changes:
        if i['symbol'] == index:
            changes = i['historicalConstituents']

    if constit is None or changes is None:
        raise ValueError(
            'index {!r} is missing from the constituents or from the '
            'historical changes'.format(index))

    timeseries = {}
    # Symbols whose change record contradicts the reconstructed membership:
    # an 'add' for a symbol that is absent, or any other action for a symbol
    # that is still present. They are dropped from every year at the end.
    blacklist = []

    # Copy the starting list: the lists below are edited in place and the
    # caller's `constituents` must stay untouched.
    start_year = datetime.strptime(changes[0]['date'], '%Y-%m-%d').year
    timeseries[start_year] = copy.deepcopy(constit)

    for change in changes:
        year = datetime.strptime(change['date'], '%Y-%m-%d').year
        if year < min_year:
            break

        if year not in timeseries:
            # Seed a new year from the closest later year already rebuilt.
            later_years = [known for known in timeseries if known > year]
            if not later_years:
                raise ValueError(
                    'change dated {} is newer than every reconstructed year; '
                    'changes must be ordered newest first'.format(change['date']))
            timeseries[year] = copy.deepcopy(timeseries[min(later_years)])

        members = timeseries[year]
        symbol_change = change['symbol']
        if change['action'] == 'add':
            if symbol_change not in members:
                blacklist.append(symbol_change)
            else:
                members.remove(symbol_change)
        else:
            if symbol_change in members:
                blacklist.append(symbol_change)
            else:
                members.append(symbol_change)

    remove_blacklist_symbols(timeseries, blacklist)
    return timeseries
    
# Yearly membership of every tracked index, keyed by the index symbol.
timeseries_all_indexes = {
    index_symbol: get_historical_constituents(index_symbol, constituents, hist_changes)
    for index_symbol in ("^GSPC", "^NDX", "^DJI")
}
print(timeseries_all_indexes["^NDX"][2020])

['XEL', 'CTAS', 'EXC', 'ORLY', 'DLTR', 'ANSS', 'CSX', 'XLNX', 'CMCSA', 'MCHP', 'INCY', 'NXPI', 'SIRI', 'PEP', 'MXIM', 'CDNS', 'MNST', 'PAYX', 'TMUS', 'NTES', 'SPLK', 'SNPS', 'ADI', 'BIDU', 'REGN', 'COST', 'WBA', 'PYPL', 'GILD', 'KHC', 'FOX', 'LULU', 'VRSN', 'INTC', 'TXN', 'GOOG', 'ADP', 'CDW', 'FB', 'ILMN', 'AMD', 'ISRG', 'FOXA', 'CTSH', 'ALGN', 'CHTR', 'LRCX', 'CERN', 'CSCO', 'NFLX', 'MU', 'INTU', 'ADSK', 'VRSK', 'SGEN', 'FISV', 'BIIB', 'KLAC', 'AMAT', 'MSFT', 'AAPL', 'BKNG', 'MELI', 'ASML', 'WDAY', 'EA', 'SWKS', 'CPRT', 'TCOM', 'ADBE', 'EBAY', 'CHKP', 'TSLA', 'AMGN', 'VRTX', 'SBUX', 'ALXN', 'AMZN', 'IDXX', 'MDLZ', 'MAR', 'QCOM', 'ATVI', 'JD', 'ROST', 'FAST', 'NVDA', 'AVGO', 'PCAR', 'GOOGL', 'BMRN', 'CTXS', 'EXPE', 'LBTYK', 'LBTYA', 'TTWO', 'ULTA', 'WDC', 'NTAP', 'CSGP', 'UAL', 'WLTW', 'AAL']


Next we remove from our data the symbols whose data could not be obtained via the API.

In [37]:
# Gather the symbols listed in each of the three fail_*.json files.
blacklist = []
for failed_path in ('fail_financials.json',
                    'fail_overview.json',
                    'fail_time_series_monthly_adjusted.json'):
    with open(failed_path, 'r') as failed_file:
        blacklist.extend(json.load(failed_file))

# Strip those symbols from every year of every index.
for timeseries in timeseries_all_indexes.values():
    remove_blacklist_symbols(timeseries, blacklist)

# Persist the cleaned membership history.
with open('clear_timeseries.json', 'w') as f:
    json.dump(timeseries_all_indexes, f)

After this step, the cleaned historical time series of the index constituents is saved in clear_timeseries.json.

Now we will restructure financials.json, overview.json and time_series_monthly_adjusted.json so that each company's data can be accessed by its ticker. In addition, for time_series_monthly_adjusted.json we keep only the date and the adjusted close price.

In [7]:
# Re-key the financial reports by ticker.
with open('financials.json', 'r') as f:
    financials = json.load(f)

clear_financials = {}
for entry in financials:
    reports = entry['results']
    try:
        first_report = reports[0]
    except IndexError:
        # No reports for this company: nothing to read a ticker from.
        continue
    clear_financials[first_report['ticker']] = reports

with open('clear_financials.json', 'w') as f:
    json.dump(clear_financials, f)

In [8]:
# Spot check: first report stored under the IBM ticker.
clear_financials['IBM'][0]

{'ticker': 'IBM',
 'period': 'Y',
 'calendarDate': '2019-12-31',
 'reportPeriod': '2019-12-31',
 'updated': '2020-04-28',
 'dateKey': '2020-02-25',
 'accumulatedOtherComprehensiveIncome': -28597000000,
 'assets': 152186000000,
 'assetsAverage': 146846000000,
 'assetsCurrent': 38420000000,
 'assetsNonCurrent': 113766000000,
 'assetTurnover': 0.525,
 'bookValuePerShare': 23.49,
 'capitalExpenditure': -2370000000,
 'cashAndEquivalents': 8313000000,
 'cashAndEquivalentsUSD': 8313000000,
 'costOfRevenue': 40659000000,
 'consolidatedIncome': 9431000000,
 'currentRatio': 1.019,
 'debtToEquityRatio': 6.295,
 'debt': 68158000000,
 'debtCurrent': 10177000000,
 'debtNonCurrent': 57981000000,
 'debtUSD': 68158000000,
 'deferredRevenue': 15877000000,
 'depreciationAmortizationAndAccretion': 6059000000,
 'deposits': 0,
 'dividendYield': 0.046,
 'dividendsPerBasicCommonShare': 6.48,
 'earningBeforeInterestTaxes': 11506000000,
 'earningsBeforeInterestTaxesDepreciationAmortization': 17565000000,
 'EBIT

In [24]:
# Re-key the company overviews by their 'Symbol' field.
with open('overview.json', 'r') as f:
    overview = json.load(f)

clear_overview = {}
for profile in overview:
    try:
        clear_overview[profile['Symbol']] = profile
    except KeyError:
        # Entry without a 'Symbol' key: leave it out.
        pass

with open('clear_overview.json', 'w') as f:
    json.dump(clear_overview, f)

In [33]:
# Spot check: overview record stored under the IBM ticker.
clear_overview['IBM']

{'Symbol': 'IBM',
 'AssetType': 'Common Stock',
 'Name': 'International Business Machines Corporation',
 'Description': "International Business Machines Corporation provides integrated solutions and services worldwide. Its Cloud & Cognitive Software segment offers software for vertical and domain-specific solutions in health, financial services, and Internet of Things (IoT), weather, and security software and services application areas; and customer information control system and storage, and analytics and integration software solutions to support client mission critical on-premise workloads in banking, airline, and retail industries. It also offers middleware and data platform software, including Red Hat that enables the operation of clients' hybrid multi-cloud environments; and Cloud Paks, WebSphere distributed, and analytics platform software, such as DB2 distributed, information integration, and enterprise content management, as well as IoT, Blockchain and AI/Watson platforms. The 

In [9]:
# Re-key the monthly series by ticker, keeping only date -> adjusted close.
with open('time_series_monthly_adjusted.json', 'r') as f:
    time_series_monthly_adjusted = json.load(f)

clear_time_series_monthly_adjusted = {}
for response in time_series_monthly_adjusted:
    try:
        symbol = response['Meta Data']['2. Symbol']
        monthly_bars = response['Monthly Adjusted Time Series']
        # Built in one expression, so a company with any missing key is
        # skipped entirely rather than stored half-filled.
        clear_time_series_monthly_adjusted[symbol] = {
            day: bar['5. adjusted close'] for day, bar in monthly_bars.items()
        }
    except KeyError:
        pass

with open('clear_time_series_monthly_adjusted.json', 'w') as f:
    json.dump(clear_time_series_monthly_adjusted, f)

In [13]:
# Spot check: IBM's adjusted close stored under the date key '2020-12-31'.
clear_time_series_monthly_adjusted['IBM']['2020-12-31']

'125.8800'