In [1]:
import pandas as pd
import os

In [2]:
# Absolute path of the raw-data folder: <parent of the working directory>/data/raw.
raw_data_path = os.path.join(os.path.abspath('../'), 'data', 'raw')

# S&P 500

In [3]:
# Load the `sp500_companies.xlsx` workbook from the raw-data folder.
df_sp500_raw = pd.read_excel(os.path.join(raw_data_path, 'sp500_companies.xlsx'))

# Show the first six ticker symbols, in the order they appear in the file.
df_sp500_raw['Symbol'].head(6).to_list()

['MSFT', 'AAPL', 'NVDA', 'GOOGL', 'GOOG', 'AMZN']

# NASDAQ Screener

In [4]:
# Read the `nasdaq_screener.csv` export from the raw-data folder.
df_nasdaq_screener_raw = pd.read_csv(os.path.join(raw_data_path, 'nasdaq_screener.csv'))

# Keep only the ticker symbol and its sector/industry columns.
df_nasdaq_screener_filter = df_nasdaq_screener_raw.loc[:, ['Symbol', 'Sector', 'Industry']]

# Stock Market

In [5]:
# Combined price file; parsed with ';' as the field separator and ',' as the
# decimal mark.
df_stock_market_raw = pd.read_csv(
    os.path.join(raw_data_path, 'huge_stock_market_dataset', 'Stocks', '_all_stocks.csv'),
    sep=';',
    decimal=',',
)

In [6]:
# Number of non-null `Date` entries per ticker; `Stock` stays a regular column.
df_stock_date = (
    df_stock_market_raw
    .groupby('Stock', as_index=False)['Date']
    .count()
)

In [7]:
# Tickers with at least 252 dated rows (presumably one trading year, per the
# `_1y` name), ordered by how many rows they have.
df_stocks_1y = df_stock_date.loc[df_stock_date['Date'].ge(252)].sort_values('Date')

# Rows recorded on the latest `Date` value found anywhere in the dataset.
latest_date = df_stock_market_raw['Date'].max()
df_stocks_max_date = df_stock_market_raw.loc[df_stock_market_raw['Date'].eq(latest_date)]

# Keep price rows whose ticker passes all three membership checks:
# enough history, present on the latest date, and listed in the S&P 500 sheet.
ticker = df_stock_market_raw['Stock']
keep_rows = (
    ticker.isin(df_stocks_1y['Stock'])
    & ticker.isin(df_stocks_max_date['Stock'])
    & ticker.isin(df_sp500_raw['Symbol'])
)
df_stocks_result = df_stock_market_raw[keep_rows]

In [8]:
# Attach Sector/Industry to every price row. The left join keeps tickers that
# are missing from the screener (their Sector/Industry become NaN).
# `validate='many_to_one'` makes pandas raise a MergeError if the screener
# lists a Symbol more than once, instead of silently duplicating price rows
# and inflating any row counts computed from this frame.
df_stocks_industry = df_stocks_result.merge(
    df_nasdaq_screener_filter,
    left_on='Stock',
    right_on='Symbol',
    how='left',
    validate='many_to_one',
)

In [9]:
# Rows per sector, smallest first. This counts price rows, not distinct tickers.
(
    df_stocks_industry
    .groupby('Sector', as_index=False)['Stock']
    .count()
    .sort_values(by='Stock')
)

Unnamed: 0,Sector,Stock
0,Basic Materials,26428
9,Telecommunications,39165
7,Real Estate,96331
3,Energy,108851
2,Consumer Staples,128489
10,Utilities,229587
5,Health Care,282974
8,Technology,311499
4,Finance,324280
6,Industrials,350222


In [10]:
# Reduce to one row per ticker, then count tickers in each (Sector, Industry) pair.
df_stocks_unique = df_stocks_industry.drop_duplicates(subset='Stock')
df_industry = (
    df_stocks_unique
    .groupby(by=['Sector', 'Industry'], as_index=False)['Stock']
    .count()
)

In [11]:
# Industries ordered from fewest to most tickers.
df_industry.sort_values(by='Stock', ascending=True)

Unnamed: 0,Sector,Industry,Stock
0,Basic Materials,Metal Mining,1
34,Consumer Discretionary,Paints/Coatings,1
35,Consumer Discretionary,Professional Services,1
37,Consumer Discretionary,Recreational Games/Products/Toys,1
39,Consumer Discretionary,Retail-Auto Dealers and Gas Stations,1
...,...,...,...
99,Technology,Computer Software: Prepackaged Software,15
113,Utilities,Electric Utilities: Central,15
59,Finance,Major Banks,17
85,Industrials,Industrial Machinery/Components,19


In [12]:
# Industries with more than 10 tickers.
df_industry.loc[df_industry['Stock'].gt(10)]

Unnamed: 0,Sector,Industry,Stock
11,Consumer Discretionary,Business Services,14
47,Consumer Staples,Packaged Foods,11
56,Finance,Investment Bankers/Brokers/Service,11
59,Finance,Major Banks,17
73,Health Care,Medical/Dental Instruments,12
85,Industrials,Industrial Machinery/Components,19
97,Real Estate,Real Estate Investment Trusts,26
99,Technology,Computer Software: Prepackaged Software,15
109,Technology,Semiconductors,15
113,Utilities,Electric Utilities: Central,15


In [13]:
# Tickers to inspect (hard-coded list).
companies = ['MSFT', 'AAPL', 'NVDA', 'GOOGL', 'GOOG', 'AMZN']

# Rows for those tickers dated 2008-01-01 through 2016-12-31 (both ends inclusive).
# `Date` was loaded without date parsing, so this is presumably a text
# comparison; ISO `YYYY-MM-DD` strings sort chronologically.
df_stock_market_raw.loc[
    df_stock_market_raw['Stock'].isin(companies)
    & df_stock_market_raw['Date'].between('2008-01-01', '2016-12-31')
]

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,Stock
44968,2008-01-02,25.547,25.646,24.658,24.951,300780650,0,AAPL
44969,2008-01-03,25.034,25.278,24.675,24.964,234652068,0,AAPL
44970,2008-01-04,24.527,24.715,22.910,23.057,405631839,0,AAPL
44971,2008-01-07,23.213,23.510,21.799,22.748,577574252,0,AAPL
44972,2008-01-08,23.060,23.368,21.874,21.932,424241367,0,AAPL
...,...,...,...,...,...,...,...,...
9884768,2016-12-23,106.050,109.480,105.940,109.380,15443269,0,NVDA
9884769,2016-12-27,111.700,116.990,110.860,116.950,29624523,0,NVDA
9884770,2016-12-28,119.220,119.540,108.340,108.900,56636762,0,NVDA
9884771,2016-12-29,104.570,111.130,102.470,111.070,53982799,0,NVDA


In [14]:
# Number of rows per selected ticker inside the 2008-01-01..2016-12-31 window.
in_companies = df_stock_market_raw['Stock'].isin(companies)
in_window = df_stock_market_raw['Date'].between('2008-01-01', '2016-12-31')
df_stock_market_raw[in_companies & in_window].groupby("Stock")['Stock'].count()

Stock
AAPL     2267
AMZN     2267
GOOG      698
GOOGL    2267
MSFT     2267
NVDA     2267
Name: Stock, dtype: int64

# Daily News

In [15]:
# Load the `RedditNews.csv` file from the daily-news dataset folder.
df_daily_news_raw = pd.read_csv(
    os.path.join(raw_data_path, 'daily_news_for_stock_market_prediction', 'RedditNews.csv')
)

In [16]:
# Display the raw news frame; for a long frame pandas renders a truncated
# preview (first and last rows) rather than every row.
df_daily_news_raw

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...
...,...,...
73603,2008-06-08,b'Man goes berzerk in Akihabara and stabs ever...
73604,2008-06-08,b'Threat of world AIDS pandemic among heterose...
73605,2008-06-08,b'Angst in Ankara: Turkey Steers into a Danger...
73606,2008-06-08,"b""UK: Identity cards 'could be used to spy on ..."
