In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import tensorflow as tf
import pmaw
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from dotenv import load_dotenv

from libs.prep_methods import articles_pull, subreddit_pull, keyword_filter, articles_vader_analyzer, reddit_vader_analyzer, daily_mean

In [2]:
# --- Disabled one-off data pull (kept for provenance) ---
# Pulls comments from four subreddits with `subreddit_pull` for the window
# 2012-01-01 .. 2022-06-01 and writes each result to a CSV file.
# NOTE(review): the datetimes are naive, so `.timestamp()` interprets them in the
# local timezone of whichever machine runs the pull.
# NOTE(review): the stockmarket pull uses limit = 10000 while the other three use
# 10000000, and its CSV goes to './Data/Cleaned_Data/stockmarket_comments.csv'
# although the live load reads '../../sandbox/stockmarket_comments_large.csv' —
# confirm which file is canonical.
# stockmarket_comments = subreddit_pull('stockmarket', limit = 10000, after = int(dt.datetime(2012, 1, 1, 0, 0).timestamp()), before = int(dt.datetime(2022, 6, 1, 0, 0).timestamp()))

# securityanalysis_comments = subreddit_pull('securityanalysis', limit = 10000000, after = int(dt.datetime(2012, 1, 1, 0, 0).timestamp()), before = int(dt.datetime(2022, 6, 1, 0, 0).timestamp()))

# algotrading_comments = subreddit_pull('algotrading', limit = 10000000, after = int(dt.datetime(2012, 1, 1, 0, 0).timestamp()), before = int(dt.datetime(2022, 6, 1, 0, 0).timestamp()))

# wallstreetbets_comments = subreddit_pull('wallstreetbets', limit = 10000000, after = int(dt.datetime(2012, 1, 1, 0, 0).timestamp()), before = int(dt.datetime(2022, 6, 1, 0, 0).timestamp()))

# Persist the pulled comments (note the four different target directories).
# stockmarket_comments.to_csv('./Data/Cleaned_Data/stockmarket_comments.csv')
# securityanalysis_comments.to_csv('../../sandbox/securityanalysis_comments_large.csv')
# algotrading_comments.to_csv('../../sandbox/algotrading_comments_large.csv')
# wallstreetbets_comments.to_csv('./wallstreetbets_comments_large.csv')

In [3]:
# Load the cleaned stock table and index it by its 'date' column.
# No `parse_dates` / `infer_datetime_format` on read_csv: without an `index_col`
# they parse nothing, and `infer_datetime_format` is deprecated since pandas 2.0.
stock_data = pd.read_csv('./Data/Cleaned_Data/stock_data.csv')

# Let pandas detect the date layout instead of forcing '%Y/%m/%d'. On pandas >= 2.0
# an explicit format is applied strictly, so ISO 'YYYY-MM-DD' strings would all be
# coerced to NaT without any error. Values that cannot be parsed still become NaT.
stock_data['date'] = pd.to_datetime(stock_data['date'], errors = 'coerce')

stock_data = stock_data.set_index('date')
stock_data.index.name = None  # drop the 'date' label from the index

In [4]:
# Load the cached comment dump (presumably the r/stockmarket pull — confirm).
# `lineterminator = '\n'` makes the parser split rows on '\n' only.
# The former `parse_dates = True, infer_datetime_format = True` arguments were
# no-ops here (no `index_col` to parse) and the latter is deprecated since pandas 2.0.
# NOTE(review): the path points outside the repository ('../../sandbox'); a fresh
# checkout cannot run this cell without that file.
stockmarket_comments = pd.read_csv('../../sandbox/stockmarket_comments_large.csv', lineterminator = '\n')

In [5]:
# Ticker -> spellings used to find comments that mention the asset (each list is
# handed to `keyword_filter` in the disabled per-asset build cell).
# Key order matters: later cells pick frames out of `asset_dataframe_list` by
# position, and that list is built by iterating this dict.
# Case variants are spelled out one by one — presumably `keyword_filter` matches
# case-sensitively; TODO confirm. 'Airbnb' and 'microchip technology' complete the
# variant sets; they only take effect when the cached per-asset CSVs are rebuilt.
keywords = {
    'NFLX': ['NFLX', 'nflx', 'Netflix', 'netflix'],
    'FB': ['FB', 'fb', 'Facebook', 'facebook'],
    'UBER': ['UBER', 'uber', 'Uber'],
    'MCHP': ['MCHP', 'mchp', 'Microchip Technology', 'microchip technology'],
    'ABNB': ['ABNB', 'abnb', 'AirBnB', 'airbnb', 'Airbnb'],
    'FANG': ['FANG', 'fang', 'Diamondback Energy', 'diamondback energy', 'Diamondback', 'diamondback'],
    'MRO': ['MRO', 'mro', 'Marathon Oil', 'marathon oil'],
    'DVN': ['DVN', 'dvn', 'Devon Energy', 'devon energy'],
    'SPWR': ['SPWR', 'spwr', 'SunPower', 'Sunpower', 'sunpower'],
    'REGI': ['REGI', 'regi', 'Renewable Energy Group', 'renewable energy group'],
    # NOTE(review): MTRX appears to be the ticker of Matrix Service Company, while
    # McKinsey & Company is privately held — confirm which asset is intended.
    'MTRX': ['MTRX', 'mtrx', 'McKinsey & Company', 'McKinsey & Co', 'Mckinsey & Co', 'McKinsey', 'Mckinsey', 'mckinsey'],
    'BLK': ['BLK', 'blk', 'BlackRock', 'Blackrock', 'blackrock'],
    'PYPL': ['PYPL', 'pypl', 'PayPal', 'Paypal', 'paypal'],
    'MELI': ['MELI', 'meli', 'MercadoLibre', 'Mercadolibre', 'mercadolibre'],
    'SOFI': ['SOFI', 'sofi', 'SoFi', 'Sofi'],
}

In [6]:
# --- Disabled builder for the per-asset CSVs ('./Data/Cleaned_Data/{asset}.csv') ---
# For every ticker in `keywords` it combines SEC sentiment columns, daily mean VADER
# sentiment of the matching stockmarket comments, and close/volume from `stock_data`,
# forward-fills, drops incomplete rows, and writes the result to CSV.
# asset_dataframe_list = []
# for asset in keywords:

#     asset_sec_sentiment = pd.read_csv(f'./Data/Cleaned_Data/SEC_sentiment_and_STOCKS/{asset}.csv', parse_dates = True, infer_datetime_format = True)
#     NOTE(review): the next line re-reads UBER.csv for every asset and overrides the
#     per-asset read above — looks like a debugging leftover; confirm before re-enabling.
#     asset_sec_sentiment = pd.read_csv(f'./Data/Cleaned_Data/SEC_sentiment_and_STOCKS/UBER.csv', parse_dates = True, infer_datetime_format = True)
#     asset_sec_sentiment['date'] = pd.to_datetime(asset_sec_sentiment['date'], infer_datetime_format = True, errors = 'coerce', format = '%Y/%m/%d')
#     asset_sec_sentiment = asset_sec_sentiment.set_index('date')
#     asset_sec_sentiment.index.name = None

#     asset_sec_sentiment = asset_sec_sentiment[['compound', 'pos', 'neu', 'neg']]
#     NOTE(review): 'sec_neutral sentiment' contains a space where the sibling names use
#     an underscore; the name ends up as a column header in the written CSV.
#     asset_sec_sentiment = asset_sec_sentiment.rename(columns = {'pos': 'sec_positive_sentiment', 'neg': 'sec_negative_sentiment', 'neu': 'sec_neutral sentiment', 'compound': 'sec_compound_sentiment'})

#     asset_stockmarket_comments = keyword_filter(stockmarket_comments.fillna(''), keywords[asset])
#     asset_stockmarket_sentiment = daily_mean(reddit_vader_analyzer('stockmarket', asset_stockmarket_comments)).ffill()
#     asset_stockmarket_sentiment['datetime'] = pd.to_datetime(asset_stockmarket_sentiment['datetime'], infer_datetime_format = True, errors = 'coerce', format = '%Y/%m/%d')
#     asset_stockmarket_sentiment = asset_stockmarket_sentiment.set_index('datetime')
#     NOTE(review): this index becomes `datetime.date` objects while the other two frames
#     keep a datetime index — verify that the concat below aligns rows on the same dates.
#     asset_stockmarket_sentiment.index = asset_stockmarket_sentiment.index.date

#     asset_prices_volume = stock_data[stock_data['ticker'] == asset].drop(columns = 'ticker')
#     asset_prices_volume = asset_prices_volume[['volume', 'close']]

#     asset_dataframe = pd.concat([asset_sec_sentiment, asset_stockmarket_sentiment, asset_prices_volume], axis = 1).ffill().dropna()
#     asset_dataframe.to_csv(f'./Data/Cleaned_Data/{asset}.csv')

#     asset_dataframe_list.append(asset_dataframe)

In [7]:
# Load the cached per-asset tables, one CSV per ticker in `keywords`.
# The resulting list follows the key order of `keywords`.
asset_dataframe_list = []
for asset in keywords:

    # No `parse_dates` / `infer_datetime_format` here: without an `index_col` they
    # parse nothing, and `infer_datetime_format` is deprecated since pandas 2.0.
    asset_dataframe = pd.read_csv(f'./Data/Cleaned_Data/{asset}.csv')

    # 'Unnamed: 0' is the name pandas gives the header-less first column, i.e. the
    # index that `to_csv` wrote. It is parsed without a fixed format: a strict
    # '%Y/%m/%d' would coerce dash-separated dates to NaT on pandas >= 2.0 and leave
    # the whole index empty without raising. Unparseable values still become NaT.
    asset_dataframe['Unnamed: 0'] = pd.to_datetime(asset_dataframe['Unnamed: 0'], errors = 'coerce')
    asset_dataframe = asset_dataframe.set_index('Unnamed: 0')
    asset_dataframe.index.name = None  # drop the placeholder label from the index

    asset_dataframe_list.append(asset_dataframe)

In [8]:
# Bind each asset's frame to a readable name. Frames are looked up by ticker rather
# than by list position, so reordering or extending `keywords` cannot silently hand
# one asset the data of another. `asset_dataframe_list` is built by iterating
# `keywords`, hence the two sequences pair up one-to-one.
asset_frames_by_ticker = dict(zip(keywords, asset_dataframe_list))

netflix = pd.DataFrame(asset_frames_by_ticker['NFLX'])
facebook = pd.DataFrame(asset_frames_by_ticker['FB'])
uber = pd.DataFrame(asset_frames_by_ticker['UBER'])
microchip_technology = pd.DataFrame(asset_frames_by_ticker['MCHP'])
airbnb = pd.DataFrame(asset_frames_by_ticker['ABNB'])
diamondback_energy = pd.DataFrame(asset_frames_by_ticker['FANG'])
marathon_oil = pd.DataFrame(asset_frames_by_ticker['MRO'])
devon_energy = pd.DataFrame(asset_frames_by_ticker['DVN'])
sunpower_corp = pd.DataFrame(asset_frames_by_ticker['SPWR'])
renewable_energy_group = pd.DataFrame(asset_frames_by_ticker['REGI'])
# NOTE(review): this name is bound to the 'MTRX' data — confirm the ticker really
# corresponds to the company the variable is named after.
mckinsey_and_company = pd.DataFrame(asset_frames_by_ticker['MTRX'])
blackrock = pd.DataFrame(asset_frames_by_ticker['BLK'])
paypal = pd.DataFrame(asset_frames_by_ticker['PYPL'])
mercado_libre = pd.DataFrame(asset_frames_by_ticker['MELI'])
sofi = pd.DataFrame(asset_frames_by_ticker['SOFI'])

In [9]:
# --- Disabled builder for './Data/Cleaned_Data/UBER_WITH_SEC.csv' ---
# Reads the UBER SEC financial-data CSV, renames its columns to snake_case, drops
# ticker/close/volume, indexes by date, joins with `uber`, forward-fills, drops
# incomplete rows and writes the result to CSV.
# NOTE(review): two target names carry typos ('incomeloss_from_continuining_operations',
# 'weight_average_number_of_diluted_shares_outstanding'); they are persisted as column
# headers, so fixing them means regenerating the CSV and updating any consumer.
# NOTE(review): the same rename map is repeated verbatim in the ABNB cell.
# uber_financial_metrics = pd.read_csv(f'./Data/Cleaned_Data/SEC_Fin_Data_and_STOCKS/UBER.csv', parse_dates = True, infer_datetime_format = True)
# uber_financial_metrics = uber_financial_metrics.rename(columns = {
#     'date': 'date',
#     'Revenues': 'revenue',
#     'CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization': 'cost_without_depletion_and_amortization', 'OperationsAndSupportExpense': 'operations_and_support_expense',
#     'SellingAndMarketingExpense': 'selling_and_market_expense',
#     'ResearchAndDevelopmentExpense': 'research_and_development_expense',
#     'GeneralAndAdministrativeExpense': 'general_and_administrative_expense',
#     'DepreciationDepletionAndAmortization': 'depreciation_depletion_and_amortization',
#     'CostsAndExpenses': 'costs_and_expenses',
#     'OperatingIncomeLoss': 'operating_income_loss',
#     'InterestExpense': 'interest_expense',
#     'NonoperatingIncomeExpense': 'nonoperating_income_expense', 'IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments': 'incomeloss_from_continuining_operations',
#     'IncomeTaxExpenseBenefit': 'income_tax_expense_benefit',
#     'IncomeLossFromEquityMethodInvestments': 'income_loss_from_equity_method_investments',
#     'ProfitLoss': 'profit_loss',
#     'Unnamed: 16': 'unnamed',
#     'NetIncomeLoss': 'net_income_loss',
#     'EarningsPerShareBasic': 'earnings_per_share_basic',
#     'EarningsPerShareDiluted': 'earnings_per_share_diluted',
#     'WeightedAverageNumberOfSharesOutstandingBasic': 'weighted_average_number_of_shares_outstanding_basic', 
#     'WeightedAverageNumberOfDilutedSharesOutstanding': 'weight_average_number_of_diluted_shares_outstanding', 
#     'NetIncomeLossAttributableToRedeemableNoncontrollingInterest': 'net_income_loss_attributable_to_redeemable_noncontrolling_interest'
#     })

# uber_financial_metrics = uber_financial_metrics.drop(columns = ['ticker', 'close', 'volume']) 
# uber_financial_metrics['date'] = pd.to_datetime(uber_financial_metrics['date'], infer_datetime_format = True, errors = 'coerce', format = '%Y/%m/%d')
# uber_financial_metrics = uber_financial_metrics.set_index('date')
# NOTE(review): the index is turned into `datetime.date` objects here, while `uber` as
# loaded by the live cell carries a datetime index — verify the concat aligns on dates.
# uber_financial_metrics.index = uber_financial_metrics.index.date
# uber_financial_metrics.index.name = None

# uber_with_sec = pd.concat([uber_financial_metrics, uber], axis = 1).ffill().dropna()
# uber_with_sec.to_csv(f'./Data/Cleaned_Data/UBER_WITH_SEC.csv')

In [10]:
# --- Disabled builder for './Data/Cleaned_Data/ABNB_WITH_SEC.csv' ---
# Reads the ABNB SEC financial-data CSV, renames its columns to snake_case, drops
# ticker/close/volume, indexes by date, joins with `airbnb`, forward-fills, drops
# incomplete rows and writes the result to CSV.
# NOTE(review): two target names carry typos ('incomeloss_from_continuining_operations',
# 'weight_average_number_of_diluted_shares_outstanding'); they are persisted as column
# headers, so fixing them means regenerating the CSV and updating any consumer.
# NOTE(review): the same rename map is repeated verbatim in the UBER cell.
# airbnb_financial_metrics = pd.read_csv(f'./Data/Cleaned_Data/SEC_Fin_Data_and_STOCKS/ABNB.csv', parse_dates = True, infer_datetime_format = True)
# airbnb_financial_metrics = airbnb_financial_metrics.rename(columns = {
#     'date': 'date',
#     'Revenues': 'revenue',
#     'CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization': 'cost_without_depletion_and_amortization', 'OperationsAndSupportExpense': 'operations_and_support_expense',
#     'SellingAndMarketingExpense': 'selling_and_market_expense',
#     'ResearchAndDevelopmentExpense': 'research_and_development_expense',
#     'GeneralAndAdministrativeExpense': 'general_and_administrative_expense',
#     'DepreciationDepletionAndAmortization': 'depreciation_depletion_and_amortization',
#     'CostsAndExpenses': 'costs_and_expenses',
#     'OperatingIncomeLoss': 'operating_income_loss',
#     'InterestExpense': 'interest_expense',
#     'NonoperatingIncomeExpense': 'nonoperating_income_expense', 'IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments': 'incomeloss_from_continuining_operations',
#     'IncomeTaxExpenseBenefit': 'income_tax_expense_benefit',
#     'IncomeLossFromEquityMethodInvestments': 'income_loss_from_equity_method_investments',
#     'ProfitLoss': 'profit_loss',
#     'Unnamed: 16': 'unnamed',
#     'NetIncomeLoss': 'net_income_loss',
#     'EarningsPerShareBasic': 'earnings_per_share_basic',
#     'EarningsPerShareDiluted': 'earnings_per_share_diluted',
#     'WeightedAverageNumberOfSharesOutstandingBasic': 'weighted_average_number_of_shares_outstanding_basic', 
#     'WeightedAverageNumberOfDilutedSharesOutstanding': 'weight_average_number_of_diluted_shares_outstanding', 
#     'NetIncomeLossAttributableToRedeemableNoncontrollingInterest': 'net_income_loss_attributable_to_redeemable_noncontrolling_interest'
#     })

# airbnb_financial_metrics = airbnb_financial_metrics.drop(columns = ['ticker', 'close', 'volume']) 
# airbnb_financial_metrics['date'] = pd.to_datetime(airbnb_financial_metrics['date'], infer_datetime_format = True, errors = 'coerce', format = '%Y/%m/%d')
# airbnb_financial_metrics = airbnb_financial_metrics.set_index('date')
# NOTE(review): the index is turned into `datetime.date` objects here, while `airbnb` as
# loaded by the live cell carries a datetime index — verify the concat aligns on dates.
# airbnb_financial_metrics.index = airbnb_financial_metrics.index.date
# airbnb_financial_metrics.index.name = None

# airbnb_with_sec = pd.concat([airbnb_financial_metrics, airbnb], axis = 1).ffill().dropna()
# airbnb_with_sec.to_csv(f'./Data/Cleaned_Data/ABNB_WITH_SEC.csv')

In [11]:
# Load the cached UBER table that includes the SEC financial metrics.
# No `parse_dates` / `infer_datetime_format` here: without an `index_col` they parse
# nothing, and `infer_datetime_format` is deprecated since pandas 2.0.
uber_with_sec = pd.read_csv('./Data/Cleaned_Data/UBER_WITH_SEC.csv')

# 'Unnamed: 0' is the header-less first column, i.e. the index written by `to_csv`.
# It is parsed without a fixed format: a strict '%Y/%m/%d' would coerce
# dash-separated dates to NaT on pandas >= 2.0 without raising.
# Unparseable values still become NaT (errors = 'coerce').
uber_with_sec['Unnamed: 0'] = pd.to_datetime(uber_with_sec['Unnamed: 0'], errors = 'coerce')
uber_with_sec = uber_with_sec.set_index('Unnamed: 0')
uber_with_sec.index.name = None  # drop the placeholder label from the index

In [12]:
# Load the cached ABNB table that includes the SEC financial metrics.
# No `parse_dates` / `infer_datetime_format` here: without an `index_col` they parse
# nothing, and `infer_datetime_format` is deprecated since pandas 2.0.
airbnb_with_sec = pd.read_csv('./Data/Cleaned_Data/ABNB_WITH_SEC.csv')

# 'Unnamed: 0' is the header-less first column, i.e. the index written by `to_csv`.
# It is parsed without a fixed format: a strict '%Y/%m/%d' would coerce
# dash-separated dates to NaT on pandas >= 2.0 without raising.
# Unparseable values still become NaT (errors = 'coerce').
airbnb_with_sec['Unnamed: 0'] = pd.to_datetime(airbnb_with_sec['Unnamed: 0'], errors = 'coerce')
airbnb_with_sec = airbnb_with_sec.set_index('Unnamed: 0')
airbnb_with_sec.index.name = None  # drop the placeholder label from the index