In [1]:
import os

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Data directory (relative to the notebook) and the zipped raw CSV stored in it.
data_path, raw_data_name = '../data/', 'raw_analyst_ratings.csv.zip'
raw_data_file = os.path.join(data_path, raw_data_name)

In [4]:
# Read the zipped CSV, then discard the leftover 'Unnamed: 0' column.
df = pd.read_csv(raw_data_file)
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,headline,url,publisher,date,stock
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


In [44]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   headline   1407328 non-null  object
 1   url        1407328 non-null  object
 2   publisher  1407328 non-null  object
 3   date       1407328 non-null  object
 4   stock      1407328 non-null  object
dtypes: object(5)
memory usage: 53.7+ MB


In [45]:
# Number of non-null headlines before de-duplication.
df.headline.count()

1407328

There are no null values in the dataset.

## Get rid of duplicates

In [46]:
# Keep the first row for each (stock, url) pair. Reassigning instead of using
# inplace=True avoids mutating a frame that earlier cells already displayed.
df = df.drop_duplicates(subset=['stock', 'url'])

In [47]:
# Number of non-null headlines after de-duplication.
df.headline.count()

1407326

## Labeling

In [9]:
import pickle
from tqdm import tqdm
from datetime import date, timedelta
import yfinance as yf

### Getting Stock history

In [16]:
# Distinct ticker symbols that appear in the news frame.
all_tickers = df.stock.unique()

In [17]:
# Download the price history of every ticker once and cache it on disk;
# the download is skipped whenever the pickle file already exists.
pickle_filename = "stocks.pickle"
pickle_path = os.path.join(data_path, pickle_filename)
if not os.path.exists(pickle_path):
    # Date window requested from yfinance for every ticker.
    start_time = '2009-02-14'
    end_time = '2020-06-11'

    tickers_history_dict = {}
    for ticker in tqdm(all_tickers):
        t = yf.Ticker(ticker)
        tickers_history_dict[ticker] = t.history(start=start_time, end=end_time)

    # The context manager closes the file even if pickling fails part-way.
    with open(pickle_path, "wb") as file_to_write:
        pickle.dump(tickers_history_dict, file_to_write)

In [10]:
# Load the cached per-ticker price histories from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code, so only load a cache
# file that this notebook produced itself.
with open(pickle_path, 'rb') as p_file:
    stock_history = pickle.load(p_file)

In [11]:
# Keys of the loaded history object (presumably a dict keyed by ticker -- TODO confirm).
ALL_TICKERS = stock_history.keys()

In [26]:
import datetime
import holidays

# One-day step used by next_business_day when walking the calendar forward.
ONE_DAY = datetime.timedelta(days=1)
# US holiday calendar from the holidays package; next_business_day skips dates found in it.
HOLIDAYS_US = holidays.US()

def next_business_day(today_date):
    """Return the first day after ``today_date`` that is neither a weekend nor a holiday.

    Args:
        today_date: A ``datetime.date``-like object supporting ``+ timedelta``,
            ``weekday()`` and ``strftime``.

    Returns:
        The resulting date formatted as a ``'%Y-%m-%d'`` string.

    NOTE(review): ``HOLIDAYS_US`` comes from ``holidays.US()``, which may not
    match the stock exchange's trading calendar (some public holidays are
    trading days and some exchange closures are not public holidays) --
    confirm whether an exchange calendar should be used instead.
    """
    next_day = today_date + ONE_DAY
    # date.weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    # Comparing the number directly avoids depending on the holidays package
    # exporting a WEEKEND constant.
    while next_day.weekday() >= 5 or next_day in HOLIDAYS_US:
        next_day += ONE_DAY
    return next_day.strftime('%Y-%m-%d')


def get_stock_status(row, stock_history):
    """Label a news row by the close-to-close move into the next business day.

    Args:
        row: Mapping-like row with a ``'stock'`` ticker and a ``'date'`` string
            whose first whitespace-separated token is an ISO ``YYYY-MM-DD`` date.
        stock_history: Mapping from ticker to a price-history frame that has a
            ``'Close'`` column and whose index is searched by date string.

    Returns:
        ``1`` when the next business day's close is strictly higher than the
        close on the news date, ``0`` otherwise (an unchanged close also gives
        ``0``), and ``np.nan`` when the ticker, the news date or the next
        business day is missing from ``stock_history``.
    """
    ticker = row['stock']
    today = row['date'].split()[0]
    today_date = date.fromisoformat(today)

    # Check the mapping that was passed in rather than a module-level global,
    # so the result depends only on the function's arguments.
    if ticker not in stock_history:
        return np.nan
    ticker_history = stock_history[ticker]
    if today not in ticker_history.index:
        return np.nan

    # Only work out the next business day once the row can actually be labelled.
    next_day = next_business_day(today_date)
    if next_day not in ticker_history.index:
        return np.nan

    today_close_price = ticker_history.loc[today, 'Close']
    tomorrow_close_price = ticker_history.loc[next_day, 'Close']
    # Raise -> 1; fall or unchanged -> 0.
    return 1 if today_close_price < tomorrow_close_price else 0

In [24]:
# Register tqdm's progress_* methods on pandas objects (needed for progress_apply).
tqdm.pandas()

  from pandas import Panel


In [None]:
def _label_row(row):
    """Label one news row using the module-level ``stock_history`` mapping."""
    return get_stock_status(row, stock_history)


# Apply the labeller to every row (axis=1) with a tqdm progress bar.
labels = df.progress_apply(_label_row, axis=1, result_type='expand')

In [48]:
# Attach the computed labels to the frame as the 'increased' column.
df['increased'] = labels

In [49]:
# Re-inspect the frame to see how many rows received a label.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1407326 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   headline   1407326 non-null  object 
 1   url        1407326 non-null  object 
 2   publisher  1407326 non-null  object 
 3   date       1407326 non-null  object 
 4   stock      1407326 non-null  object 
 5   increased  1052624 non-null  float64
dtypes: float64(1), object(5)
memory usage: 75.2+ MB


In [50]:
# Keep only the rows in which no column is missing.
labeled_df = df.dropna(axis=0, how='any')

In [51]:
# Write the labelled rows into the data directory, omitting the index column.
labeled_filename = 'labeled.csv'
labeled_path = os.path.join(data_path, labeled_filename)
labeled_df.to_csv(labeled_path, index=False)