In [5]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the project files are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os

In [7]:
# Work from the project folder on the mounted Drive so that the relative file
# names used in later cells resolve there.
# NOTE(review): absolute, user-specific path — consider a configurable constant.
os.chdir('/content/drive/MyDrive/projects/Combify')

In [2]:
import pandas as pd
import numpy as np

In [1]:
# Zipped CSV with the raw headlines, given relative to the working directory.
raw_data_file = 'raw_analyst_ratings.csv.zip'

In [42]:
# Read the raw headlines and discard the 'Unnamed: 0' column
# (presumably a saved row index from an earlier export — TODO confirm).
df = pd.read_csv(raw_data_file).drop('Unnamed: 0', axis='columns')

In [43]:
df.head()

Unnamed: 0,headline,url,publisher,date,stock
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   headline   1407328 non-null  object
 1   url        1407328 non-null  object
 2   publisher  1407328 non-null  object
 3   date       1407328 non-null  object
 4   stock      1407328 non-null  object
dtypes: object(5)
memory usage: 53.7+ MB


In [45]:
df.headline.count()

1407328

There are no null values in the dataset.

## Get rid of duplicates

In [46]:
# Keep the first occurrence of every (stock, url) pair; rebinding `df` gives the
# same result as the in-place form.
df = df.drop_duplicates(subset=['stock', 'url'])

In [47]:
df.headline.count()

1407326

## Labeling

In [14]:
from tqdm import tqdm
from datetime import date, timedelta

### Getting Stock history

In [15]:
# Distinct ticker symbols that appear in the news data.
all_tickers = df['stock'].unique()

In [16]:
# Container that is pickled to stocks.pickle further down
# (presumably ticker -> price-history frame — TODO confirm how it is filled).
tickers_history_dict = {}

In [17]:
import pickle

In [None]:
# Persist the per-ticker history dict to stocks.pickle.
# The write is skipped while the dict is empty so that re-running the notebook
# cannot replace an already populated cache with `{}`; the context manager
# guarantees the handle is closed even if pickling fails.
if tickers_history_dict:
    with open("stocks.pickle", "wb") as file_to_write:
        pickle.dump(tickers_history_dict, file_to_write)

In [18]:
# Reload the cached per-ticker history into `d`; the context manager closes the
# file handle as soon as the load finishes (the original left it open).
# NOTE: pickle.load can execute arbitrary code — only load a stocks.pickle that
# this project produced itself.
with open("stocks.pickle", 'rb') as w:
    d = pickle.load(w)

In [19]:
d['A'].loc['2009-02-13']

Open            1.217092e+01
High            1.244619e+01
Low             1.203984e+01
Close           1.210538e+01
Volume          4.232500e+06
Dividends       0.000000e+00
Stock Splits    0.000000e+00
Name: 2009-02-13 00:00:00, dtype: float64

In [20]:
# Tickers present in the loaded history cache (a live view of d's keys).
ALL_TICKERS = d.keys()

In [26]:
import datetime
import holidays

# Step used by next_business_day to advance one calendar day at a time.
ONE_DAY = datetime.timedelta(days=1)
# US holiday calendar from the `holidays` package; next_business_day skips dates found in it.
# NOTE(review): confirm this calendar matches the stock exchange's trading days.
HOLIDAYS_US = holidays.US()

def next_business_day(today_date):
    """Return the first business day strictly after ``today_date``.

    Starting from the day after ``today_date``, advance by ``ONE_DAY`` until a
    date is found whose weekday is not in ``holidays.WEEKEND`` and which is not
    contained in ``HOLIDAYS_US``.

    Args:
        today_date: Date-like object supporting ``+ ONE_DAY``, ``.weekday()``
            and ``.strftime()``.

    Returns:
        str: The date that was found, formatted as ``'%Y-%m-%d'``.
    """
    candidate = today_date + ONE_DAY
    while True:
        on_weekend = candidate.weekday() in holidays.WEEKEND
        # The holiday lookup is only performed for non-weekend dates.
        if not on_weekend and candidate not in HOLIDAYS_US:
            return candidate.strftime('%Y-%m-%d')
        candidate += ONE_DAY


def get_stock_status(row, stock_history):
    """Label a news row by how its stock's close moved on the next business day.

    Args:
        row: Mapping-like record providing ``row['stock']`` (ticker symbol) and
            ``row['date']`` (a string whose first whitespace-separated token is
            an ISO ``YYYY-MM-DD`` date).
        stock_history: Mapping from ticker symbol to a price-history frame that
            has a ``'Close'`` column and is indexed by date.

    Returns:
        ``1`` if the next business day's close is strictly higher than the
        close on the news date, ``0`` otherwise (an unchanged close counts as
        0), and ``np.nan`` when the row cannot be labelled: unknown ticker,
        either date missing from the history, or a NaN close price.
    """
    ticker = row['stock']
    # Test membership against the mapping that was actually passed in rather
    # than a module-level key view, and bail out before doing any date work.
    if ticker not in stock_history:
        return np.nan
    ticker_history = stock_history[ticker]

    today = row['date'].split()[0]
    next_day = next_business_day(date.fromisoformat(today))
    known_dates = ticker_history.index
    if today not in known_dates or next_day not in known_dates:
        return np.nan

    # Single-step label lookup instead of chained `.loc[...][...]` indexing.
    today_close_price = ticker_history.loc[today, 'Close']
    tomorrow_close_price = ticker_history.loc[next_day, 'Close']
    # A NaN price compares False with everything, which would silently turn
    # into a "fall" label; treat it as unlabelled instead.
    if np.isnan(today_close_price) or np.isnan(tomorrow_close_price):
        return np.nan

    # 1 = raise, 0 = fall (or unchanged).
    return 1 if today_close_price < tomorrow_close_price else 0

In [24]:
# Register tqdm's `progress_apply` on pandas objects (used for the labelling pass).
tqdm.pandas()

  from pandas import Panel


In [None]:
# Label every news row: 1 = next-business-day close is higher, 0 = it is not,
# NaN = the row could not be labelled.  get_stock_status returns a scalar, so
# the row-wise apply already yields a Series; `result_type='expand'` only
# affects list-like results and has been dropped.
labels = df.progress_apply(lambda row: get_stock_status(row, d), axis=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 18%|█▊        | 256958/1407326 [04:14<28:19, 677.00it/s][A
 18%|█▊        | 257027/1407326 [04:15<28:14, 678.72it/s][A
 18%|█▊        | 257100/1407326 [04:15<27:39, 693.13it/s][A
 18%|█▊        | 257171/1407326 [04:15<27:27, 697.99it/s][A
 18%|█▊        | 257241/1407326 [04:15<28:14, 678.70it/s][A
 18%|█▊        | 257310/1407326 [04:15<28:41, 667.92it/s][A
 18%|█▊        | 257388/1407326 [04:15<27:30, 696.72it/s][A
 18%|█▊        | 257459/1407326 [04:15<27:36, 694.11it/s][A
 18%|█▊        | 257529/1407326 [04:15<28:39, 668.79it/s][A
 18%|█▊        | 257597/1407326 [04:15<28:56, 662.11it/s][A
 18%|█▊        | 257665/1407326 [04:16<28:48, 665.00it/s][A
 18%|█▊        | 257732/1407326 [04:16<28:45, 666.29it/s][A
 18%|█▊        | 257806/1407326 [04:16<27:56, 685.83it/s][A
 18%|█▊        | 257876/1407326 [04:16<27:48, 688.79it/s][A
 18%|█▊        | 257946/1407326 [04:16<29:15, 654.85it/s][A
 18%|█▊        | 258

In [30]:
# Number of rows that received a label (non-NaN entries); matches the recorded
# output and avoids dumping the whole Series.
labels.count()

1052624

In [48]:
# Attach the labels as a new column; the assignment aligns on the row index.
df['increased'] = labels

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1407326 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   headline   1407326 non-null  object 
 1   url        1407326 non-null  object 
 2   publisher  1407326 non-null  object 
 3   date       1407326 non-null  object 
 4   stock      1407326 non-null  object 
 5   increased  1052624 non-null  float64
dtypes: float64(1), object(5)
memory usage: 75.2+ MB


In [50]:
# Keep only the rows that have no missing value in any column, i.e. drop the
# rows whose label could not be computed.
labeled_df = df.dropna(axis=0, how='any')

In [51]:
# Save the labelled rows to labeled.csv without writing the index column.
labeled_df.to_csv('labeled.csv', index=False)