2009-05-27

In [2]:
# Import packages
import pandas as pd

from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline
from transformers import AutoTokenizer

2024-06-11 09:42:03.963436: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-11 09:42:03.964472: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-11 09:42:04.033114: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-11 09:42:04.276355: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read raw data
data = pd.read_csv('../data/raw/analyst_ratings_processed.csv', index_col=0)

data['date'] = data['date'].reset_index(drop=True).dropna()

# Select only rows that contains a date in the date column
data = data[
    (data['date'].astype(str).str.contains('1|2'))
]

# Change column type
data['date'] = pd.to_datetime(data['date'])

# Filter date range
data = data[
    data['date'].astype(str).between('2008-06-08','2016-07-01')
]

In [36]:
# Change column type
data['date'] = pd.to_datetime(data['date'], utc=True)

# Subtract 4 hours to keep the date in the correct timezone
data['time'] = data['date'] - pd.DateOffset(hours=4)

# Extract hour, minute and full time from the date column
data['hour'] = data['time'].dt.hour
data['minute'] = data['time'].dt.minute
data['time'] = data['time'].dt.time

In [None]:
def replace_time(x):
    # If time is the first half hour, keep the same hour
    if 0 <= x['minute'] <= 30:
        return x['hour']

    # If the hour is 23 return 0 (midnight)
    if x['hour'] == 23:
        return 0

    # Otherwise, return the hour +1
    return x['hour'] + 1

In [37]:
# Apply replace time to keep the same pattern as in the stock prices dataset
data['hour_aux'] = data.apply(lambda x: replace_time(x), axis=1)

# Create pattern
data['datetime'] = (
    pd.to_datetime(data['date'].dt.date)
    + pd.to_timedelta(data['hour_aux'], unit='h')
).apply(lambda t: t.replace(minute=30))

In [102]:
# Select columns
data = data[['title', 'date', 'stock', 'datetime']]

In [103]:
# Check dataset
print(data.shape)
print(data.info())

(797189, 4)
<class 'pandas.core.frame.DataFrame'>
Index: 797189 entries, 315.0 to 1400468.0
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype              
---  ------    --------------   -----              
 0   title     797189 non-null  object             
 1   date      797189 non-null  datetime64[ns, UTC]
 2   stock     796479 non-null  object             
 3   datetime  797189 non-null  datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), object(2)
memory usage: 30.4+ MB
None


In [104]:
data.head(2)

Unnamed: 0,title,date,stock,datetime
315.0,Agilent Tech Reports Q3 Adj. EPS $0.49 vs $0.4...,2016-06-27 20:09:00+00:00,A,2016-06-27 16:30:00
316.0,12 Stocks You Should Be Watching Today,2016-06-08 11:10:00+00:00,A,2016-06-08 07:30:00


In [112]:
# Define if is necessary to extract a sample from the current datetime
# The goal is reducing the dataset

# Group by datetime and count news
data_grouped = data.groupby(
    'datetime', as_index=False
)['title'].count()

# Create sample columns
data_grouped['sample'] = False

# Set sample = True if there are more than 3 news in the given datetime
data_grouped.loc[data_grouped['title'] > 3, 'sample'] = True

In [117]:
# Extract values from datetime and sample columns
datetime_values = data_grouped['datetime'].values
sample_values = data_grouped['sample'].values

# Create dict with above columns' values
date_sample_dict = dict(zip(datetime_values, sample_values)).items()

# Initialize variable
data_sample = pd.DataFrame()

# Iterate through the dataframe to extract samples
for date, sample in date_sample_dict:
    # Filter dataframe of the specified datetime
    df = data[data['datetime'] == date]

    # If is required to create a sample, extract 3 news
    if sample:
        df = df.sample(3)

    # Concat dataframes
    data_sample = pd.concat([data_sample, df])

2009-02-14T15:30:00.000000000 False
2009-04-27T15:30:00.000000000 False
2009-04-29T09:30:00.000000000 False
2009-05-22T14:30:00.000000000 False
2009-05-27T04:30:00.000000000 False
2009-05-28T22:30:00.000000000 False
2009-05-29T08:30:00.000000000 False
2009-05-30T10:30:00.000000000 False
2009-06-01T11:30:00.000000000 False
2009-06-02T14:30:00.000000000 False
2009-06-02T22:30:00.000000000 True
2009-06-06T00:30:00.000000000 True
2009-06-06T23:30:00.000000000 False
2009-06-08T19:30:00.000000000 False
2009-06-10T00:30:00.000000000 True
2009-06-10T23:30:00.000000000 False
2009-06-15T10:30:00.000000000 False
2009-06-16T08:30:00.000000000 False
2009-06-16T14:30:00.000000000 False
2009-06-16T22:30:00.000000000 False
2009-06-20T23:30:00.000000000 True
2009-06-23T21:30:00.000000000 False
2009-06-29T00:30:00.000000000 False
2009-07-01T21:30:00.000000000 False
2009-07-07T12:30:00.000000000 False
2009-07-07T16:30:00.000000000 False
2009-07-08T13:30:00.000000000 False
2009-07-10T10:30:00.000000000 Fa

In [132]:
# # Save result

# data_sample.to_csv('../data/processed/analyst_ratings_sample.csv', index=False)

## Pré-processamento

## Modelo - Roberta Finetued Financial News Sentiment Analysis

In [137]:
# Load model
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [138]:
# Create pipeline
sentiment_task = pipeline("sentiment-analysis", model=model_name, tokenizer=model_name)
sentiment_task('tokens')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


[{'label': 'neutral', 'score': 0.9998683929443359}]

## Aplicação do modelo ao dataset


In [140]:
# Create function to apply the model
def analyze(text):
    result = sentiment_task(text)
    return result[0]

# Select the required date range
data = data_sample[data_sample['datetime']>='2008-06-01']

In [152]:
# Reset dataframe's index
data.reset_index(drop=True, inplace=True)

# Apply sentiment analysis model in the news' title
data_sentiment = data['title'].apply(analyze)

CPU times: user 3h 37min 52s, sys: 27min 28s, total: 4h 5min 21s
Wall time: 2h 50min 28s


In [154]:
# Extract lable
data_sentiment_label = data_sentiment.apply(lambda x: x['label'])

# Extract score
data_sentiment_score = data_sentiment.apply(
    lambda x:
        round(x.get('score', 0), 2)
        if isinstance(x, dict)
        else None
)

In [156]:
# Create dataframe with label and score columns
zipped = list(zip(data_sentiment_label, data_sentiment_score))
zipped_df = pd.DataFrame(zipped, columns=['Label', 'Score'])

In [161]:
# Concat dataframes to add label and score columns
data_df = pd.concat([data, zipped_df], axis=1, join='outer', ignore_index=False)
data_df.drop('analyze', axis=1, inplace=True)
data_df.shape

(92946, 6)

In [163]:
# # Save
# data_df.to_csv('../data/processed/analyst_ratings_score.csv',index=False)