In [23]:
# Import packages
import pandas as pd
import os

In [24]:
# Absolute path of the data/processed folder located under the
# parent of the current working directory
parent_dir = os.path.abspath('../')
path_data_processed = os.path.join(parent_dir, 'data', 'processed')

# Load dataset

In [25]:
# Load the per-news scores (the score comes from the sentiment
# analysis model) and expose the 'datetime' column as 'Date'
path_scores_csv = os.path.join(path_data_processed, 'analyst_ratings_score.csv')
df_raw = pd.read_csv(path_scores_csv).rename(columns={'datetime': 'Date'})

In [26]:
# Keep only the rows whose label contains 'negative' or 'positive'
mask_negative = df_raw['Label'].str.contains('negative')
mask_positive = df_raw['Label'].str.contains('positive')
df_wo_neutral = df_raw.loc[mask_negative | mask_positive]

# % of News per Label per Date

## With "neutral"

In [27]:
# Count the news per date and label: one column per label,
# 0 when a label does not occur on a date
df_count = df_raw.groupby(
    ['Date', 'Label']
).size().unstack(fill_value=0)

# Share of each label among the news counted for the same date.
# The denominator is the actual per-date total rather than a
# hard-coded 25, so a date holding more or fewer than 25 news still
# yields shares that add up to 1 (the result is unchanged for dates
# that do hold exactly 25 news).
df_pct_news = df_count.div(df_count.sum(axis=1), axis=0)

# Turn the Date index back into a regular column
df_pct_news = df_pct_news.reset_index(drop=False)

In [28]:
# Parse the Date column into datetime values
df_pct_news = df_pct_news.assign(Date=pd.to_datetime(df_pct_news['Date']))

# Write the result to the processed data folder, without the index
path_pct_news_csv = os.path.join(
    path_data_processed, 'analyst_ratings_pct_of_news.csv'
)
df_pct_news.to_csv(path_pct_news_csv, index=False)

## Without "neutral"

In [29]:
# Share of negative and positive news per date, neutral news excluded
df_pct_news_wo_neutral = (
    df_wo_neutral
    # Number of news per date and label (0 when a label is absent)
    .groupby(['Date', 'Label'])
    .size()
    .unstack(fill_value=0)
    # Total number of negative and positive news per date
    .assign(Total=lambda counts: counts['negative'] + counts['positive'])
    # Replace each count with its share of that total
    .assign(
        positive=lambda counts: counts['positive'] / counts['Total'],
        negative=lambda counts: counts['negative'] / counts['Total'],
    )
    # Turn the Date index back into a regular column
    .reset_index(drop=False)
)

In [30]:
# Parse the Date column into datetime values
df_pct_news_wo_neutral = df_pct_news_wo_neutral.assign(
    Date=pd.to_datetime(df_pct_news_wo_neutral['Date'])
)

# Write the result to the processed data folder, without the index
path_pct_news_wo_neutral_csv = os.path.join(
    path_data_processed, 'analyst_ratings_pct_of_news_wo_neutral.csv'
)
df_pct_news_wo_neutral.to_csv(path_pct_news_wo_neutral_csv, index=False)

# Average Score per Date

## With "neutral"

In [31]:
# Mean score of all the news of each date, with Date kept as a column
df_avg_score = df_raw.groupby('Date')['Score'].mean().reset_index()

In [32]:
# Parse the Date column into datetime values
df_avg_score = df_avg_score.assign(Date=pd.to_datetime(df_avg_score['Date']))

# Write the result to the processed data folder, without the index
path_avg_score_csv = os.path.join(
    path_data_processed, 'analyst_ratings_avg_score.csv'
)
df_avg_score.to_csv(path_avg_score_csv, index=False)

## Without "neutral"

In [33]:
# Mean score of the non-neutral news of each date, with Date kept
# as a column
df_avg_score_wo_neutral = (
    df_wo_neutral.groupby('Date')['Score'].mean().reset_index()
)

In [34]:
# Parse the Date column into datetime values
df_avg_score_wo_neutral = df_avg_score_wo_neutral.assign(
    Date=pd.to_datetime(df_avg_score_wo_neutral['Date'])
)

# Write the result to the processed data folder, without the index
path_avg_score_wo_neutral_csv = os.path.join(
    path_data_processed, 'analyst_ratings_avg_score_wo_neutral.csv'
)
df_avg_score_wo_neutral.to_csv(path_avg_score_wo_neutral_csv, index=False)

# Average Score per Label and Date

## With "neutral"

In [35]:
# Mean score per date, with one column per label
df_avg_score_per_label = pd.pivot_table(
    df_raw,
    index='Date',
    columns=['Label'],
    values='Score',
    aggfunc='mean',
).reset_index(drop=False)

# Parse the Date column into datetime values
df_avg_score_per_label = df_avg_score_per_label.assign(
    Date=pd.to_datetime(df_avg_score_per_label['Date'])
)

# Write the result to the processed data folder, without the index
path_avg_score_per_label_csv = os.path.join(
    path_data_processed, 'analyst_ratings_avg_score_per_label.csv'
)
df_avg_score_per_label.to_csv(path_avg_score_per_label_csv, index=False)

## Without "neutral"

In [36]:
# Mean score per date for the non-neutral news, with one column
# per label
df_avg_score_per_label_wo_neutral = pd.pivot_table(
    df_wo_neutral,
    index='Date',
    columns=['Label'],
    values='Score',
    aggfunc='mean',
).reset_index(drop=False)

# Parse the Date column into datetime values
df_avg_score_per_label_wo_neutral = df_avg_score_per_label_wo_neutral.assign(
    Date=pd.to_datetime(df_avg_score_per_label_wo_neutral['Date'])
)

# Write the result to the processed data folder, without the index
path_avg_score_per_label_wo_neutral_csv = os.path.join(
    path_data_processed, 'analyst_ratings_avg_score_per_label_wo_neutral.csv'
)
df_avg_score_per_label_wo_neutral.to_csv(
    path_avg_score_per_label_wo_neutral_csv, index=False
)

# Weighted Average per Label and Date

## With "neutral"

In [37]:
# Number of news per date and label, one count column per label
df_count_label = (
    df_raw
    # Count the non-null titles of each (date, label) pair
    .groupby(['Date', 'Label'], as_index=False)['title']
    .count()
    # Spread the labels into columns, indexed by date
    .pivot_table(index='Date', columns=['Label'], values='title')
    .reset_index(drop=False)
    # A label that is absent on a date counts as zero news
    .fillna(0)
    # Prefix the label columns to mark them as counts
    .rename(
        columns={
            'negative': 'n_negative',
            'neutral': 'n_neutral',
            'positive': 'n_positive',
        }
    )
)

# Parse the Date column into datetime values
df_count_label['Date'] = pd.to_datetime(df_count_label['Date'])



In [38]:
# Rename the label columns of the per-label average scores so they
# follow the same naming pattern as the count columns (score_* / n_*)
df_avg = df_avg_score_per_label.rename(
    columns={
        'negative': 'score_negative',
        'neutral': 'score_neutral',
        'positive': 'score_positive',
    }
)

# Attach the number of news per label to the average scores;
# both frames are joined on their Date column
df_weighted_avg = df_avg.merge(
    df_count_label,
    on='Date',
    how='left'
)

# Total number of news per date
df_weighted_avg['n_total'] = (
    df_weighted_avg['n_negative']
    + df_weighted_avg['n_neutral']
    + df_weighted_avg['n_positive']
)

# Weighted average score per label:
# average_score * (number_of_news / total_number_of_news)
# NOTE(review): a label without news on a date has a missing average
# score, so its weighted value stays missing instead of becoming 0 --
# confirm this is the intended output.
for label in ('negative', 'neutral', 'positive'):
    df_weighted_avg[label] = (
        df_weighted_avg['score_' + label]
        * (df_weighted_avg['n_' + label] / df_weighted_avg['n_total'])
    )

# Keep only the final columns. The explicit copy makes the result an
# independent frame, so the Date assignment below does not write to a
# column subset of the merged frame (which triggers pandas'
# SettingWithCopyWarning).
df_weighted_avg = df_weighted_avg[['Date', 'negative', 'neutral', 'positive']].copy()

# Parse the Date column into datetime values
df_weighted_avg['Date'] = pd.to_datetime(df_weighted_avg['Date'])

# Write the result to the processed data folder, without the index
df_weighted_avg.to_csv(
    os.path.join(
        path_data_processed,
        'analyst_ratings_weighted_avg_per_label.csv'
    ),
    index=False
)

## Without "neutral"

In [39]:
# Number of non-neutral news per date and label, one count column
# per label
df_count_label_wo_neutral = (
    df_wo_neutral
    # Count the non-null titles of each (date, label) pair
    .groupby(['Date', 'Label'], as_index=False)['title']
    .count()
    # Spread the labels into columns, indexed by date
    .pivot_table(index='Date', columns=['Label'], values='title')
    .reset_index(drop=False)
    # A label that is absent on a date counts as zero news
    .fillna(0)
    # Prefix the label columns to mark them as counts
    .rename(
        columns={
            'negative': 'n_negative',
            'positive': 'n_positive',
        }
    )
)

# Parse the Date column into datetime values
df_count_label_wo_neutral['Date'] = pd.to_datetime(
    df_count_label_wo_neutral['Date']
)

In [40]:
# Rename the label columns of the per-label average scores so they
# follow the same naming pattern as the count columns (score_* / n_*)
df_avg_wo_neutral = df_avg_score_per_label_wo_neutral.rename(
    columns={
        'negative': 'score_negative',
        'positive': 'score_positive',
    }
)

# Attach the number of news per label to the average scores;
# both frames are joined on their Date column
df_weighted_avg_wo_neutral = df_avg_wo_neutral.merge(
    df_count_label_wo_neutral,
    on='Date',
    how='left'
)

# Total number of negative and positive news per date
df_weighted_avg_wo_neutral['n_total'] = (
    df_weighted_avg_wo_neutral['n_negative']
    + df_weighted_avg_wo_neutral['n_positive']
)

# Weighted average score per label:
# average_score * (number_of_news / total_number_of_news)
# NOTE(review): a label without news on a date has a missing average
# score, so its weighted value stays missing instead of becoming 0 --
# confirm this is the intended output.
for label in ('negative', 'positive'):
    df_weighted_avg_wo_neutral[label] = (
        df_weighted_avg_wo_neutral['score_' + label]
        * (
            df_weighted_avg_wo_neutral['n_' + label]
            / df_weighted_avg_wo_neutral['n_total']
        )
    )

# Keep only the final columns. The explicit copy makes the result an
# independent frame, so the Date assignment below does not write to a
# column subset of the merged frame (which triggers pandas'
# SettingWithCopyWarning).
df_weighted_avg_wo_neutral = df_weighted_avg_wo_neutral[['Date', 'negative', 'positive']].copy()

# Parse the Date column into datetime values
df_weighted_avg_wo_neutral['Date'] = pd.to_datetime(df_weighted_avg_wo_neutral['Date'])

# Write the result to the processed data folder, without the index
df_weighted_avg_wo_neutral.to_csv(
    os.path.join(
        path_data_processed,
        'analyst_ratings_weighted_avg_per_label_wo_neutral.csv'
    ),
    index=False)