In [2]:
import os
import re
from datetime import datetime, timedelta

import feedparser
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import praw
import requests
import torch
import yfinance as yf
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pytrends.request import TrendReq
from scipy import interpolate
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [3]:

import logging

# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

def setup_environment():
    """Download the NLTK resources used by the text-preprocessing helpers.

    Requests the tokenizer data used by ``word_tokenize`` and the stop-word
    corpus read by ``load_stopwords``. Takes no arguments and returns nothing;
    the only effect is the calls to ``nltk.download``.
    """
    # 'punkt_tab' is requested as well because a later cell of this notebook
    # downloads it before tokenizing (presumably required by newer NLTK
    # releases; confirm against the installed version).
    for resource in ('punkt', 'punkt_tab', 'stopwords'):
        nltk.download(resource)

def load_stopwords(language='spanish'):
    """Return the NLTK stop words for ``language`` as a set.

    Args:
        language: Name of the stop-word list to load (default ``'spanish'``).

    Returns:
        A ``set`` built from ``stopwords.words(language)``.
    """
    word_list = stopwords.words(language)
    return set(word_list)

In [4]:
def get_stock_data(ticker, start_date, end_date):
    """Download daily price data for ``ticker`` and return it with a ``Date`` column.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Start of the requested range, forwarded as ``start``.
        end_date: End of the requested range, forwarded as ``end``.

    Returns:
        A DataFrame with the former index moved into a datetime ``Date``
        column and single-level column names.
    """
    data = yf.download(ticker, start=start_date, end=end_date, interval='1d')

    # Some yfinance versions return two-level columns (price field, ticker)
    # even for a single symbol. Drop the ticker level when it carries only one
    # value so that data['Close'], data['Volume'], ... are plain Series.
    columns = data.columns
    if isinstance(columns, pd.MultiIndex) and columns.nlevels == 2:
        if columns.get_level_values(1).nunique() == 1:
            data = data.droplevel(1, axis=1)

    # Build a new frame rather than mutating the downloaded one in place.
    data = data.reset_index()
    data['Date'] = pd.to_datetime(data['Date'])
    return data


In [5]:
def get_google_news_rss_feed(feed_url):
    """Fetch articles from a Google News RSS feed URL.

    Args:
        feed_url: URL handed to ``feedparser.parse``.

    Returns:
        A list of dicts with keys ``'title'``, ``'content'`` (the entry
        summary) and ``'date'`` (a ``datetime`` built from the entry's
        ``published_parsed`` value, or ``None`` when it is missing). An empty
        list is returned when the feed yields no entries.
    """
    feed = feedparser.parse(feed_url)

    if not feed.entries:
        if feed.bozo:
            print("Error parsing feed:", feed.bozo_exception)
        else:
            print("No entries found in the feed.")
        return []

    if feed.bozo:
        # The bozo flag can be set while entries were still parsed; report it
        # but keep the entries instead of discarding the whole feed.
        print("Feed parsed with warnings:", feed.bozo_exception)

    articles = []
    for entry in feed.entries:
        # .get() also covers a key that is present but holds None.
        published_parsed = entry.get('published_parsed')
        articles.append({
            'title': entry.get('title', 'No title'),
            'content': entry.get('summary', 'No summary'),
            'date': datetime(*published_parsed[:6]) if published_parsed else None,
        })
    return articles

In [6]:
def get_reddit_posts(subreddits, query, limit=10, client=None):
    """Search several subreddits and collect the matching submissions.

    Args:
        subreddits: Iterable of subreddit names to search.
        query: Either a search string, or an iterable of search terms. An
            iterable is joined with ``' OR '`` into one search string
            (assumes the search endpoint honours the OR operator; confirm).
        limit: Maximum number of results requested per subreddit.
        client: Object exposing ``subreddit(name)``; defaults to the
            module-level ``reddit`` instance.

    Returns:
        A list of dicts with keys ``'title'``, ``'content'`` (the
        submission's ``selftext``) and ``'created'`` (``created_utc``
        converted from seconds to a pandas timestamp).
    """
    if client is None:
        # Fall back to the notebook-level client, as the original code did.
        client = reddit

    if not isinstance(query, str):
        query = ' OR '.join(query)

    posts = []
    for subreddit in subreddits:
        print(f"Searching in subreddit: {subreddit}")
        for submission in client.subreddit(subreddit).search(query, limit=limit):
            posts.append({
                'title': submission.title,
                'content': submission.selftext,
                'created': pd.to_datetime(submission.created_utc, unit='s'),
            })
    return posts

In [7]:
def get_google_trends_data(keywords, timeframe='today 3-m', geo='MX', hl='es-MX', tz=360):
    """Query Google Trends interest-over-time for ``keywords``.

    Args:
        keywords: List of search terms passed to ``build_payload``.
        timeframe: Trends timeframe string (default ``'today 3-m'``).
        geo: Region code forwarded to ``build_payload`` (default ``'MX'``).
        hl: Host-language setting given to ``TrendReq`` (default ``'es-MX'``).
        tz: Timezone offset given to ``TrendReq`` (default ``360``).

    Returns:
        Whatever ``TrendReq.interest_over_time()`` returns for the payload.
    """
    pytrends = TrendReq(hl=hl, tz=tz)
    pytrends.build_payload(keywords, cat=0, timeframe=timeframe, geo=geo, gprop='')
    return pytrends.interest_over_time()

In [8]:

def preprocess_text(text, stop_words):
    """Normalize Spanish text for downstream processing.

    Lower-cases the text, removes URLs, strips every character that is not a
    Spanish letter or whitespace, tokenizes with ``word_tokenize`` and drops
    the tokens found in ``stop_words``.

    Args:
        text: Raw text. Anything that is not a ``str`` is treated as empty.
        stop_words: Container of tokens to discard.

    Returns:
        The remaining tokens joined by single spaces (``''`` for empty input).
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Keep only letters; 'ü' is included so words such as "pingüino" survive.
    text = re.sub(r'[^a-záéíóúüñ\s]', '', text)
    tokens = word_tokenize(text, language='spanish')
    return ' '.join(word for word in tokens if word not in stop_words)

In [9]:
"""Sentiment Analysis"""

from transformers import pipeline, BertForSequenceClassification, BertTokenizer

# Load the stop words, model, tokenizer, and pipeline once
stop_words = load_stopwords('spanish')
model = BertForSequenceClassification.from_pretrained(
    "ahmedrachid/FinancialBERT-Sentiment-Analysis", num_labels=3
)
tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
# Use the first CUDA device when one is available and fall back to the CPU
# (-1) otherwise, so the notebook also runs on machines without a GPU.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

def get_sentiment(text):
    """Classify ``text`` with the module-level ``sentiment_pipeline``.

    Args:
        text: Text to score. Non-string or blank input is not sent to the model.

    Returns:
        The first element of the pipeline's result for ``text`` (callers in
        this notebook read its ``"score"`` entry), or ``0.0`` when the input
        is empty/not a string or the pipeline raises.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0

    try:
        # Let the tokenizer cut the input at 512 tokens. Slicing to 512
        # characters does not bound the token count and can discard text
        # that would still fit.
        return sentiment_pipeline(text, truncation=True, max_length=512)[0]
    except Exception as e:
        # Best effort: one bad document should not abort the whole scoring run.
        print(f"Error processing text: {e}")
        return 0.0

In [10]:
"""Data aggregation and analysis"""
def aggregate_sentiment_by_date(df, sentiment_column):
    """Return the mean of ``sentiment_column`` for each distinct ``Date``.

    Args:
        df: Frame containing a ``'Date'`` column and ``sentiment_column``.
        sentiment_column: Name of the column to average.

    Returns:
        A frame with one row per date and the columns ``'Date'`` and
        ``sentiment_column``.
    """
    per_date = df.groupby('Date')
    daily_mean = per_date[sentiment_column].mean()
    return daily_mean.reset_index()


In [11]:
# Download the last 365 days of daily data for the ^MXX index, ending today
end_date = datetime.today()
start_date = end_date - timedelta(days=365)
stock_data = get_stock_data(
    ticker='^MXX',
    start_date=start_date.strftime('%Y-%m-%d'),
    end_date=end_date.strftime('%Y-%m-%d'),
)

[*********************100%***********************]  1 of 1 completed


In [12]:
# Pull the Google News RSS results for "Bolsa Mexicana de Valores" and report how many arrived
bolsa_rss_url = "https://news.google.com/rss/search?q=Bolsa+Mexicana+de+Valores&hl=es-419&gl=MX&ceid=MX:es-419"
news_articles = get_google_news_rss_feed(bolsa_rss_url)
print(
    f"Collected {len(news_articles)} articles."
    if news_articles
    else "No articles were collected."
)

Collected 100 articles.


In [43]:
# Search terms tracked on Google Trends; the result's index is moved into a column
keywords = ['BMV', 'Acciones']
trends_data = get_google_trends_data(keywords).reset_index()

TooManyRequestsError: The request failed: Google returned a response with code 429

In [None]:
# The fetch cell already moves the date index into a column. Only reset here
# when no date column exists yet, so this cell neither adds a stray 'index'
# column nor breaks when it is run again.
if 'date' not in trends_data.columns and 'Date' not in trends_data.columns:
    trends_data = trends_data.reset_index()

# Percentage change of each keyword versus the previous observation
for keyword in keywords:
    trends_data[f'{keyword}_pct_change'] = trends_data[keyword].pct_change() * 100

# A rise of more than this many percent counts as a significant increase
significant_threshold = 40

# Flag significant increases: 1 when the change exceeds the threshold, else 0
# (NaN compares as False and therefore maps to 0).
for keyword in keywords:
    trends_data[f'{keyword}_sig_increase'] = (
        trends_data[f'{keyword}_pct_change'] > significant_threshold
    ).astype(int)

# The trends score is the number of keywords with a significant increase
trend_sig_columns = [f'{keyword}_sig_increase' for keyword in keywords]
trends_data['sentiment_score'] = trends_data[trend_sig_columns].sum(axis=1)

# Standardize the date column: name it 'Date' and drop the time of day
trends_data = trends_data.rename(columns={'date': 'Date'})
trends_data['Date'] = pd.to_datetime(trends_data['Date']).dt.normalize()

In [None]:
# Credentials come from the environment so they are never stored in the
# notebook; a missing variable raises KeyError before any request is made.
# NOTE(review): the key pair that used to be hard-coded here should be treated
# as leaked and rotated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="marketsentimentbmv",
)
reddit.read_only = True

subreddits = ['MexicoBursatil', 'MexicoFinanciero']
# The search takes one query string; the terms are combined with OR instead
# of being passed as a tuple (assumes the search honours OR; confirm).
query = 'Acciones OR BMV OR Comprar OR Vender OR Alza OR Baja'
reddit_posts = get_reddit_posts(subreddits, query)


In [None]:
# NLTK resources needed by load_stopwords and word_tokenize
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = load_stopwords('spanish')
reddit_df = pd.DataFrame(reddit_posts)

# Keep the calendar day of each post and discard the raw timestamp
reddit_df['Date'] = reddit_df['created'].dt.normalize()
reddit_df = reddit_df.drop(columns='created')

# Clean title and body together so both feed the sentiment model
reddit_df['content_clean'] = [
    preprocess_text(title + ' ' + content, stop_words)
    for title, content in zip(reddit_df['title'], reddit_df['content'])
]


In [None]:
# Build the news frame: rename 'date' to 'Date' and drop the summary column
news_df = (
    pd.DataFrame(news_articles)
    .rename(columns={'date': 'Date'})
    .drop(columns='content')
)

In [None]:

import argostranslate.package
import argostranslate.translate

from_code = "es"
to_code = "en"

# Install the es -> en translation package only when it is missing, so
# re-running the cell does not download and reinstall it every time.
already_installed = any(
    pkg.from_code == from_code and pkg.to_code == to_code
    for pkg in argostranslate.package.get_installed_packages()
)
if not already_installed:
    argostranslate.package.update_package_index()
    available_packages = argostranslate.package.get_available_packages()
    package_to_install = next(
        (
            pkg
            for pkg in available_packages
            if pkg.from_code == from_code and pkg.to_code == to_code
        ),
        None,
    )
    if package_to_install is None:
        # Fail with a readable message instead of a bare StopIteration.
        raise RuntimeError(
            f"No Argos Translate package available for {from_code} -> {to_code}"
        )
    argostranslate.package.install_from_path(package_to_install.download())

# Translate the cleaned Reddit text from Spanish to English
reddit_df['translated'] = reddit_df['content_clean'].apply(
    lambda x: argostranslate.translate.translate(x, from_code, to_code)
)

In [None]:
# Translate every news headline from Spanish to English
news_df['translated'] = news_df['title'].map(
    lambda headline: argostranslate.translate.translate(headline, from_code, to_code)
)

In [None]:
# Run the sentiment model over the translated text of both sources
news_df['sentiment_score'] = news_df['translated'].map(get_sentiment)
reddit_df['sentiment_score'] = reddit_df['translated'].map(get_sentiment)

In [None]:
news_df = news_df.rename(columns={'date': 'Date'})

# The pipeline's "score" is the confidence of the predicted label, so on its
# own it cannot tell good news from bad. Sign it by the label:
# positive -> +score, negative -> -score, neutral -> 0.
# NOTE(review): assumes the model emits the labels 'positive' / 'negative' /
# 'neutral'; any other label maps to 0 here. Confirm against the model config.
label_sign = {'positive': 1.0, 'negative': -1.0, 'neutral': 0.0}
news_df["sentiment_score"] = news_df["sentiment_score"].apply(
    lambda x: label_sign.get(str(x.get("label", "")).lower(), 0.0) * x["score"]
    if isinstance(x, dict)
    else x  # get_sentiment's 0.0 fallback, or a value converted on an earlier run
)

desired_sentiment = ['Date', 'sentiment_score']
# Copy so later in-place edits do not write into a slice of news_df.
news_score_df = news_df[desired_sentiment].copy()

In [None]:
# Convert the pipeline result into a signed score: positive -> +score,
# negative -> -score, neutral -> 0. Values that are not dicts (the 0.0
# fallback of get_sentiment, or values converted on an earlier run) are kept,
# which also prevents the TypeError that x["score"] raised on a float.
# NOTE(review): assumes the labels 'positive' / 'negative' / 'neutral';
# confirm against the model config.
reddit_label_sign = {'positive': 1.0, 'negative': -1.0, 'neutral': 0.0}
reddit_df["sentiment_score"] = reddit_df["sentiment_score"].apply(
    lambda x: reddit_label_sign.get(str(x.get("label", "")).lower(), 0.0) * x["score"]
    if isinstance(x, dict)
    else x
)
# Copy so later in-place edits do not write into a slice of reddit_df.
reddit_score_df = reddit_df[desired_sentiment].copy()

In [None]:
# Copy the two columns so later assignments do not write into a slice of trends_data.
trends_score_df = trends_data[desired_sentiment].copy()

In [None]:
# Volatility: rolling 30-row standard deviation of daily returns, scaled by sqrt(30)
stock_data['Returns'] = stock_data['Close'].pct_change()
stock_data['Volatility'] = stock_data['Returns'].rolling(window=30).std() * np.sqrt(30)

# Fill the warm-up NaNs with the column mean. The result is assigned back
# because an in-place fillna on a selected column may act on a temporary copy
# and leave the frame unchanged.
stock_data['Volatility'] = stock_data['Volatility'].fillna(stock_data['Volatility'].mean())

# Each metric is also expressed relative to its own rolling 90-row average
stock_data['Volatility_90d_avg'] = stock_data['Volatility'].rolling(window=90).mean()
stock_data['Volatility_vs_90d_avg'] = stock_data['Volatility'] / stock_data['Volatility_90d_avg']

# Volume
stock_data['Volume'] = stock_data['Volume'].astype(float)
stock_data['Volume_90d_avg'] = stock_data['Volume'].rolling(window=90).mean()
stock_data['Volume_vs_90d_avg'] = stock_data['Volume'] / stock_data['Volume_90d_avg']

# Momentum: change in close versus the previous row
stock_data['Momentum'] = stock_data['Close'] - stock_data['Close'].shift(1)
stock_data['Momentum_90d_avg'] = stock_data['Momentum'].rolling(window=90).mean()
stock_data['Momentum_vs_90d_avg'] = stock_data['Momentum'] / stock_data['Momentum_90d_avg']

In [None]:
stock_desired = ['Date', 'Volatility_vs_90d_avg', 'Volume_vs_90d_avg', 'Momentum_vs_90d_avg']
# Work on a copy: an in-place fillna on a column of a slice may modify a
# temporary object and leave the NaNs in place.
stock_score_df = stock_data[stock_desired].copy()

# Replace missing ratios (the rolling-window warm-up rows) with each column's mean
ratio_columns = [column for column in stock_desired if column != 'Date']
stock_score_df[ratio_columns] = stock_score_df[ratio_columns].fillna(
    stock_data[ratio_columns].mean()
)

In [None]:
def trends_to_value(score):
    """Collapse a trends score into a binary flag.

    Returns 1 when ``score`` equals 1 or 2, and 0 for every other value.
    """
    return 1 if (score == 2 or score == 1) else 0

# Apply the conversion. assign() returns a new frame, so nothing is written
# into a frame that may be a slice of trends_data.
trends_score_df = trends_score_df.assign(
    sentiment_score=trends_score_df['sentiment_score'].apply(trends_to_value)
)

In [None]:
def filter_df(df, dates):
    """Align ``df`` to the calendar given by ``dates``.

    Both ``Date`` columns are converted to ``'%Y-%m-%d'`` strings (which also
    drops any time of day), rows of ``df`` whose date is not in ``dates`` are
    discarded, and the remainder is outer-merged with ``dates`` so that every
    date of ``dates`` appears at least once; dates without a match in ``df``
    carry NaN in the other columns.

    Args:
        df: Frame with a ``'Date'`` column. Not modified.
        dates: Frame with a ``'Date'`` column listing the dates to keep.
            Not modified.

    Returns:
        A new frame sorted by ``'Date'`` (as strings) with a fresh index.
    """
    # Work on copies: the same ``dates`` frame is passed to several calls and
    # ``df`` may be a slice of a larger frame.
    df = df.copy()
    dates = dates.copy()

    df["Date"] = pd.to_datetime(df["Date"]).dt.strftime("%Y-%m-%d")
    dates["Date"] = pd.to_datetime(dates["Date"]).dt.strftime("%Y-%m-%d")

    filtered_df = df[df["Date"].isin(dates["Date"])]
    filtered_df = pd.merge(filtered_df, dates, on="Date", how="outer")

    return filtered_df.sort_values(by="Date").reset_index(drop=True)


In [None]:
# The dates present in the stock data serve as the reference calendar
dates_df = stock_score_df[["Date"]].copy()

# Align every sentiment source to that calendar
filtered_news, filtered_reddit, filtered_trends = (
    filter_df(scores, dates_df)
    for scores in (news_score_df, reddit_score_df, trends_score_df)
)


In [None]:
# Average each source's score per date using the notebook's aggregation helper
trends_score_grouped = aggregate_sentiment_by_date(filtered_trends, 'sentiment_score')
news_score_grouped = aggregate_sentiment_by_date(filtered_news, 'sentiment_score')
reddit_score_grouped = aggregate_sentiment_by_date(filtered_reddit, 'sentiment_score')


In [None]:
# Dates without any observation get the mean score of their source. The result
# is assigned back because an in-place fillna on a selected column may act on
# a temporary copy and leave the frame unchanged.
trends_score_grouped['sentiment_score'] = trends_score_grouped['sentiment_score'].fillna(
    trends_score_grouped['sentiment_score'].mean()
)
news_score_grouped['sentiment_score'] = news_score_grouped['sentiment_score'].fillna(
    news_score_grouped['sentiment_score'].mean()
)
reddit_score_grouped['sentiment_score'] = reddit_score_grouped['sentiment_score'].fillna(
    reddit_score_grouped['sentiment_score'].mean()
)

In [None]:
# Combine the three sentiment sources on 'Date'. News and Reddit share the
# column name 'sentiment_score', so they receive the '_news' / '_reddit'
# suffixes; the trends score keeps the plain name.
sentiment_data = (
    news_score_grouped
    .merge(reddit_score_grouped, on='Date', how='outer', suffixes=('_news', '_reddit'))
    .merge(trends_score_grouped, on='Date', how='outer')
)

# Convert 'Date' back to datetime before joining with the stock metrics
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'])

# Attach the financial metrics
sentiment_data = sentiment_data.merge(stock_score_df, on='Date', how='outer')


In [None]:
# Scale the stock and sentiment features

from sklearn.preprocessing import MinMaxScaler

# Columns rescaled with min-max scaling before they are weighted
features_to_normalize = [
    'Volatility_vs_90d_avg',
    'Volume_vs_90d_avg',
    'Momentum_vs_90d_avg',
    'sentiment_score_news',
    'sentiment_score_reddit',
    'sentiment_score',
]

# Fit on the selected columns, then overwrite them with their scaled values
scaler = MinMaxScaler().fit(sentiment_data[features_to_normalize])
sentiment_data[features_to_normalize] = scaler.transform(
    sentiment_data[features_to_normalize]
)


In [None]:
# Weight of each feature in the composite index
weights = dict(
    sentiment_score_news=0.15,
    sentiment_score_reddit=0.15,
    sentiment_score=0.10,
    Volatility_vs_90d_avg=0.25,
    Volume_vs_90d_avg=0.175,
    Momentum_vs_90d_avg=0.175,
)

In [None]:
# Feature names, in the order they appear in the weights mapping
features = list(weights)


In [None]:
def calculate_sentiment_index(row, weights):
    """Return the weighted sum of a row's features, multiplied by 100.

    Args:
        row: Mapping-like object indexed by feature name.
        weights: Mapping of feature name to weight; every key is looked up
            in ``row``.

    Returns:
        ``100 * sum(row[feature] * weight)`` over all entries of ``weights``.
        Values are used as they are; a missing value is not treated specially.
    """
    total = 0
    for feature in weights:
        total += weights[feature] * row[feature]
    return total * 100

# Composite index per row: weighted sum of the features, multiplied by 100
sentiment_data['Sentiment'] = sentiment_data.apply(
    calculate_sentiment_index, axis=1, weights=weights
)

# Copy the two columns so later edits to sentiment_index do not write into a
# slice of sentiment_data.
sentiment_index = sentiment_data[['Date', 'Sentiment']].copy()
print(sentiment_index)

In [None]:
# Smooth the index with a span-7 exponential moving average. It is always
# derived from the raw 'Sentiment' column of sentiment_data, so re-running
# this cell does not smooth already-smoothed values, and nothing is written
# into a slice.
sentiment_index = sentiment_data[['Date', 'Sentiment']].assign(
    Sentiment=lambda frame: frame['Sentiment'].ewm(span=7, adjust=False).mean()
)


In [None]:
# Add the span-7 exponential moving average of the raw index as its own column
sentiment_data = sentiment_data.assign(
    ema_sentiment=sentiment_data['Sentiment'].ewm(span=7, adjust=False).mean()
)
display(sentiment_data)

In [None]:
import matplotlib.pyplot as plt

# Plot the raw index with its smoothed version. The smoothed series is the
# span-7 exponential moving average computed above, so it is labelled as such
# rather than as a 90-day moving average.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sentiment_data['Date'], sentiment_data['Sentiment'], label='Sentiment index', alpha=0.4)
ax.plot(
    sentiment_data['Date'],
    sentiment_data['ema_sentiment'],
    label='EMA (span=7)',
    linestyle='--',
)
ax.set_title('Sentiment Index vs. EMA (span=7)')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Write the index to CSV, leaving out the row index
sentiment_index.to_csv(path_or_buf='sentiment_index.csv', index=False)