In [9]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

global_sector_mapping = {
    "Technology": ["Apple", "Microsoft", "Samsung", "TSMC", "Sony", "SAP", "Tencent", "Alibaba", "Infosys"],
    "Consumer Discretionary": ["Amazon", "Tesla", "Toyota", "Nike", "JD.com", "Starbucks", "Booking.com"],
    "Healthcare": ["Pfizer", "Roche", "Sanofi", "AstraZeneca", "Novartis", "Johnson & Johnson"],
    "Financials": ["JPMorgan", "HSBC", "Goldman Sachs", "ICICI Bank", "UBS", "BNP Paribas", "DBS"],
    "Energy": ["ExxonMobil", "Shell", "BP", "TotalEnergies", "Saudi Aramco", "PetroChina", "Reliance"],
    "Industrials": ["Siemens", "GE", "Hitachi", "Caterpillar", "Schneider Electric", "Mitsubishi"],
    "Utilities": ["Enel", "Iberdrola", "NextEra", "EDF", "NTPC"],
    "Real Estate": ["Prologis", "Mitsui Fudosan", "Dexus", "Vonovia"],
    "Consumer Staples": ["Nestlé", "P&G", "Unilever", "Walmart", "Coca-Cola", "PepsiCo", "Colgate-Palmolive"],
    "Communication Services": ["Alphabet", "Meta", "Netflix", "Verizon", "China Mobile", "SoftBank", "SK Telecom"],
    "Materials": ["BHP", "Rio Tinto", "Vale", "ArcelorMittal", "Sibanye-Stillwater", "BASF"],
    "Others / Small Cap": ["Zomato", "Palantir", "Robinhood", "Unknown", "Smaller regional firms"]
}

def map_to_sector(text):
    text = text.lower()
    for sector, companies in global_sector_mapping.items():
        for company in companies:
            if re.search(rf"\\b{re.escape(company.lower())}\\b", text):
                return sector
    return None


In [10]:
import os
import requests
from dotenv import load_dotenv

load_dotenv()
finnhub_api_key = os.getenv("FINNHUB_API_KEY")  # Add your key to a .env file


In [11]:
symbol_mapping = {
    "Apple": "AAPL", "Microsoft": "MSFT", "Amazon": "AMZN", "Tesla": "TSLA",
    "Pfizer": "PFE", "JPMorgan": "JPM", "Alphabet": "GOOGL", "Meta": "META",
    "Johnson & Johnson": "JNJ", "Goldman Sachs": "GS", "ExxonMobil": "XOM",
    "Shell": "SHEL", "Walmart": "WMT", "Coca-Cola": "KO", "Netflix": "NFLX"
    # Add more as needed
}

def get_symbol(company):
    return symbol_mapping.get(company)


In [12]:

vader = SentimentIntensityAnalyzer()

finbert = pipeline(
    "text-classification",
    model="yiyanghkust/finbert-tone",
    tokenizer=AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone"),
    device=-1
)

def get_vader_score(text):
    return vader.polarity_scores(text)['compound']

def get_finbert_score(text):
    result = finbert(text[:512])[0]
    return result['score'] * (1 if result['label'] == 'Positive' else -1)


Device set to use cpu


In [13]:
from datetime import datetime, timedelta

def fetch_company_news(symbol):
    today = datetime.today().strftime('%Y-%m-%d')
    week_ago = (datetime.today() - timedelta(days=7)).strftime('%Y-%m-%d')

    url = f"https://finnhub.io/api/v1/company-news?symbol={symbol}&from={week_ago}&to={today}&token={finnhub_api_key}"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch news for {symbol}")
        return []
    return response.json()


In [14]:
news_data = []

for sector, companies in global_sector_mapping.items():
    for company in companies:
        symbol = get_symbol(company)
        if not symbol:
            continue
        
        articles = fetch_company_news(symbol)
        for article in articles:
            headline = article.get("headline", "")
            if headline:
                news_data.append({
                    "company": company,
                    "sector": sector,
                    "symbol": symbol,
                    "headline": headline,
                    "vader_score": get_vader_score(headline),
                    "finbert_score": get_finbert_score(headline),
                    "datetime": article.get("datetime")
                })


In [15]:
df = pd.DataFrame(news_data)
df.to_csv("finnhub_company_news_sentiment.csv", index=False)
print("Saved to finnhub_company_news_sentiment.csv")
df.head()


Saved to finnhub_company_news_sentiment.csv


Unnamed: 0,company,sector,symbol,headline,vader_score,finbert_score,datetime
0,Apple,Technology,AAPL,Impax U.S. Sustainable Economy Fund Q1 2025 Co...,0.0,-0.99997,1750134720
1,Apple,Technology,AAPL,"Goldman Sachs Recommends Buying Apple, Expects...",0.2263,0.995852,1750134474
2,Apple,Technology,AAPL,Apple (AAPL) Gets $235 Price Target from Morga...,0.1027,-0.861908,1750116769
3,Apple,Technology,AAPL,New Buffett-Inspired ETF Holds Berkshire and A...,0.0,-0.999601,1750112280
4,Apple,Technology,AAPL,Tracking Cliff Asness' AQR Capital Management ...,0.0,-0.998702,1750108183


In [16]:
sector_sentiment = df.groupby("sector")[["vader_score", "finbert_score"]].mean().sort_values("finbert_score", ascending=False)

print("\nAverage Sentiment Scores by Sector (sorted):")
print(sector_sentiment)

sector_sentiment.to_csv("sector_sentiment_summary.csv")



Average Sentiment Scores by Sector (sorted):
                        vader_score  finbert_score
sector                                            
Energy                     0.071269      -0.303653
Healthcare                 0.133605      -0.343090
Technology                 0.090432      -0.452394
Consumer Discretionary     0.085194      -0.494001
Financials                 0.037813      -0.500181
Consumer Staples           0.086438      -0.543182
Communication Services     0.073658      -0.564581
