<a href="https://colab.research.google.com/github/KhyatiGaurana/StockPulse/blob/main/MarketAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install transformers
# !pip install sentencepiece
# !pip install torch

In [2]:
#  import nltk
#  nltk.download('punkt')

In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [4]:
# Hugging Face checkpoint used for summarisation; the tokenizer and model are
# loaded once at module level and reused by summarize_articles().
# NOTE(review): loading this checkpoint warns that the encoder/decoder
# `embed_positions` weights are newly initialised — presumably harmless for
# Pegasus, but confirm before trusting the summaries.
model_name="human-centered-summarization/financial-summarization-pegasus"
tokenizer=PegasusTokenizer.from_pretrained(model_name)
model=PegasusForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Ticker symbols used for the notebook-level exploration cells.
stock_tickers=['HAL','POLYCAB','RELIANCE']

In [6]:
def search_for_stock_news(ticker):
    """Return every link found on a Google News search for Moneycontrol news on ``ticker``.

    Args:
        ticker: Stock symbol to search for (e.g. ``'HAL'``).

    Returns:
        A list with the raw ``href`` value of each anchor on the results page.
        The values are unfiltered; pass them through ``remove_unwanted_urls``.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    stocks_url = "https://www.google.com/search"
    # Let requests build the query string so that symbols containing reserved
    # characters (e.g. 'M&M') are percent-encoded instead of corrupting the URL.
    query = {"q": f"moneycontrol news {ticker}", "tbm": "nws"}
    # requests has no default timeout; without one a stalled connection hangs forever.
    req = requests.get(stocks_url, params=query, timeout=10)
    soup = BeautifulSoup(req.text, 'html.parser')
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError on link['href'].
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

In [7]:
# Raw search-result links per ticker. Runs one HTTP request per ticker when this cell executes.
# The API handler builds its own mapping per request and does not read this one.
urls={ticker:search_for_stock_news(ticker) for ticker in stock_tickers}

In [8]:
# urls

In [9]:
import re

In [10]:
# Substrings that cause a link to be discarded by remove_unwanted_urls().
excluded_words=['maps', 'policies', 'preferences', 'accounts', 'support', 'search']

In [11]:
def remove_unwanted_urls(urls, excluded_words):
    """Extract clean, unique article URLs from raw search-result links.

    A link is kept only if it contains ``'https://'`` and none of the
    ``excluded_words``. The first ``http(s)://...`` run inside the link is
    extracted and everything from the first ``'&'`` onward is dropped.

    Args:
        urls: Iterable of raw ``href`` strings.
        excluded_words: Substrings that disqualify a link.

    Returns:
        A list of unique cleaned URLs, in order of first appearance.
    """
    # A dict is used as an ordered set: unlike list(set(...)), the result order
    # is deterministic, so positions stay stable between runs.
    cleaned = {}
    for url in urls:
        if 'https://' not in url or any(word in url for word in excluded_words):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # e.g. a bare 'https://' followed by whitespace: nothing to extract.
            continue
        cleaned[match.group(0).split('&')[0]] = None
    return list(cleaned)


In [12]:
# cleaned_urls={ticker: remove_unwanted_urls(urls[ticker], excluded_words) for ticker in stock_tickers}

In [13]:
from nltk.tokenize import word_tokenize

def scrape_urls(urls, timeout=10):
    """Download each URL and return the text of its ``<p>`` elements.

    Args:
        urls: Iterable of article URLs.
        timeout: Seconds to wait for each HTTP request. requests has no default
            timeout, so without it one unresponsive host stalls the whole run.

    Returns:
        A list with one string per URL, in the same order as ``urls``. Each
        string is the paragraph text tokenised with ``word_tokenize`` and
        re-joined with single spaces.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    articles = []
    for url in urls:
        req = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(req.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join(paragraph.text for paragraph in paragraphs)
        # Tokenising and re-joining normalises whitespace between tokens.
        words = word_tokenize(text)
        articles.append(' '.join(words))
    return articles

In [14]:
# articles={ticker:scrape_urls(cleaned_urls[ticker]) for ticker in stock_tickers}

In [15]:
# len(articles['HAL'][1][:350])

In [16]:
def summarize_articles(articles):
    """Summarise each article with the module-level Pegasus ``tokenizer`` and ``model``.

    Only the first 350 characters of each article are fed to the model.
    Generation uses beam search with 5 beams, early stopping, and a maximum
    output length of 100.

    Args:
        articles: Iterable of article texts.

    Returns:
        A list of summary strings, one per article, in input order.
    """
    def _summarize(text):
        # Character-level truncation happens before tokenisation.
        snippet = text[:350]
        encoded = tokenizer.encode(snippet, return_tensors='pt')
        generated = model.generate(encoded, max_length=100, num_beams=5, early_stopping=True)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarize(text) for text in articles]

In [17]:
# summaries={ticker:summarize_articles(articles[ticker]) for ticker in stock_tickers}

In [18]:
# summaries

In [19]:
from transformers import pipeline
# Text-classification pipeline applied to the generated summaries; each result
# is read downstream through its 'label' and 'score' keys.
sentiment=pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

In [20]:
# scores={ticker:sentiment(summaries[ticker]) for ticker in stock_tickers}

In [21]:
# scores

In [22]:
def create_output(stock_tickers, summaries, scores, cleaned_urls):
    """Combine URLs, summaries and sentiment results into one mapping.

    Args:
        stock_tickers: Iterable of ticker symbols to include.
        summaries: Mapping of ticker to a list of summary strings.
        scores: Mapping of ticker to a list of dicts with ``'label'`` and
            ``'score'`` keys, aligned with ``summaries[ticker]``.
        cleaned_urls: Mapping of ticker to a list of URLs, aligned with
            ``summaries[ticker]``.

    Returns:
        A dict mapping each ticker to a list of ``[url, summary, label, score]``
        rows, one row per summary.
    """
    report = {}
    for ticker in stock_tickers:
        # The number of summaries drives the row count; URLs and scores are
        # looked up by the same position.
        report[ticker] = [
            [
                cleaned_urls[ticker][position],
                summary,
                scores[ticker][position]['label'],
                scores[ticker][position]['score'],
            ]
            for position, summary in enumerate(summaries[ticker])
        ]
    return report


In [23]:
# final_output=create_output(stock_tickers, summaries, scores, cleaned_urls)
# final_output

In [24]:
# import csv
# with open('stockSummaries.csv', mode='w', newline='') as f:
#     csv_writer=csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     csv_writer.writerows(final_output)

In [25]:
# !pip install fastapi nest-asyncio pyngrok uvicorn

In [26]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# FastAPI application exposing the analysis pipeline over HTTP.
app = FastAPI()

# Allow cross-origin requests from any site so a browser front end can call the API.
# Credentials stay disabled: the FastAPI CORS docs say allow_origins,
# allow_methods and allow_headers cannot be ['*'] when allow_credentials is True.
# This API does not use cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=False,
    allow_methods=['*'],
    allow_headers=['*'],
)

@app.get('/')
async def analyze_stocks_handler(tickers: str = 'HAL'):
    """Run the full news -> summary -> sentiment pipeline for the requested tickers.

    Args:
        tickers: Comma-separated stock symbols taken from the ``tickers`` query
            parameter (e.g. ``/?tickers=HAL,RELIANCE``). Defaults to ``'HAL'``,
            which matches the previously hard-coded behaviour.

    Returns:
        A dict mapping each ticker to a list of ``[url, summary, label, score]`` rows.
    """
    # NOTE(review): every step below is blocking (HTTP requests, model inference)
    # and runs inside an async handler, so concurrent requests are served one at a time.
    # Split comma-separated symbols into a list, ignoring blanks such as 'HAL,,'.
    stock_tickers = [symbol.strip() for symbol in tickers.split(',') if symbol.strip()]
    urls={ticker:search_for_stock_news(ticker) for ticker in stock_tickers}
    excluded_words=['maps', 'policies', 'preferences', 'accounts', 'support', 'search']
    cleaned_urls={ticker: remove_unwanted_urls(urls[ticker], excluded_words) for ticker in stock_tickers}
    articles={ticker:scrape_urls(cleaned_urls[ticker]) for ticker in stock_tickers}
    summaries={ticker:summarize_articles(articles[ticker]) for ticker in stock_tickers}
    scores={ticker:sentiment(summaries[ticker]) for ticker in stock_tickers}
    final_output=create_output(stock_tickers, summaries, scores, cleaned_urls)
    return final_output



In [27]:
import getpass

from pyngrok import ngrok, conf

# Prompt for the ngrok authtoken without echoing it, so it never lands in the notebook.
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")
conf.get_default().auth_token = getpass.getpass()

# Open a TCP ngrok tunnel to the SSH server
# NOTE(review): this publishes the machine's SSH port (22) on a public address.
# The FastAPI app does not use it; drop this cell if remote shell access is not needed.
connection_string = ngrok.connect("22", "tcp").public_url

# str.strip("tcp://") removes any of the characters t, c, p, ':' and '/' from
# both ends rather than the literal prefix, which corrupts hosts such as
# "tcp://tunnel.example:22". Split off the scheme explicitly instead.
ssh_url, port = connection_string.split("://", 1)[-1].rsplit(":", 1)
print(f" * ngrok tunnel available, access with `ssh root@{ssh_url} -p{port}`")

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth
··········
 * ngrok tunnel available, access with `ssh root@0.tcp.ngrok.io -p16082`


In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Expose local port 8000 through ngrok and show the public address.
ngrok_tunnel = ngrok.connect(8000)
print('Public URL:', ngrok_tunnel.public_url)
# Applied before uvicorn.run(); presumably needed because the notebook kernel
# already runs an event loop — TODO confirm.
nest_asyncio.apply()
# Blocks this cell while the server runs; the port must match the tunnel above.
uvicorn.run(app, port=8000)

Public URL: https://c607-34-148-177-13.ngrok-free.app


INFO:     Started server process [65929]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     106.208.67.20:0 - "OPTIONS / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "OPTIONS / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK




INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "GET / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "OPTIONS / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "OPTIONS / HTTP/1.1" 200 OK
INFO:     106.208.67.20:0 - "OPTIONS / HTTP/1.1" 200 OK
