In [9]:
import requests
import json
from bs4 import BeautifulSoup

def read_news_article(url):
    """Download a web page and return the text of its article-like elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text of every element matched by the selectors below, each
        followed by a newline, concatenated in selector order. Returns ''
        when the request fails, times out, or the server answers with a
        status code other than 200.
    """
    try:
        response = requests.get(url, timeout=5)
    except requests.exceptions.RequestException:
        # Timeout, ConnectionError, invalid URL, ... all derive from
        # RequestException, so a single clause covers every failure mode
        # of requests.get.
        return ''

    # Do not scrape error pages (404, 403 bot walls, 5xx, ...): their text
    # is not article content.
    if response.status_code != 200:
        return ''

    soup = BeautifulSoup(response.content, 'html.parser')

    # Candidate containers for article text, tried in this order.
    # NOTE(review): the selectors overlap (e.g. a `p` nested in an `article`
    # matches both entries), so the same text can be emitted more than once.
    article_tags = [
        {'tag': 'div', 'attrs': {'class': 'article-content'}},
        {'tag': 'div', 'attrs': {'class': 'caas-body'}},
        {'tag': 'article'},
        {'tag': 'section'},
        {'tag': 'main'},
        {'tag': 'p'},
        {'tag': 'div', 'attrs': {'class': 'entry-content'}},
        {'tag': 'div', 'attrs': {'class': 'content'}},
        {'tag': 'div', 'attrs': {'class': 'body'}},
        {'tag': 'div', 'attrs': {'id': 'content'}},
        {'tag': 'div', 'attrs': {'class': 'article'}},
        {'tag': 'section', 'attrs': {'class': 'article-body'}},
        {'tag': 'section', 'attrs': {'class': 'content'}},
        {'tag': 'section', 'attrs': {'class': 'entry'}},
        {'tag': 'main', 'attrs': {'class': 'article-content'}},
        {'tag': 'main', 'attrs': {'class': 'content'}},
        {'tag': 'main', 'attrs': {'id': 'main-content'}},
    ]

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = []
    for tag_info in article_tags:
        matches = soup.find_all(tag_info['tag'], attrs=tag_info.get('attrs', {}))
        for element in matches:
            pieces.append(element.get_text() + "\n")

    return "".join(pieces)

In [10]:
import requests

def is_grammatically_correct(sentence):
    """Ask the public LanguageTool API whether a sentence has any issues.

    Args:
        sentence: Text to check (sent as en-US).

    Returns:
        True when the API reports no matches, or when the response has an
        unexpected shape. False when the API reports at least one match,
        when the request fails or times out, or when the response body is
        not valid JSON.
    """
    url = "https://api.languagetool.org/v2/check"

    # Parameters for the API request
    payload = {
        "text": sentence,
        "language": "en-US",
    }

    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.post(url, data=payload, timeout=10)
        data = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers every JSON decode error variant (stdlib json,
        # simplejson, and requests' own JSONDecodeError).
        return False

    # Check if any errors were found
    if isinstance(data, dict) and "matches" in data:
        return len(data["matches"]) == 0

    return True  # Return True if no errors found or API response is unexpected

In [11]:
import nltk
import re
from nltk.sentiment import SentimentIntensityAnalyzer
# Fetch the NLTK data packages this notebook asks for; nltk reports
# "already up-to-date" when a package is present locally.
for _nltk_resource in ('punkt', 'vader_lexicon'):
    nltk.download(_nltk_resource)

def find_sentences_with_keyword(text, keywords):
    """Return the grammar-checked sentences of `text` that mention a keyword.

    Args:
        text: Raw article text.
        keywords: A single keyword or phrase (one string); matching is a
            case-insensitive substring test.

    Returns:
        The matching sentences that pass `check_grammar`, in document order,
        with every run of whitespace collapsed to a single space.
    """
    # Split after '.' or '?' followed by whitespace, except after
    # abbreviation-like patterns such as "e.g." or "Mr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    needle = keywords.lower()

    sentences_with_keyword = []
    for sentence in sentences:
        # Collapse line breaks and repeated whitespace into single spaces.
        # Deleting "\n" outright would fuse the words on either side of the
        # break ("foo\nbar" -> "foobar") and corrupt the grammar check.
        normalized = " ".join(sentence.split())
        if needle in normalized.lower():
            sentences_with_keyword.append(normalized)

    return check_grammar(sentences_with_keyword)

def check_grammar(sentences):
    """Keep only the sentences that `is_grammatically_correct` accepts.

    Args:
        sentences: List of sentence strings.

    Returns:
        A new list with the accepted sentences, in their original order.
    """
    return [
        candidate
        for candidate in sentences
        if is_grammatically_correct(candidate)
    ]

def perform_sentiment_analysis(sentence):
    """Score a sentence with VADER and return its sentiment bucket.

    Args:
        sentence: Text to analyse.

    Returns:
        The result of `categorize_sentiment` applied to the sentence's
        'compound' polarity score.
    """
    # Build the analyzer once and reuse it: this function is called once per
    # sentence, and a fresh SentimentIntensityAnalyzer per call repeats the
    # same setup work every time.
    analyzer = getattr(perform_sentiment_analysis, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        perform_sentiment_analysis._analyzer = analyzer

    sentiment_scores = analyzer.polarity_scores(sentence)

    return categorize_sentiment(sentiment_scores['compound'])

def categorize_sentiment(sentiment_score):
    """Map a numeric sentiment score onto an integer bucket from -4 to 4.

    Each bucket is identified by its inclusive lower bound; anything
    strictly below -0.75 lands in bucket -4. A value that compares false
    against every bound (e.g. NaN) yields None.
    """
    # (inclusive lower bound, bucket), ordered from highest to lowest.
    lower_bounds = (
        (0.75, 4),
        (0.5, 3),
        (0.25, 2),
        (0.05, 1),
        (-0.05, 0),
        (-0.25, -1),
        (-0.5, -2),
        (-0.75, -3),
    )

    for lower_bound, bucket in lower_bounds:
        if sentiment_score >= lower_bound:
            return bucket

    if sentiment_score < -0.75:
        return -4
    return None

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hello\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [12]:
import re
import random

def find_keywords_in_articles(articles, keywords, max_sentences_per_keyword=15):
    """Count sentiment buckets for keyword sentences across several articles.

    Args:
        articles: Iterable of article URLs.
        keywords: Iterable of keyword/phrase strings to look for.
        max_sentences_per_keyword: Upper limit on the sentences scored per
            (article, keyword) pair; a random sample of that size is used
            when more are found. Defaults to 15.

    Returns:
        A list of 9 counters. Index `i` holds the number of scored sentences
        whose bucket from `perform_sentiment_analysis` is `i - 4`, i.e.
        index 0 is bucket -4 and index 8 is bucket 4.
    """
    sentiment_score = [0] * 9

    for article_url in articles:
        article_content = read_news_article(article_url)
        if not article_content:
            # Nothing was downloaded; no keyword can match.
            continue

        for keyword in keywords:
            # Escape the keyword so regex metacharacters in it (".", "+",
            # "(", ...) are matched literally instead of being interpreted.
            pattern = r'\b{}\b'.format(re.escape(keyword))
            if not re.search(pattern, article_content, re.IGNORECASE):
                continue

            target_sentences = find_sentences_with_keyword(article_content, keyword)
            if len(target_sentences) > max_sentences_per_keyword:
                target_sentences = random.sample(
                    target_sentences, max_sentences_per_keyword
                )

            for sentence in target_sentences:
                bucket = perform_sentiment_analysis(sentence)
                # Shift the bucket range -4..4 onto list indices 0..8.
                sentiment_score[bucket + 4] += 1

    return sentiment_score

def mainfunction(urls, combined_terms, date):
    """Score the given articles and append one CSV row to newsdata.csv.

    Args:
        urls: Article URLs to analyse.
        combined_terms: Keywords to search for; 'Semiconductor' is always
            added to the search. The caller's list is left unmodified.
        date: String written as the first column of the row.

    The row has the form "<date>,<c0>,...,<c8>", where the counts come from
    `find_keywords_in_articles`.

    Note: `append_file` must already be defined when this function is
    called; in this notebook it is defined in a later cell.
    """
    # Build a new list rather than using `+=`, which would extend the
    # caller's list in place.
    search_terms = list(combined_terms) + ['Semiconductor']
    sentiment_list = find_keywords_in_articles(urls, search_terms)

    appended_string = ",".join([date] + [str(sentiment) for sentiment in sentiment_list])

    append_file("newsdata.csv", appended_string)

In [13]:
from pygooglenews import GoogleNews
import random
from datetime import datetime, timedelta

# default GoogleNews instance
gn = GoogleNews(lang = 'en', country = 'US')

# One-day search window [startDate, endDate) that is advanced a day at a time.
startDatetime = datetime.strptime('2023-08-21', "%Y-%m-%d")
endDatetime = datetime.strptime('2023-08-22', "%Y-%m-%d")
startDate = startDatetime.strftime("%Y-%m-%d")
endDate = endDatetime.strftime("%Y-%m-%d")

# NOTE: mainfunction calls append_file, which is defined in a later cell of
# this notebook; that cell has to be run before this one on a fresh kernel.

# Compare dates with `<` instead of comparing formatted strings with `!=`:
# the latter never terminates if the start date is already past today.
while startDatetime.date() < datetime.now().date():

    combined_terms = ["Applied Materials"]
    articleurls = []

    articles_amat = gn.search('"Applied Materials"', from_ = startDate, to_ = endDate)
    articleurls.extend(entry['link'] for entry in articles_amat['entries'])

    articles_semi = gn.search('"Semiconductor"', from_ = startDate, to_ = endDate)
    articleurls.extend(entry['link'] for entry in articles_semi['entries'])

    # Cap the per-day workload at 18 randomly chosen articles.
    if (len(articleurls) > 18):
        articleurls = random.sample(articleurls, 18)

    mainfunction(articleurls, combined_terms, startDate)

    # Advance the window by one day.
    startDatetime = startDatetime + timedelta(days=1)
    endDatetime = endDatetime + timedelta(days=1)
    startDate = startDatetime.strftime("%Y-%m-%d")
    endDate = endDatetime.strftime("%Y-%m-%d")

    # Progress output: shows the window that will be processed next.
    print(startDate, endDate)

2023-08-22 2023-08-23


In [14]:
#append text to a file

def append_file(name, text):
    """Append `text` followed by a newline to the file `name`.

    Args:
        name: Path of the file; it is created if it does not exist.
        text: A string, or an iterable of strings that are concatenated
            without a separator before being written.
    """
    # The context manager closes the file even if a write raises.
    with open(name, "a") as file:
        file.write(''.join(text))
        file.write("\n")

In [15]:
#read text from a file

def read_file(name):
    """Return the entire content of the text file `name` as a string."""
    # The context manager closes the file even if reading raises.
    with open(name, "r") as file:
        return file.read()

In [16]:
#create a new file

def create_file(name):
    """Create an empty file at `name`, truncating it if it already exists."""
    # Opening in "w" mode creates or truncates the file; nothing needs to be
    # written. The context manager guarantees the handle is closed.
    with open(name, "w"):
        pass