# In [None]:
import os
import re
from datetime import date, timedelta

import gspread
import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.oauth2.service_account import Credentials

# API key for NewsAPI. The NEWSAPI_KEY environment variable takes precedence,
# so a different key can be supplied without editing this file.
# NOTE(review): the literal fallback is a credential committed to source
# control; rotate it and remove the fallback once NEWSAPI_KEY is configured
# wherever this script runs.
API_KEY = os.environ.get('NEWSAPI_KEY') or 'a2ba8940e94440a482a79029f460d29c'

# NewsAPI source ids that are queried for every keyword. The list mixes
# technology outlets with general news outlets; topical relevance comes from
# the keyword search, not from the sources themselves.
CYBERSECURITY_SOURCES = [
    'wired',
    'techcrunch',
    'engadget',
    'ars-technica',
    'the-verge',
    'bloomberg',
    'hacker-news',
    'recode',
    'next-big-future',
    'bbc-news',
    'cnn',
    'reuters',
    'the-wall-street-journal',
    'techradar',
    'business-insider',
    'the-washington-post',
    'new-scientist',
    'fortune',
    'les-echos',
    'handelsblatt',
    'wirtschafts-woche',
    'xinhua-net',
    'financial-post',
    'axios',
    'crypto-coins-news',
    'mashable',
    'the-next-web',
    'the-times-of-india',
    'politico',
    'vice-news',
    'national-review',
    'the-hindu',
    'google-news-in',
    'cbc-news',
    'fox-news',
    'le-monde',
    'rt',
    'google-news',
]

# Search terms sent to NewsAPI and reused later to validate the results
KEYWORDS = ['cybersecurity', 'data breach', 'ransomware', 'APT', 'cyber attack', 'malware', 'phishing', 'DDoS']

# Reporting window: from 30 days back up to and including today
today = date.today()
thirty_days_ago = today - timedelta(days=30)
# Upper bound passed as the API's 'to' date; it is one day past today so that
# articles published today fall inside the range
inclusive_today = today + timedelta(days=1)

# Fetch articles from NewsAPI with an inclusive date range
def _article_to_row(article):
    """Map one NewsAPI article dict onto the spreadsheet row layout.

    NewsAPI can return a key with a null value, in which case
    ``dict.get(key, default)`` yields ``None`` rather than the default, so
    every sliced or nested field is normalised with ``or`` first.
    """
    published_at = article.get('publishedAt') or ''
    content = article.get('content') or ''
    source = article.get('source') or {}
    return {
        'Date': published_at[:10],  # keep only the YYYY-MM-DD part
        'Headline': article.get('title'),
        'Description': article.get('description'),
        'Key Highlights': content[:200],
        'Link': article.get('url'),
        'Source Name': source.get('name') or 'Unknown Source',
        'Blog': '',
        'Blog Writer': ''
    }


def fetch_cybersecurity_articles_by_keywords(channels, keywords, days):
    """Query NewsAPI once per (keyword, source) pair and collect the results.

    Reads the module-level ``today``, ``inclusive_today`` and ``API_KEY``.

    Args:
        channels: Iterable of NewsAPI source ids.
        keywords: Iterable of search terms.
        days: How many days back from ``today`` the search window starts.

    Returns:
        A list of dicts with the keys 'Date', 'Headline', 'Description',
        'Key Highlights', 'Link', 'Source Name', 'Blog' and 'Blog Writer'.
        Requests that fail are reported on stdout and skipped, so the list
        may be partial.
    """
    all_articles = []
    cutoff_date = today - timedelta(days=days)

    for keyword in keywords:
        for source in channels:
            # Passing params lets requests URL-encode the values (keywords
            # contain spaces) instead of splicing them into the URL by hand.
            params = {
                'q': keyword,
                'sources': source,
                'from': cutoff_date.isoformat(),
                'to': inclusive_today.isoformat(),  # includes today's articles
                'sortBy': 'relevancy',
                'language': 'en',
                'apiKey': API_KEY,
            }
            try:
                # A timeout keeps one stalled connection from hanging the run.
                response = requests.get(
                    'https://newsapi.org/v2/everything', params=params, timeout=30
                )
            except requests.RequestException as exc:
                print(f"Failed to fetch from {source} for keyword {keyword}: {exc}")
                continue

            # The body is not guaranteed to be JSON (e.g. a proxy error page).
            try:
                payload = response.json()
            except ValueError:
                payload = {}
            if not isinstance(payload, dict):
                payload = {}

            if response.status_code == 200:
                for article in payload.get('articles') or []:
                    all_articles.append(_article_to_row(article))
            else:
                print(f"Failed to fetch from {source} for keyword {keyword}: {response.status_code} - {payload.get('message', 'No error message available')}")
    return all_articles

# Filter articles to ensure they are within the desired 30-day range before saving.
# NOTE(review): this cell reads `combined_articles`, which is only assigned
# further down the file. Running the file top to bottom used to raise
# NameError here, so the lookup now falls back to an empty list; the result is
# rebuilt from the validated articles later on in any case.
df = pd.DataFrame(globals().get('combined_articles', []), columns=[
    'Date', 'Headline', 'Description', 'Key Highlights', 'Link', 'Source Name', 'Blog', 'Blog Writer'
])

# Convert 'Date' column to datetime and filter by the last 30 days
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df[(df['Date'] >= pd.to_datetime(thirty_days_ago)) & (df['Date'] <= pd.to_datetime(today))]

# Drop rows with NaN values in the 'Date' column and sort by Date.
# dropna returns a new frame, which avoids an in-place edit on a filtered slice.
df = df.dropna(subset=['Date'])
df = df.sort_values(by='Date', ascending=True)


# Scrape GB Hackers (disabled: the function is wrapped in a string literal, so it is never defined)
"""
def scrape_gbhackers():
    url = 'https://gbhackers.com/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers)
    gbhackers_articles = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('div', class_='td-module-container')

        for article in articles:
            title_tag = article.find('h3')
            if title_tag:
                link_tag = title_tag.find('a')
                title = title_tag.get_text(strip=True) if title_tag else 'No Title'
                link = link_tag['href'] if link_tag else 'No Link'
                description_tag = article.find('div', class_='td-excerpt')
                description = description_tag.get_text(strip=True) if description_tag else "No description available"

                gbhackers_articles.append({
                    'Date': today.strftime('%Y-%m-%d'),
                    'Headline': title,
                    'Description': description,
                    'Key Highlights': description[:200],
                    'Link': link,
                    'Source Name': 'GBHackers',
                    'Blog': '',
                    'Blog Writer': ''
                })
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

    return gbhackers_articles
"""
# Scrape articles from The Cyber Express (disabled: the function is wrapped in a string literal, so it is never defined)
"""
def scrape_cyber_express():
    url = 'https://thecyberexpress.com/'
    response = requests.get(url)
    article_data = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article', class_='jeg_post')

        for article in articles:
            try:
                title_tag = article.find('h3', class_='jeg_post_title')
                title = title_tag.text.strip() if title_tag else 'No Title'
                link_tag = title_tag.find('a')
                article_url = link_tag['href'] if link_tag else 'No URL'
                description_tag = article.find('div', class_='jeg_post_excerpt')
                description = description_tag.text.strip() if description_tag else 'No Description'
                date_tag = article.find('div', class_='jeg_meta_date')
                pub_date = date_tag.text.strip() if date_tag else today.strftime('%Y-%m-%d')
                author_tag = article.find('div', class_='jeg_meta_author')
                author = author_tag.text.strip() if author_tag else 'No Author'

                article_data.append({
                    'Date': pub_date,
                    'Headline': title,
                    'Description': description,
                    'Key Highlights': '',
                    'Link': article_url,
                    'Source Name': 'The Cyber Express',
                    'Blog': '',
                    'Blog Writer': author
                })
            except Exception as e:
                print(f"Error while processing article: {e}")

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

    return article_data
"""

# Scrape articles from Kaspersky Labs (disabled: the function is wrapped in a string literal, so it is never defined)
"""
def scrape_kaspersky_labs():
    url = "https://www.kaspersky.com/blog"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)

    kaspersky_articles = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article', class_='c-card')

        for article in articles:
            try:
                title_tag = article.find('h3', class_='c-card__title')
                title = title_tag.text.strip() if title_tag else 'No title found'
                link_tag = title_tag.find('a') if title_tag else None
                link = link_tag['href'] if link_tag and 'href' in link_tag.attrs else 'No link found'
                description_tag = article.find('div', class_='c-card__desc')
                description = description_tag.text.strip() if description_tag else 'No description found'

                kaspersky_articles.append({
                    'Date': today.strftime('%Y-%m-%d'),
                    'Headline': title,
                    'Description': description,
                    'Key Highlights': description[:200],
                    'Link': link,
                    'Source Name': 'Kaspersky Labs',
                    'Blog': '',
                    'Blog Writer': ''
                })
            except Exception as e:
                print(f"Error while processing article: {e}")

    else:
        print(f"Failed to retrieve content. Status Code: {response.status_code}")

    return kaspersky_articles
"""
# Scrape articles from SentinelOne Labs (disabled: the function is wrapped in a string literal, so it is never defined)
"""
def scrape_sentinelone_labs():
    url = 'https://www.sentinelone.com/labs/'
    response = requests.get(url)
    sentinelone_articles = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article')

        for article in articles:
            try:
                title = article.find('h2')
                title_text = title.text.strip() if title else 'No Title'
                link = article.find('a', href=True)
                article_link = link['href'] if link else 'No Link'
                description = article.find('p').text.strip() if article.find('p') else 'No Description'

                sentinelone_articles.append({
                    'Date': today.strftime('%Y-%m-%d'),
                    'Headline': title_text,
                    'Description': description,
                    'Key Highlights': description[:200],
                    'Link': article_link,
                    'Source Name': 'SentinelOne Labs',
                    'Blog': '',
                    'Blog Writer': ''
                })
            except Exception as e:
                print(f"Error while processing article: {e}")

    else:
        print(f"Failed to retrieve page with status code {response.status_code}")

    return sentinelone_articles
"""

# Combine all articles from different sources
def combine_articles():
    """Collect the articles from every source that is currently enabled.

    Only the NewsAPI fetch is active. The calls to the site scrapers
    (GBHackers, The Cyber Express, Kaspersky Labs, SentinelOne Labs) are
    switched off, so nothing is concatenated onto the NewsAPI result.

    Returns:
        The list produced by fetch_cybersecurity_articles_by_keywords for
        CYBERSECURITY_SOURCES and KEYWORDS over a 30-day window.
    """
    newsapi_articles = fetch_cybersecurity_articles_by_keywords(
        CYBERSECURITY_SOURCES, KEYWORDS, 30
    )

    # Debug: report how many articles each enabled source contributed
    print(f"Fetched {len(newsapi_articles)} articles from NewsAPI.")

    return newsapi_articles


# Validate articles using specified keywords
def validate_articles(articles, keywords=None):
    """Keep the articles whose headline or description mentions a keyword.

    Matching is case-insensitive. Keywords of four characters or fewer are
    matched as whole words, because plain substring matching lets short terms
    such as 'APT' hit unrelated words ('laptop', 'adapts', 'chapter'). Longer
    keywords are still matched as substrings.

    Args:
        articles: Iterable of dicts with 'Headline' and 'Description' entries.
        keywords: Optional iterable of search terms; defaults to the
            module-level KEYWORDS.

    Returns:
        A list of the matching article dicts, in their original order.
        Articles whose headline or description is missing or null are dropped.
    """
    if keywords is None:
        keywords = KEYWORDS

    patterns = []
    for kw in keywords:
        pattern = re.escape(kw.lower())
        if len(kw) <= 4:
            pattern = rf"\b{pattern}\b"
        patterns.append(pattern)

    # An empty alternation would match every text, so bail out explicitly.
    if not patterns:
        return []
    matcher = re.compile("|".join(patterns))

    validated_articles = []
    for article in articles:
        headline = article.get('Headline')
        description = article.get('Description')
        if pd.notnull(headline) and pd.notnull(description):
            text = f"{headline} {description}".lower()
            if matcher.search(text):
                validated_articles.append(article)

    return validated_articles


# Save data to a CSV file to maintain records
def save_to_csv(df, csv_filename):
    """Merge ``df`` into the CSV at ``csv_filename`` and write the result back.

    Rows already in the file come first; when a 'Link' value appears more than
    once, only its first occurrence is kept. A missing or zero-byte file is
    treated as "no previous records".

    Args:
        df: DataFrame of new rows; must contain a 'Link' column when the file
            already holds data.
        csv_filename: Path of the CSV file to read and overwrite.
    """
    try:
        existing_df = pd.read_csv(csv_filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run, or the file was created but never written to.
        existing_df = None

    if existing_df is not None:
        df = pd.concat([existing_df, df]).drop_duplicates(subset=['Link']).reset_index(drop=True)

    df.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")

# Run the scraper
combined_articles = combine_articles()

# Validate the articles
validated_articles = validate_articles(combined_articles)

# Create a DataFrame from the validated articles
df = pd.DataFrame(validated_articles, columns=[
    'Date', 'Headline', 'Description', 'Key Highlights', 'Link', 'Source Name', 'Blog', 'Blog Writer'
])

# Convert 'Date' column to datetime; unparseable values become NaT
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Keep articles from the past 30 days, drop rows without a usable date and
# sort oldest first. Each step returns a new frame, so nothing is modified
# in place on a filtered slice.
df = df[df['Date'] >= pd.to_datetime(thirty_days_ago)]
df = df.dropna(subset=['Date'])
df = df.sort_values(by='Date', ascending=True)

# Store dates as 'YYYY-MM-DD' text from here on. Doing this before any
# blanking keeps the column datetime-typed while .dt is used; writing '' into
# a datetime column first would turn it into an object column and make
# .dt.strftime fail.
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

# Save the DataFrame to CSV to persist data. This happens before repeated
# dates are blanked so every persisted record keeps its own date.
csv_filename = 'cybersecurity_articles.csv'
save_to_csv(df, csv_filename)

# Merge the 'Date' column for articles on the same date: the frame is sorted
# by date, so every occurrence after the first is shown as an empty string.
df['Date'] = df['Date'].mask(df['Date'].duplicated(), '')

# Authenticate and write to Google Sheets
def update_google_sheet(df, sheet_name, credentials_path='credentials.json'):
    """Replace the contents of a spreadsheet's first worksheet with ``df``.

    Args:
        df: DataFrame to publish; its column names become the header row.
            The caller's DataFrame is not modified.
        sheet_name: Title of the spreadsheet to open.
        credentials_path: Path to the service-account JSON key file.
    """
    SCOPES = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive"]

    # Load credentials from the service account file
    creds = Credentials.from_service_account_file(credentials_path, scopes=SCOPES)

    # Authenticate and open the first worksheet of the spreadsheet
    client = gspread.authorize(creds)
    sheet = client.open(sheet_name).sheet1

    # Replace NaN with empty strings on a copy; fillna without inplace
    # returns a new frame and leaves the caller's data alone.
    rows = df.fillna('')

    # Header row followed by one list per DataFrame row
    data = [rows.columns.values.tolist()] + rows.values.tolist()

    # Clear the sheet and insert the new data starting at the first row
    sheet.clear()
    sheet.insert_rows(data, 1)

# Publish the prepared DataFrame to the first worksheet of the 'NA_new' spreadsheet
update_google_sheet(df, 'NA_new')
