In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline

# Step 1: Scrape mining companies' data
def get_mining_companies():
    url = "https://en.wikipedia.org/wiki/List_of_mining_companies"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    companies = []

    # Example: Scrape table rows containing company names and descriptions
    for row in soup.select("table.wikitable tr")[1:]:  # Adjust selector based on the website structure
        cells = row.find_all("td")
        if len(cells) >= 2:
            company = cells[0].text.strip()
            description = cells[1].text.strip()
            companies.append({"Company": company, "Description": description})

    return pd.DataFrame(companies)

# Step 2: NLP for summarizing company information
def summarize_reports(companies):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    top_companies = []
    for index, row in companies.iterrows():
        if index >= 10:  # Limit to top 10 companies for summarization
            break
        # Placeholder: Replace this with actual scraped annual reports
        annual_report_text = f"Detailed insights about {row['Company']} in the mining sector. Placeholder text."
        summary = summarizer(annual_report_text, max_length=50, min_length=25, do_sample=False)
        top_companies.append({
            "Company": row["Company"],
            "Description": row["Description"],
            "Summary": summary[0]['summary_text']
        })

    return top_companies

# Step 3: Display and save the data
def main():
    print("Scraping mining companies data...")
    companies_df = get_mining_companies()

    print("Summarizing annual reports for top companies...")
    top_companies = summarize_reports(companies_df)

    # Save the full list to a CSV
    companies_df.to_csv("mining_companies.csv", index=False)

    # Display top companies in a table format
    top_companies_df = pd.DataFrame(top_companies)
    print(top_companies_df)

    # Save top companies data to a CSV
    top_companies_df.to_csv("top_mining_companies.csv", index=False)

if __name__ == "__main__":
    main()


Scraping mining companies data...
Summarizing annual reports for top companies...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


Empty DataFrame
Columns: []
Index: []
