# 🏛️ ASX Materials Companies Scraper  
### Scrape ASX-listed companies and filter for materials-sector companies and gold miners

---

### ⚙️ Summary
This script:
- Scrapes the **ASX Listed Companies CSV**  
- Extracts and cleans company details (ASX code, name, GICS group)  
- Filters for **materials sector** companies  
- Fetches company info from **Yahoo Finance**  
- Detects **gold-related companies** via text matching  
- Saves results to both **CSV** and **JSON**

**Output:**  
- `asx_materials_companies.csv`  
- `asx_materials_companies.json`

---

### Imports and Setup

In [None]:
import yfinance as yf
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import json
from typing import List, Dict, Optional
import re

### Function to Fetch ASX Companies

In [None]:
def _find_column(columns, *keywords: str):
    """Return the first column whose name contains every keyword, else None.

    Matching is case-insensitive. The ORIGINAL column label is returned (not a
    stripped copy) so it can be used directly as a rename key.
    """
    for col in columns:
        lowered = str(col).strip().lower()
        if all(keyword in lowered for keyword in keywords):
            return col
    return None


def _parse_asx_csv(csv_text: str) -> Optional[pd.DataFrame]:
    """Parse the ASX CSV text, probing header rows 0-3 for the expected columns.

    Returns a DataFrame whose company-name, ASX-code and GICS columns have been
    renamed to 'company_name', 'asx_code' and 'gics_industry_group', or None
    when no probed header row exposes all three columns.
    """
    import io

    for header_row in (0, 1, 2, 3):
        print(f"Trying header at row {header_row}...")
        try:
            # dtype=str + keep_default_na=False: codes/names such as "NA" or
            # "NULL" must stay text instead of being converted to NaN and
            # dropped later. Only truly empty cells count as missing.
            candidate = pd.read_csv(
                io.StringIO(csv_text),
                header=header_row,
                dtype=str,
                keep_default_na=False,
                na_values=[''],
            )
        except Exception as e:
            # Retry boundary: any parse failure just means "try the next row".
            print(f"Failed with header row {header_row}: {e}")
            continue

        print(f"Columns found: {list(candidate.columns)}")

        company_col = _find_column(candidate.columns, 'company', 'name')
        code_col = _find_column(candidate.columns, 'asx', 'code')
        gics_col = _find_column(candidate.columns, 'gics', 'industry')

        if company_col is None or code_col is None or gics_col is None:
            continue

        print(f"✓ Found valid structure at header row {header_row}")
        print(f"Company column: '{company_col}'")
        print(f"Code column: '{code_col}'")
        print(f"GICS column: '{gics_col}'")

        # Rename columns for consistency
        return candidate.rename(columns={
            company_col: 'company_name',
            code_col: 'asx_code',
            gics_col: 'gics_industry_group',
        })

    return None


def get_asx_companies() -> List[Dict]:
    """
    Download the ASX listed-companies CSV and return its rows as dictionaries.

    The CSV is parsed in memory (no temporary file is written). Header rows
    0-3 are probed until one exposes a company-name, an ASX-code and a GICS
    industry column.

    Returns:
        A list of dicts with keys 'name', 'code' and 'gics_industry_group'
        ('Unknown' when the GICS cell is empty). Rows missing a name or code
        are skipped. An empty list is returned, after printing the error, if
        the download or the parsing fails.
    """
    print("Fetching ASX company list...")

    url = "https://www.asx.com.au/asx/research/ASXListedCompanies.csv"

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # 'utf-8-sig' drops a leading BOM if present; undecodable bytes are
        # replaced rather than aborting the whole run.
        csv_text = response.content.decode('utf-8-sig', errors='replace')

        # First, let's examine the file structure
        print("Examining CSV file structure...")
        print("First 5 lines of CSV:")
        for i, line in enumerate(csv_text.splitlines()[:5]):
            print(f"Line {i}: {line.strip()[:100]}...")  # Show first 100 chars

        df = _parse_asx_csv(csv_text)
        if df is None:
            raise ValueError("Could not parse CSV with any header configuration")

        # Clean up the dataframe
        df = df.dropna(subset=['company_name', 'asx_code'])

        companies = [
            {
                'name': str(name).strip(),
                'code': str(code).strip(),
                'gics_industry_group': str(gics).strip() if pd.notna(gics) else 'Unknown',
            }
            for name, code, gics in zip(
                df['company_name'], df['asx_code'], df['gics_industry_group']
            )
        ]

        print(f"Found {len(companies)} ASX companies")
        return companies

    except Exception as e:
        # Top-level boundary: callers rely on an empty list to signal failure.
        print(f"Error fetching ASX companies: {e}")
        print("You can try downloading the CSV manually from:")
        print("https://www.asx.com.au/asx/research/ASXListedCompanies.csv")
        return []

### Helper Functions

In [None]:
def is_materials_company(gics_industry_group: str) -> bool:
    """
    Report whether a GICS industry group denotes the materials sector.

    Surrounding whitespace and letter case are ignored; the remaining text
    must be exactly 'materials'.
    """
    trimmed = gics_industry_group.strip()
    normalised = trimmed.lower()
    return normalised == 'materials'

# 'gold' as a whole word only. The word boundaries on BOTH sides reject
# substrings such as 'goldfields' and 'marigold'.
_GOLD_WORD_RE = re.compile(r'\bgold\b', re.IGNORECASE)


def is_gold_company(business_summary: str) -> bool:
    """
    Determine if a company is a gold company based on its business summary.

    Returns True when the summary contains 'gold' as a standalone word
    (case-insensitive). Words that merely contain 'gold', such as
    'goldfields' or 'marigold', do not count. An empty or None summary
    returns False.
    """
    if not business_summary:
        return False

    return _GOLD_WORD_RE.search(business_summary) is not None

### Function to Fetch Company Info

In [None]:
def get_company_info(symbol: str, gics_industry_group: str) -> Optional[Dict]:
    """
    Fetch company information from Yahoo Finance for a given ASX symbol.

    The symbol is looked up as '<symbol>.AX'. No market-cap filter is applied:
    the former $20M-$500M restriction is disabled, so every materials company
    with data is returned.

    Args:
        symbol: ASX code without the '.AX' suffix.
        gics_industry_group: GICS industry group of the company; anything
            other than materials is skipped without contacting Yahoo Finance.

    Returns:
        A dict of company details, or None when the company is not a
        materials company, Yahoo Finance returned no data, or the lookup
        raised (the error is printed).
    """
    try:
        # Cheap local check first so non-materials symbols cost no request.
        if not is_materials_company(gics_industry_group):
            return None

        yahoo_symbol = f"{symbol}.AX"
        ticker = yf.Ticker(yahoo_symbol)
        info = ticker.info

        # Check if we got valid data
        if not info:
            return None

        # `or 0` (not a .get default): the key may be present with a None
        # value, which would break the numeric formatting below.
        market_cap = info.get('marketCap') or 0
        shares_outstanding = info.get('sharesOutstanding') or 0
        current_price = info.get('currentPrice') or 0
        summary = info.get('longBusinessSummary')

        return {
            'short_name': info.get('shortName', 'N/A'),
            'long_name': info.get('longName', 'N/A'),
            'stock_code': symbol,
            'market_cap': market_cap,
            'market_cap_formatted': f"${market_cap:,.0f}",
            'shares_outstanding': shares_outstanding,
            'shares_outstanding_formatted': f"{shares_outstanding:,.0f}",
            'gics_industry_group': gics_industry_group,
            'sector': info.get('sector', 'N/A'),
            'industry': info.get('industry', 'N/A'),
            'business_summary': summary or 'N/A',
            'is_gold_company': is_gold_company(summary or ''),
            'current_price': current_price,
            'currency': info.get('currency', 'AUD'),
            'address1': info.get('address1', 'N/A'),
            'address2': info.get('address2', 'N/A'),
            'city': info.get('city', 'N/A'),
            'state': info.get('state', 'N/A'),
            'zip': info.get('zip', 'N/A'),
            'country': info.get('country', 'N/A'),
            'phone': info.get('phone', 'N/A'),
            'website': info.get('website', 'N/A'),
            'sectors': info.get('sector', 'N/A'),  # Note: 'sectors' is same as 'sector' in yfinance
            'company_officers': info.get('companyOfficers', 'N/A')
        }

    except Exception as e:
        # Per-symbol boundary: one failed lookup must not abort the whole scan.
        print(f"Error fetching data for {symbol}: {e}")
        return None

### Main Execution

In [None]:
# Get list of ASX companies
print("Starting ASX Materials Companies Scraper")
print("=" * 50)

# Defined before the fetch so the results cell still runs (and reports
# "no companies") when the ASX list could not be downloaded.
materials_companies = []
processed_count = 0

asx_companies = get_asx_companies()

if not asx_companies:
    print("Failed to fetch ASX companies list. Exiting.")
else:
    # Filter for Materials companies upfront
    materials_companies_list = [
        company for company in asx_companies
        if is_materials_company(company['gics_industry_group'])
    ]
    total_materials = len(materials_companies_list)

    print(f"\nProcessing {total_materials} Materials companies...")
    print("This may take a while due to API rate limiting...\n")

    for processed_count, company in enumerate(materials_companies_list, 1):
        # Progress line every 50 companies
        if processed_count % 50 == 0:
            print(f"Processed {processed_count}/{total_materials} companies...")

        company_info = get_company_info(company['code'], company['gics_industry_group'])

        if company_info:
            materials_companies.append(company_info)
            gold_indicator = " 🏆" if company_info['is_gold_company'] else ""
            print(f"✓ Found: {company_info['short_name']} ({company_info['stock_code']}) - {company_info['market_cap_formatted']}{gold_indicator}")

### Display and Save Results

In [None]:
# Report the collected materials companies, then persist them to CSV and JSON.
print("\n" + "=" * 80)
# No market-cap filter is applied upstream, so the header does not claim one.
print(f"RESULTS: Found {len(materials_companies)} materials companies")
print("=" * 80)

if materials_companies:
    # Sort by market cap (descending); a missing cap sorts as 0
    materials_companies.sort(key=lambda x: x['market_cap'] or 0, reverse=True)

    # Display detailed results
    for i, company in enumerate(materials_companies, 1):
        gold_indicator = " 🏆 [GOLD]" if company['is_gold_company'] else ""
        # Yahoo Finance may supply None for these fields; fall back so the
        # formatting below cannot raise.
        current_price = company['current_price'] or 0
        summary = company['business_summary'] or 'N/A'

        print(f"\n{i}. {company['short_name']} ({company['stock_code']}){gold_indicator}")
        print(f"   Market Cap: {company['market_cap_formatted']}")
        print(f"   Outstanding Shares: {company['shares_outstanding_formatted']}")
        print(f"   Current Price: ${current_price:.2f} {company['currency']}")
        print(f"   GICS Industry Group: {company['gics_industry_group']}")
        print(f"   Sector: {company['sector']}")
        print(f"   Industry: {company['industry']}")
        print(f"   Is Gold Company: {company['is_gold_company']}")

        # Keep the console output short: first 200 characters only
        if len(summary) > 200:
            summary = summary[:200] + "..."
        print(f"   Description: {summary}")
        print("-" * 80)

    # Save to CSV
    df = pd.DataFrame(materials_companies)
    csv_filename = 'asx_materials_companies.csv'
    df.to_csv(csv_filename, index=False)
    print(f"\nResults saved to: {csv_filename}")

    # Save to JSON (explicit encoding so the output does not depend on the
    # platform's default locale)
    json_filename = 'asx_materials_companies.json'
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(materials_companies, f, indent=2)
    print(f"Detailed results saved to: {json_filename}")
else:
    print("No companies found matching the criteria.")