# 📊 Data Sourcing Guide for Projects

This notebook demonstrates various methods to source data for your projects, including:
1. Installing required libraries
2. Downloading from public repositories
3. Fetching data via APIs
4. Web scraping
5. Loading local files

## Install Required Libraries

In [None]:
# Install required libraries
!pip install requests pandas beautifulsoup4 lxml kaggle-api

In [None]:
import requests
import zipfile
import os
from pathlib import Path

def download_kaggle_dataset(dataset_name, download_path="./data"):
    """
    Download a dataset or a competition bundle from Kaggle.

    Kaggle datasets are identified as "owner/dataset-slug", while competitions
    use a bare slug such as "titanic". The matching API call is selected by
    checking whether the name contains a "/".

    Note: Requires kaggle API key and kaggle package installed

    Args:
        dataset_name: "owner/dataset-slug" for a dataset, or a competition slug.
        download_path: Directory to download into; created if it is missing.

    Returns:
        True if the download call completed, False if any error occurred
        (the error is printed, not raised).
    """
    try:
        # Create download directory
        Path(download_path).mkdir(parents=True, exist_ok=True)

        # Imported inside the try so a missing or unconfigured kaggle package
        # is reported through the except branch below
        import kaggle

        if "/" in dataset_name:
            # "owner/slug" -> a dataset
            kaggle.api.dataset_download_files(dataset_name, path=download_path, quiet=False)
        else:
            # bare slug -> a competition
            kaggle.api.competition_download_files(dataset_name, path=download_path, quiet=False)

        print(f"Downloaded Kaggle dataset: {dataset_name}")
        return True
    except Exception as e:
        print(f"Error downloading from Kaggle: {e}")
        return False

def download_from_url(url, filename, download_path="./data", timeout=30):
    """
    Download a file from a direct URL and stream it to disk.

    Args:
        url: Address of the file to fetch.
        filename: Name to save the file under inside download_path.
        download_path: Target directory; created if it is missing.
        timeout: Seconds passed to requests as the request timeout, so a
            stalled server cannot block the call forever.

    Returns:
        The path of the saved file, or None if any error occurred
        (the error is printed, not raised).
    """
    try:
        Path(download_path).mkdir(parents=True, exist_ok=True)
        filepath = os.path.join(download_path, filename)

        # The context manager releases the streamed connection even if
        # writing the file fails part-way through
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks that carry no data
                        f.write(chunk)

        print(f"Downloaded file: {filename}")
        return filepath
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None

# Example usage
# Download from UCI ML Repository
# iris_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# download_from_url(iris_url, "iris.csv")

# Download from Kaggle (requires API setup)
# download_kaggle_dataset("titanic", "./data/titanic")

## Download Data from Public Repositories

You can download datasets from platforms like Kaggle, UCI Machine Learning Repository, or GitHub.

## Fetch Data Using APIs

Many services provide APIs to access their data programmatically.

In [None]:
# Example: Fetch GitHub repository information
def get_github_repo_info(owner, repo, timeout=30):
    """
    Get information about a GitHub repository.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        timeout: Seconds passed to requests as the request timeout, so an
            unresponsive server cannot block the call forever.

    Returns:
        The decoded JSON response, or None if the request failed
        (the error is printed, not raised).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching GitHub data: {e}")
        return None

# Example usage
# repo_info = get_github_repo_info("octocat", "Hello-World")
# if repo_info:
#     print(f"Repository: {repo_info['full_name']}")
#     print(f"Stars: {repo_info['stargazers_count']}")
#     print(f"Forks: {repo_info['forks_count']}")

# Example: Fetch cryptocurrency data from CoinGecko API
def get_crypto_data(crypto_id="bitcoin", timeout=30):
    """
    Get current cryptocurrency data from the CoinGecko simple-price endpoint.

    Args:
        crypto_id: Coin identifier sent as the 'ids' query parameter.
        timeout: Seconds passed to requests as the request timeout, so an
            unresponsive server cannot block the call forever.

    Returns:
        The decoded JSON response (USD price and 24h change are requested),
        or None if the request failed (the error is printed, not raised).
    """
    url = "https://api.coingecko.com/api/v3/simple/price"
    params = {
        'ids': crypto_id,
        'vs_currencies': 'usd',
        'include_24hr_change': 'true'
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching crypto data: {e}")
        return None

# Example usage
# btc_data = get_crypto_data("bitcoin")
# if btc_data:
#     print(f"Bitcoin price: ${btc_data['bitcoin']['usd']}")
#     print(f"24h change: {btc_data['bitcoin']['usd_24h_change']:.2f}%")

In [None]:
import requests
import json

# Example: Fetch weather data from OpenWeatherMap API
def get_weather_data(city, api_key, timeout=30):
    """
    Fetch current weather data for a city from the OpenWeatherMap API.

    Args:
        city: City name sent as the 'q' query parameter.
        api_key: OpenWeatherMap API key sent as the 'appid' query parameter.
        timeout: Seconds passed to requests as the request timeout, so an
            unresponsive server cannot block the call forever.

    Returns:
        The decoded JSON response (metric units are requested), or None if
        the request failed (the error is printed, not raised).
    """
    # HTTPS so the API key in the query string is not sent in clear text
    base_url = "https://api.openweathermap.org/data/2.5/weather"
    params = {
        'q': city,
        'appid': api_key,
        'units': 'metric'
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching weather data: {e}")
        return None

# Example usage (replace with your API key)
# weather_data = get_weather_data("London", "your_api_key_here")
# print(json.dumps(weather_data, indent=2))

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

def scrape_table_from_url(url, table_index=0, timeout=30):
    """
    Scrape a table from a webpage and return it as a pandas DataFrame.

    Column names come from the <th> cells of the table's <thead> when one
    exists; otherwise the first row of the table is used as the header.

    Args:
        url: Page to fetch.
        table_index: Position of the wanted <table> among all tables on the
            page (negative values count from the end).
        timeout: Seconds passed to requests as the request timeout, so an
            unresponsive server cannot block the call forever.

    Returns:
        A DataFrame of stripped cell text, or None if the table does not
        exist or any error occurred (a message is printed, nothing is raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        tables = soup.find_all('table')

        # Reject indexes that are out of range in either direction
        if not -len(tables) <= table_index < len(tables):
            print(f"No table found at index {table_index}")
            return None

        table = tables[table_index]
        all_rows = table.find_all('tr')

        # Extract headers and decide which rows hold data
        thead = table.find('thead')
        if thead is not None:
            headers = [th.text.strip() for th in thead.find_all('th')]
            # Only rows inside <thead> are header rows; blindly dropping the
            # first <tr> would lose a data row when <thead> contains no <tr>
            data_rows = [
                tr for tr in all_rows
                if not any(parent is thead for parent in tr.parents)
            ]
        else:
            # No <thead>: treat the first row as the header
            headers = []
            if all_rows:
                headers = [cell.text.strip() for cell in all_rows[0].find_all(['th', 'td'])]
            data_rows = all_rows[1:]

        # Extract data rows, skipping rows without any cells
        rows = []
        for row in data_rows:
            cols = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            if cols:
                rows.append(cols)

        # Create DataFrame (default integer column labels when no header was found)
        df = pd.DataFrame(rows, columns=headers or None)
        return df

    except Exception as e:
        print(f"Error scraping data: {e}")
        return None

# Example usage
# url = "https://en.wikipedia.org/wiki/List_of_countries_by_population"
# population_data = scrape_table_from_url(url, table_index=0)
# print(population_data.head())

## Load Data from Local Files

Pandas provides easy methods to load data from various file formats.

In [None]:
import pandas as pd
import json
import os

def load_csv_file(file_path):
    """Read a CSV file into a DataFrame and print a short summary.

    Prints the path, the shape and the column names on success.
    Returns the DataFrame, or None after printing the error on failure.
    """
    try:
        frame = pd.read_csv(file_path)
        summary = (
            f"Loaded CSV file: {file_path}",
            f"Shape: {frame.shape}",
            f"Columns: {list(frame.columns)}",
        )
        print(*summary, sep="\n")
    except Exception as err:
        print(f"Error loading CSV: {err}")
        return None
    return frame

def load_json_file(file_path):
    """Parse a UTF-8 JSON file and print a brief description of its content.

    Prints the record count for a top-level list, or the keys for a
    top-level dict. Returns the parsed data, or None after printing the
    error on failure.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.loads(handle.read())

        print(f"Loaded JSON file: {file_path}")
        if isinstance(payload, dict):
            print(f"Keys: {list(payload.keys())}")
        elif isinstance(payload, list):
            print(f"Number of records: {len(payload)}")
    except Exception as err:
        print(f"Error loading JSON: {err}")
        return None
    return payload

def load_excel_file(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook and print a short summary.

    Prints the path, the requested sheet and the shape on success.
    Returns what pd.read_excel produced, or None after printing the error
    on failure.
    """
    try:
        sheet = pd.read_excel(file_path, sheet_name=sheet_name)
        # Printed one at a time so earlier lines still appear if a later
        # attribute lookup fails
        print(f"Loaded Excel file: {file_path}")
        print(f"Sheet: {sheet_name}")
        print(f"Shape: {sheet.shape}")
    except Exception as err:
        print(f"Error loading Excel: {err}")
        return None
    return sheet

# Example usage
# csv_data = load_csv_file("data/sample.csv")
# json_data = load_json_file("data/sample.json")
# excel_data = load_excel_file("data/sample.xlsx")

# Confirmation printed when this cell runs, after the loader functions are defined
print("Data loading functions defined. Ready to use!")

## Summary

This notebook has demonstrated various methods to source data for your projects:

1. **Library Installation**: Install necessary packages for data fetching and manipulation
2. **Public Repositories**: Download datasets from Kaggle, UCI, or GitHub
3. **API Integration**: Fetch real-time data from web services
4. **Web Scraping**: Extract data from websites using BeautifulSoup
5. **Local Files**: Load data from CSV, JSON, and Excel files

### Best Practices:
- Always check an API's terms of service before using it
- Respect website scraping policies (robots.txt)
- Handle errors gracefully in your data fetching code
- Validate data quality after loading
- Consider data licensing and usage rights

### Next Steps:
- Choose the appropriate data sourcing method for your project
- Implement proper error handling and data validation
- Consider data storage and processing pipelines
- Document your data sources and collection methods

Happy data sourcing! 📊