In [8]:
import json
import os
from urllib.parse import quote

import certifi
import pandas as pd
import psycopg2
import urllib3
from sqlalchemy import create_engine
from urllib3 import request



In [9]:
# Extract data from a CSV file
def source_data_from_csv(csv_file):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    csv_file
        Anything ``pd.read_csv`` accepts as its first argument
        (path, URL or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents, or an empty DataFrame when reading fails.
    """
    try:
        df_csv = pd.read_csv(csv_file)
    except Exception as e:
        # Best-effort extraction: keep returning an empty frame, but report
        # the failure so a bad path is not mistaken for an empty file.
        print(f"An error occurred: {e}")
        df_csv = pd.DataFrame()
    return df_csv
    


    

In [10]:
# Extract data from a parquet file
def source_data_from_parquet(parquet_file):
    """Load a parquet file into a DataFrame.

    Parameters
    ----------
    parquet_file
        Anything ``pd.read_parquet`` accepts as its first argument
        (path, URL or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents, or an empty DataFrame when reading fails
        (including when no parquet engine is installed).
    """
    try:
        # Parquet is a binary columnar format; it must be read with
        # read_parquet, not the CSV parser.
        df_parquet = pd.read_parquet(parquet_file)
    except Exception as e:
        # Best-effort extraction: report the failure and fall back to an
        # empty frame so the rest of the pipeline can still run.
        print(f"An error occurred: {e}")
        df_parquet = pd.DataFrame()
    return df_parquet

In [11]:
# Extract data from the New York open-data API
def source_data_from_api(api_endpoint):
    """Fetch a JSON endpoint over HTTP and flatten it into a DataFrame.

    Parameters
    ----------
    api_endpoint : str
        URL requested with a GET.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of the decoded JSON body when the response
        status is 200; otherwise an empty DataFrame. Any exception raised
        while requesting or parsing is reported and also yields an empty
        DataFrame.
    """
    try:
        # Pool manager that verifies TLS certificates against certifi's CA bundle
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        api_response = http.request('GET', api_endpoint)
        api_status = api_response.status

        if api_status == 200:
            # Body is bytes; decode and parse before flattening nested keys
            data = json.loads(api_response.data.decode('utf-8'))
            df_api = pd.json_normalize(data)
        else:
            # Report non-200 responses instead of returning an empty frame
            # with no explanation.
            print(f"An error occurred: API request returned HTTP status {api_status}")
            df_api = pd.DataFrame()
    except Exception as e:
        print(f"An error occurred: {e}")
        df_api = pd.DataFrame()

    return df_api


In [12]:
# Extract data from a PostgreSQL table
def source_data_from_table(db_name, user, password, host, port, table_name):
    """Read every row of a PostgreSQL table into a DataFrame.

    Parameters
    ----------
    db_name, user, password, host, port
        Connection settings used to build the ``postgresql://`` URL.
    table_name : str
        Table to select from.

    Returns
    -------
    pandas.DataFrame
        Result of ``SELECT * FROM <table_name>``, or an empty DataFrame when
        connecting or querying fails (the error is printed).
    """
    engine = None
    try:
        # Percent-encode the credentials so characters such as '@', ':' or
        # '/' in a password cannot be mistaken for URL delimiters.
        safe_user = quote(str(user), safe='')
        safe_password = quote(str(password), safe='')
        db_url = f'postgresql://{safe_user}:{safe_password}@{host}:{port}/{db_name}'

        # Create an SQLAlchemy engine
        engine = create_engine(db_url)

        # NOTE(review): table_name is interpolated verbatim into the SQL text;
        # it must come from trusted configuration, never from user input.
        df_table = pd.read_sql(f"SELECT * FROM {table_name}", engine)
    except Exception as e:
        print(f"An error occurred: {e}")
        df_table = pd.DataFrame()
    finally:
        # Release pooled connections; the engine is not reused after this call.
        if engine is not None:
            engine.dispose()

    return df_table

In [13]:
# Extract a table from a web page (e.g. a Wikipedia article)
def source_data_from_html(url, matching_word):
    """Return the first HTML table on a page whose text matches a pattern.

    Parameters
    ----------
    url
        Page location passed to ``pd.read_html``.
    matching_word
        Passed as ``match=`` to ``pd.read_html`` to select tables.

    Returns
    -------
    pandas.DataFrame
        The first matching table, or an empty DataFrame when fetching or
        parsing fails (the error is printed).
    """
    try:
        # read_html returns a list of frames; only the first match is kept
        matching_tables = pd.read_html(url, match=matching_word)
        df_html = matching_tables[0]
    except Exception as e:
        # Best-effort extraction: report the failure rather than silently
        # handing back an empty frame.
        print(f"An error occurred: {e}")
        df_html = pd.DataFrame()
    return df_html

In [14]:
# Group all data sources
def extracted_data():
    """Run every extractor and return the resulting DataFrames.

    Each source location and database setting can be overridden through an
    ``ETL_*`` environment variable; when a variable is unset the value that
    was previously hard-coded here is used, so existing runs are unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_parquet, df_csv, df_api, df_table, df_html)``, in that order.
    """
    setting = os.environ.get

    # File and network sources
    parquet_file = setting(
        "ETL_PARQUET_FILE",
        "/Users/joe/Desktop/ETL Pipelines/yellow_tripdata_2022-01.parquet",
    )
    csv_file = setting(
        "ETL_CSV_FILE",
        "/Users/joe/Desktop/ETL Pipelines/Newyork_carcrashdata.csv",
    )
    api_endpoint = setting(
        "ETL_API_ENDPOINT",
        "https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500",
    )
    url = setting(
        "ETL_HTML_URL",
        "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
    )
    matching_word = setting("ETL_HTML_MATCH", "by country")

    # Database connection settings
    db_name = setting("ETL_DB_NAME", "analysis_sql")
    table_name = setting("ETL_DB_TABLE", "companies_for_sale")
    user = setting("ETL_DB_USER", 'postgres')
    # NOTE(review): the fallback password is kept only for backward
    # compatibility; set ETL_DB_PASSWORD, then remove the literal and rotate it.
    password = setting("ETL_DB_PASSWORD", '3884')
    host = setting("ETL_DB_HOST", 'localhost')
    port = setting("ETL_DB_PORT", '5432')

    # Extractors run in the same order as the returned tuple
    df_parquet = source_data_from_parquet(parquet_file)
    df_csv = source_data_from_csv(csv_file)
    df_api = source_data_from_api(api_endpoint)
    df_table = source_data_from_table(db_name, user, password, host, port, table_name)
    df_html = source_data_from_html(url, matching_word)
    return df_parquet, df_csv, df_api, df_table, df_html

In [20]:
# Run the whole extraction step; the five frames come back in a fixed order
(df_parquet,
 df_csv,
 df_api,
 df_table,
 df_html) = extracted_data()

# Preview the first rows of the API extract
df_api.head(5)


Unnamed: 0,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,cross_street_name,location.latitude,location.longitude,location.human_address,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
1,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
2,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
3,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,1211 LORING AVENUE,40.667202,-73.8665,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",,,,,,
4,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,,40.683304,-73.917274,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",,,,,,
