In [11]:
import json
import os
from urllib.parse import quote

import certifi
import pandas as pd
import psycopg2
import urllib3
from sqlalchemy import create_engine
from urllib3 import request



In [12]:
#Extracting data from csv file
def source_data_from_csv(csv_file):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    csv_file : path or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, or an empty DataFrame if reading failed.
    """
    try:
        df_csv = pd.read_csv(csv_file)
    except Exception as e:
        # Best-effort extraction: report the failure (same message format as
        # the other extractors) and hand back an empty frame instead of raising.
        print(f"An error occurred: {e}")
        df_csv = pd.DataFrame()
    return df_csv
    


    

In [13]:
#Extracting data from parquet file
def source_data_from_parquet(parquet_file):
    """Load a parquet file into a DataFrame.

    Parameters
    ----------
    parquet_file : path or file-like
        Passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, or an empty DataFrame if reading failed.
    """
    try:
        df_parquet = pd.read_parquet(parquet_file)
    except Exception as e:
        # Best-effort extraction: report the failure (same message format as
        # the other extractors) and hand back an empty frame instead of raising.
        print(f"An error occurred: {e}")
        df_parquet = pd.DataFrame()
    return df_parquet

In [14]:
# Extract data from the New York City open-data API
def source_data_from_api(api_endpoint):
    """Fetch JSON from an HTTP endpoint and flatten it into a DataFrame.

    Parameters
    ----------
    api_endpoint : str
        URL requested with a GET.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of the decoded JSON body when the response
        status is 200; otherwise (non-200 status or any exception) an empty
        DataFrame, with a message printed describing the failure.
    """
    try:
        # Pool manager that verifies TLS certificates against certifi's CA bundle
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        api_response = http.request('GET', api_endpoint)
        api_status = api_response.status

        if api_status == 200:
            # Body is decoded as UTF-8 JSON and flattened into columns
            data = json.loads(api_response.data.decode('utf-8'))
            df_api = pd.json_normalize(data)
        else:
            # Surface the status so an empty result is not mistaken for "no data"
            print(f"An error occurred: API request returned HTTP status {api_status}")
            df_api = pd.DataFrame()
    except Exception as e:
        print(f"An error occurred: {e}")
        df_api = pd.DataFrame()

    return df_api


In [15]:
# Extract data from a PostgreSQL table
def source_data_from_table(db_name, user, password, host, port, table_name):
    """Read an entire PostgreSQL table into a DataFrame.

    Parameters
    ----------
    db_name, host, port :
        Interpolated as-is into the ``postgresql://`` connection URL.
    user, password :
        Raw (not pre-encoded) credentials; they are percent-encoded here
        before being placed in the URL.
    table_name : str
        Table to select from. Interpolated directly into the SQL text, so it
        must come from a trusted source.

    Returns
    -------
    pandas.DataFrame
        All rows of the table, or an empty DataFrame if anything failed
        (the error is printed).
    """
    engine = None
    try:
        # Percent-encode credentials so characters such as '@', ':' or '/'
        # in a password cannot corrupt the URL structure.
        safe_user = quote(str(user), safe="")
        safe_password = quote(str(password), safe="")
        db_url = f'postgresql://{safe_user}:{safe_password}@{host}:{port}/{db_name}'

        # Create an SQLAlchemy engine
        engine = create_engine(db_url)

        # NOTE(review): table_name is string-formatted into the query; do not
        # pass untrusted input here.
        df_table = pd.read_sql(f"SELECT * FROM {table_name}", engine)
    except Exception as e:
        print(f"An error occurred: {e}")
        df_table = pd.DataFrame()
    finally:
        # Release pooled connections; the engine is not reused after this call.
        if engine is not None:
            engine.dispose()

    return df_table

In [16]:
# Extract a table from a web page (e.g. a Wikipedia article)
def source_data_from_html(url,matching_word):
    """Return the first HTML table at ``url`` that matches ``matching_word``.

    Parameters
    ----------
    url : str
        Passed to ``pd.read_html`` as the page to parse.
    matching_word : str
        Passed to ``pd.read_html`` as ``match`` to select tables.

    Returns
    -------
    pandas.DataFrame
        The first table returned by ``pd.read_html``, or an empty DataFrame
        if parsing failed (the error is printed).
    """
    try:
        tables = pd.read_html(url, match=matching_word)
        df_html = tables[0]
    except Exception as e:
        # Best-effort extraction: report the failure (same message format as
        # the other extractors) and hand back an empty frame instead of raising.
        print(f"An error occurred: {e}")
        df_html = pd.DataFrame()
    return df_html

In [17]:
#grouping all data sources
def extracted_data():
    """Run every extractor and return the five resulting DataFrames.

    Locations and database settings default to the values this notebook was
    written with, and can be overridden through environment variables:

    - ``ETL_DATA_DIR``: directory holding the parquet and CSV files
    - ``ETL_DB_NAME``, ``ETL_DB_USER``, ``ETL_DB_PASSWORD``, ``ETL_DB_HOST``,
      ``ETL_DB_PORT``: PostgreSQL connection settings

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_parquet, df_csv, df_api, df_table, df_html)``. Each extractor
        returns an empty DataFrame on failure, so individual entries may be
        empty.
    """
    data_dir = os.environ.get(
        "ETL_DATA_DIR",
        "/Users/joe/Desktop/ETL Pipelines/data/data for extraction",
    )
    parquet_file = os.path.join(data_dir, "yellow_tripdata_2022-01.parquet")
    csv_file = os.path.join(data_dir, "Newyork_carcrashdata.csv")

    api_endpoint = "https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500"

    db_name = os.environ.get("ETL_DB_NAME", "analysis_sql")
    table_name = "companies_for_sale"
    user = os.environ.get("ETL_DB_USER", 'postgres')
    # NOTE(review): the literal fallback only keeps existing runs working;
    # set ETL_DB_PASSWORD and remove the default so the secret is not in the notebook.
    password = os.environ.get("ETL_DB_PASSWORD", '3884')
    host = os.environ.get("ETL_DB_HOST", 'localhost')
    port = os.environ.get("ETL_DB_PORT", '5432')

    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
    matching_word = "by country"

    df_parquet = source_data_from_parquet(parquet_file)
    df_csv = source_data_from_csv(csv_file)
    df_api = source_data_from_api(api_endpoint)
    df_table = source_data_from_table(db_name, user, password, host, port, table_name)
    df_html = source_data_from_html(url, matching_word)
    return df_parquet, df_csv, df_api, df_table, df_html

In [19]:
# Run every extractor and unpack the five resulting DataFrames
# (order: parquet, CSV, API, database table, HTML).
df_parquet, df_csv, df_api, df_table, df_html = extracted_data()

# Preview the first five rows of the parquet DataFrame
df_parquet.head(5)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0
