# E2 Sourcing data with an API
---

### Import and concatenate

In [4]:
#Import Libraries
import json
import os
from datetime import datetime
from urllib.parse import urlencode

import chardet
import numpy as np
import pandas as pd
import requests

In [6]:
# Create a sorted list of all CSV files in the folder using a list comprehension

folderpath = r"C:\Users\north\OneDrive\Dokumente\Career Foundry\Data Visualization 2\Citi-Bike_Bike-Sharing\Data\2022-citibike-tripdata"
# Keep only *.csv entries so stray files in the folder are never handed to pd.read_csv,
# and sort the names because os.listdir() returns entries in arbitrary order.
filepaths = [
    os.path.join(folderpath, name)
    for name in sorted(os.listdir(folderpath))
    if name.lower().endswith('.csv')
]

In [8]:
# Show the collected paths to verify the folder listing
filepaths

['C:\\Users\\north\\OneDrive\\Dokumente\\Career Foundry\\Data Visualization 2\\Citi-Bike_Bike-Sharing\\Data\\2022-citibike-tripdata\\202201-citibike-tripdata_1.csv',
 'C:\\Users\\north\\OneDrive\\Dokumente\\Career Foundry\\Data Visualization 2\\Citi-Bike_Bike-Sharing\\Data\\2022-citibike-tripdata\\202201-citibike-tripdata_2.csv',
 'C:\\Users\\north\\OneDrive\\Dokumente\\Career Foundry\\Data Visualization 2\\Citi-Bike_Bike-Sharing\\Data\\2022-citibike-tripdata\\202202-citibike-tripdata_1.csv',
 'C:\\Users\\north\\OneDrive\\Dokumente\\Career Foundry\\Data Visualization 2\\Citi-Bike_Bike-Sharing\\Data\\2022-citibike-tripdata\\202202-citibike-tripdata_2.csv',
 'C:\\Users\\north\\OneDrive\\Dokumente\\Career Foundry\\Data Visualization 2\\Citi-Bike_Bike-Sharing\\Data\\2022-citibike-tripdata\\202203-citibike-tripdata_1.csv',
 'C:\\Users\\north\\OneDrive\\Dokumente\\Career Foundry\\Data Visualization 2\\Citi-Bike_Bike-Sharing\\Data\\2022-citibike-tripdata\\202203-citibike-tripdata_2.csv',
 'C:

In [12]:
# Detect the encoding of the first file from a bounded sample of its bytes.
# Reading only a sample avoids loading the entire CSV into memory just to sniff it.
# The result describes this sample only, not the rest of the file or the other files.
SAMPLE_BYTES = 1_000_000
with open(filepaths[0], 'rb') as f:
    result = chardet.detect(f.read(SAMPLE_BYTES))
print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [None]:
# Read every file into its own DataFrame, then concatenate once at the end.
# - UTF-8 decodes plain ASCII unchanged and also preserves non-ASCII characters;
#   encoding_errors='replace' still tolerates bytes that are not valid UTF-8.
# - on_bad_lines='warn' still drops malformed rows, but reports them instead of
#   discarding them silently.
if not filepaths:
    raise FileNotFoundError("No input files to load: `filepaths` is empty.")

dfs = [
    pd.read_csv(
        path,
        encoding='utf-8',
        encoding_errors='replace',
        low_memory=False,
        on_bad_lines='warn',
    )
    for path in filepaths
]

# ignore_index=True gives the combined frame one continuous 0..n-1 index
df = pd.concat(dfs, ignore_index=True)



In [None]:
# (rows, columns) of the combined DataFrame
df.shape

In [None]:
# First rows of the combined DataFrame
df.head()

In [None]:
# Last rows of the combined DataFrame
df.tail()

### Explanation:
#### Step 1

- The `os.listdir(folderpath)` function retrieves the list of all file names in the specified folder.
- The list comprehension `[os.path.join(folderpath, name) for name in os.listdir(folderpath)]` loops through these file names and combines them with the folder path using `os.path.join()`, creating full file paths.
- The result is a list, `filepaths`, that contains the paths to all CSV files in the folder. This allows easy iteration over the files for further processing.


#### Step 2

#### Reading the files:
- Each path in `filepaths` is read into a pandas DataFrame with `pd.read_csv()`, and the resulting DataFrames are collected in the list `dfs`.
- The `low_memory=False` parameter makes `pandas` parse each file in a single pass instead of in internal chunks, so every column gets one consistently inferred dtype and no `DtypeWarning` is raised for columns with mixed data types.
- The `on_bad_lines` parameter controls how malformed rows are handled; here they are dropped instead of aborting the import.
- All DataFrames stay in memory until they are concatenated, so this step needs enough RAM to hold the complete year of trip data.

#### Combining DataFrames:
- The `pd.concat()` function combines all the DataFrames collected in `dfs` into a single DataFrame, `df`.
- By default, `pd.concat()` stacks the data vertically (row-wise) and aligns columns by name, so all files are expected to share the same column structure.

#### Resetting Index:
- The `ignore_index=True` parameter ensures that the resulting DataFrame's row indices are reset and continuous, rather than retaining the original indices from individual files.
---


### Get weather data using NOAA's API

In [None]:
# Define NOAA token
# The token is read from the environment so the credential is never stored in the notebook.
# NOTE(review): the token that was previously hard-coded here should be treated as leaked
# and regenerated.
Token = os.environ.get('NOAA_TOKEN')
if not Token:
    raise RuntimeError(
        "NOAA API token not found: set the NOAA_TOKEN environment variable before running this notebook."
    )

In [None]:
# Specify the parameters
base_url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data?'
params = {
    "datasetid": "GHCND",           # Global Historical Climatology Network Daily
    "datatypeid": "TAVG",           # Average temperature
    "limit": 1000,                  # Maximum items per request
    "stationid": "GHCND:USW00014732", # Station ID for LaGuardia Airport
    "startdate": "2022-01-01",      # Start date
    "enddate": "2022-12-31"         # End date
}

# Construct the API request URL directly from `params`, so a parameter added to or
# changed in the dict can never be missing from the URL. Values are percent-encoded;
# ':' is kept literal so the station id appears as written.
request_url = base_url + urlencode(params, safe=':')

# Print the request URL for verification
print(f"Request URL: {request_url}")

In [None]:
# Make the API request; the timeout stops a stalled connection from hanging the kernel
r = requests.get(request_url, headers={'token': Token}, timeout=30)

# Fail here with the HTTP status if the request was rejected, instead of
# surfacing later as a JSON decoding error or a missing 'results' key
r.raise_for_status()

# Load the API response as JSON
d = r.json()


In [None]:
# Preview the payload instead of dumping every returned record into the notebook
{'metadata': d.get('metadata'), 'first_results': d.get('results', [])[:3]}

In [None]:
# Keep only the records whose datatype is average temperature (TAVG)
avg_temps = list(filter(lambda record: record['datatype'] == 'TAVG', d['results']))

# Split the kept records into two parallel lists: observation dates and raw values
dates_temp = [record['date'] for record in avg_temps]
temps = [record['value'] for record in avg_temps]


In [None]:
# Number of readings plus a short preview, instead of printing the full list
len(temps), temps[:10]

In [None]:
# Scale each raw reading from tenths of a degree Celsius to degrees Celsius
temps_celsius = list(map(lambda tenths: tenths / 10.0, temps))

# Parse each timestamp string and keep only its calendar date (`datetime.date`)
dates = list(
    map(
        lambda stamp: datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S").date(),
        dates_temp,
    )
)

In [None]:
# Assemble the two parallel lists into a two-column weather DataFrame
temp_columns = {"date": dates, "avg_temp": temps_celsius}
df_temps = pd.DataFrame(temp_columns)

In [None]:
# Inspect the first and last rows of the resulting DataFrame.
# display() renders each frame as a table in the notebook instead of plain text.
display(df_temps.head(10), df_temps.tail(10))

In [None]:
 # Column dtypes of the trip data before the date/time conversions below
 df.dtypes

In [None]:
# Column dtypes of the weather data before the date conversion below
df_temps.dtypes

In [None]:
# Convert the shared merge key to datetime in both frames so their dtypes match.
# NOTE(review): assumes df already has a 'date' column — confirm against the CSV schema.
for frame in (df, df_temps):
    frame['date'] = pd.to_datetime(frame['date'], dayfirst=True)


In [None]:
# Parse the trip start and end dates as datetimes, reading ambiguous values day-first
for date_col in ('start_date', 'end_date'):
    df[date_col] = pd.to_datetime(df[date_col], dayfirst=True)


In [None]:
# Parse the clock-time columns and keep only the time-of-day component.
# errors='coerce' turns values that do not match the format into missing values.
for time_col in ('start_time', 'end_time'):
    parsed = pd.to_datetime(df[time_col], format='%H:%M:%S.%f', errors='coerce')
    df[time_col] = parsed.dt.time


In [None]:
# Re-check the trip-data dtypes after the conversions
df.dtypes

In [None]:
# Re-check the weather-data dtypes after the conversion
df_temps.dtypes

In [None]:
# First rows of the weather data
df_temps.head()

In [None]:
# First rows of the trip data after the conversions
df.head()

In [None]:
# Export df_temps to a CSV file in the Data folder, i.e. the parent of the trip-data
# folder. Deriving the location from `folderpath` keeps the user-specific absolute
# path in one place instead of repeating it per export.
data_dir = os.path.dirname(os.path.normpath(folderpath))
output_path = os.path.join(data_dir, 'df_temps.csv')
df_temps.to_csv(output_path, index=False)

print(f"DataFrame exported to '{output_path}'.")

In [None]:
%%time
df_merged = df.merge(df_temps, how='left', on='date', indicator=True)

In [None]:
# First rows of the merged trip + weather data
df_merged.head()

In [None]:
# Count rows per merge outcome; 'left_only' rows are trips without a matching weather record
df_merged['_merge'].value_counts(dropna = False)

In [None]:
# Export the merged data to CSV in the Data folder, i.e. the parent of the trip-data
# folder. Deriving the location from `folderpath` keeps the user-specific absolute
# path in one place instead of repeating it per export.
data_dir = os.path.dirname(os.path.normpath(folderpath))
output_path = os.path.join(data_dir, 'merged_citibike_weather.csv')
df_merged.to_csv(output_path, index=False)

print(f"Merged DataFrame exported to '{output_path}'.")