# This script contains the following: 
## 1. Import libraries and data
## 2. Import weather data
## 3. Wrangle weather data
## 4. Merge data

### 1. Import libraries and data

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

In [2]:
# Create path
folderpath = r'/Users/ianfleming/Documents/2022-citibike-tripdata'

In [3]:
print(os.path.exists(folderpath))

True


In [4]:
# Create a list with all files in the folder
filepaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]
filepaths[:3]  # to check

['/Users/ianfleming/Documents/2022-citibike-tripdata/202209-citibike-tripdata.zip',
 '/Users/ianfleming/Documents/2022-citibike-tripdata/202201-citibike-tripdata.zip',
 '/Users/ianfleming/Documents/2022-citibike-tripdata/.DS_Store']

I had two options- either unzip each months file manually in the folder or import zipfile library and unzip them in Jupyter. I chose the later for efficiently and new learning. 

In [5]:
import zipfile

In [6]:
# Loop through all zip files in the folder and extract them
for file in os.listdir(folderpath):
    if file.endswith(".zip"):
        file_path = os.path.join(folderpath, file)
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(folderpath)

In [7]:
print("All .zip files have been extracted successfully!")

All .zip files have been extracted successfully!


In [8]:
# List CSVs to double-check
csv_files = [name for name in os.listdir(folderpath) if name.endswith(".csv")]
print("CSV files found:\n", csv_files[:5], "...")


CSV files found:
 ['202208-citibike-tripdata_3.csv', '202207-citibike-tripdata_2.csv', '202207-citibike-tripdata_3.csv', '202208-citibike-tripdata_2.csv', '202207-citibike-tripdata_1.csv'] ...


In [9]:
# Recreate the filepaths list for the extracted CSVs
filepaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath) if name.endswith(".csv")]
filepaths[:5]  # quick check

['/Users/ianfleming/Documents/2022-citibike-tripdata/202208-citibike-tripdata_3.csv',
 '/Users/ianfleming/Documents/2022-citibike-tripdata/202207-citibike-tripdata_2.csv',
 '/Users/ianfleming/Documents/2022-citibike-tripdata/202207-citibike-tripdata_3.csv',
 '/Users/ianfleming/Documents/2022-citibike-tripdata/202208-citibike-tripdata_2.csv',
 '/Users/ianfleming/Documents/2022-citibike-tripdata/202207-citibike-tripdata_1.csv']

In [11]:
# Create dataframe
df_list = [pd.read_csv(file, low_memory=False) for file in filepaths]


In [12]:
# Create dataframe
citibike_2022 = pd.concat(df_list, ignore_index=True)


In [13]:
# Check dataframe
citibike_2022.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,9D0DC440CB40CF8E,electric_bike,2022-08-27 13:56:47.728,2022-08-27 14:02:56.651,Flatbush Ave & Ocean Ave,3704.04,3 St & Prospect Park West,3865.05,40.663657,-73.963014,40.668132,-73.973638,casual
1,2214991DFBE5C4D7,electric_bike,2022-08-20 10:37:02.756,2022-08-20 10:45:56.631,Forsyth St\t& Grand St,5382.07,E 11 St & 1 Ave,5746.14,40.717798,-73.993161,40.729538,-73.984267,casual
2,20C5D469563B6337,classic_bike,2022-08-31 18:55:03.051,2022-08-31 19:03:37.344,Perry St & Bleecker St,5922.07,Grand St & Greene St,5500.02,40.735354,-74.004831,40.7217,-74.002381,member
3,3E8791885BC189D1,classic_bike,2022-08-02 08:05:00.250,2022-08-02 08:16:52.063,FDR Drive & E 35 St,6230.04,Grand Army Plaza & Central Park S,6839.1,40.744219,-73.971212,40.764397,-73.973715,member
4,8DBCBF98885106CB,electric_bike,2022-08-25 15:44:48.386,2022-08-25 15:55:39.691,E 40 St & 5 Ave,6474.11,Ave A & E 14 St,5779.11,40.752052,-73.982115,40.730311,-73.980472,member


In [14]:
citibike_2022.tail()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
29838801,1F223EDAFF420AE3,electric_bike,2022-12-01 20:26:45.847,2022-12-01 20:30:46.012,Avenue D & E 3 St,5436.09,Stanton St & Chrystie St,5523.02,40.720701,-73.977939,40.722293,-73.991475,member
29838802,CFA5C560ACB73B8E,classic_bike,2022-12-26 13:46:34.237,2022-12-26 13:52:43.900,43 Ave & 47 St,6209.05,39 Ave & 45 St,6401.03,40.744806,-73.91729,40.749478,-73.918265,member
29838803,11C8C5E0DB947B07,classic_bike,2022-12-01 05:56:14.903,2022-12-01 06:06:10.357,Avenue D & E 3 St,5436.09,Bleecker St & Crosby St,5679.08,40.720828,-73.977932,40.726156,-73.995102,member
29838804,5B9B083C534A5964,classic_bike,2022-12-02 11:54:15.871,2022-12-02 12:01:00.747,Montague St & Clinton St,4677.06,Sands St & Jay St,4821.03,40.694271,-73.992327,40.700119,-73.9862,member
29838805,91C286C462F89A50,classic_bike,2022-12-18 13:35:22.574,2022-12-18 13:37:27.193,Montague St & Clinton St,4677.06,Cadman Plaza E & Tillary St,4677.01,40.694271,-73.992327,40.695977,-73.990149,member


In [15]:
# Check shape
citibike_2022.shape


(29838806, 13)

In [16]:
# Check info
citibike_2022.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29838806 entries, 0 to 29838805
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 2.9+ GB


In this step, I unzipped all the files using the 'zipfile' library so they were CSVs. Then I created a list of all file paths in the data folder using the os library. Each file was then read into a pandas DataFrame using pd.read_csv() within a list comprehension, which provides a concise and efficient way to import multiple files at once.
Finally, I used pd.concat() with ignore_index=True to join all individual DataFrames together into one continuous dataset representing the full year of 2022.

### 2. Import weather data

In [17]:
# Import token
NOAA_TOKEN = 'XkPArTbiwSMQXqLdXkhbbhxRDjXDSxxK'

In [18]:
# Compile URL
url = (
    "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?"
    "datasetid=GHCND&"
    "datatypeid=TAVG&"
    "limit=1000&"
    "stationid=GHCND:USW00014732&"
    "startdate=2022-01-01&"
    "enddate=2022-12-31"
)

I told the API to only return the average temperature data

In [20]:
r = requests.get(url, headers={"token": NOAA_TOKEN})


In [21]:
# Store data in JSON format
d = json.loads(r.text)


### 3. Wrangle weather data

In [22]:
dates = [item["date"][:10] for item in d["results"]]

In [23]:
temps = [item['value'] for item in d['results']]

In [24]:
# Create weather dataframe
weather = pd.DataFrame({'date': dates, 'avg_temp': temps})

In [25]:
# Check what temperature values look like
[item['value'] for item in d['results'][:5]]


[116, 114, 14, -27, 32]

I am double checking the temps because I am unsure if they are fahrenheit or celsius or if they are stored in tenths of degrees celsius

In [26]:
# Check min and max
weather['avg_temp'].min(), weather['avg_temp'].max()


(-117, 313)

Yes, they are definitely in tenths of degrees celsius

In [27]:
# Divide by ten
weather['avg_temp'] = weather['avg_temp'] / 10

In [28]:
# Check min and max again
weather['avg_temp'].min(), weather['avg_temp'].max()

(-11.7, 31.3)

In [29]:
# Check dataframe
weather.head()

Unnamed: 0,date,avg_temp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4
3,2022-01-04,-2.7
4,2022-01-05,3.2


### 4. Merge data

'Weather' dataframe has 'date' column and 'citibike_2022' has 'started_at' column, i will change 'citibike_2022' to 'date' to allign with 'weather' and merge using that

In [34]:
citibike_2022['date'] = pd.to_datetime(citibike_2022['started_at']).dt.date

In [35]:
weather['date'] = pd.to_datetime(weather['date']).dt.date

In [36]:
# Merge dataframes
merged_data = pd.merge(citibike_2022, weather, on='date', how='left')


In [37]:
# Check merge
merged_data[['date', 'avg_temp']].head()

Unnamed: 0,date,avg_temp
0,2022-08-27,27.8
1,2022-08-20,27.9
2,2022-08-31,25.6
3,2022-08-02,26.4
4,2022-08-25,28.1


In [38]:
# Check merge
merged_data['avg_temp'].isna().sum()


np.int64(640)

There are 640 NaNs 

In [39]:
# Export merged_data dataframe
merged_data.to_csv("citibike_weather_merged_2022.csv", index=False)
