In [9]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation warnings;
# consider narrowing it to the specific category that is noisy.
warnings.filterwarnings('ignore') 

pickup_dt: Date and time of the data point.

borough: The borough in New York City (e.g., Bronx, Brooklyn, Manhattan, etc.).

pickups: The number of pickups recorded for that hour in the borough.

spd: Wind speed (inferred from the surrounding weather columns; confirm the units against the dataset documentation).

vsb: Visibility (inferred to be a weather measurement; confirm the units against the dataset documentation).

temp: Temperature.

dewp: Dew point.

slp: Sea level pressure.

pcp01: Precipitation in the last hour.

pcp06: Precipitation in the last 6 hours.

pcp24: Precipitation in the last 24 hours.

sd: Snow depth.

hday: Holiday flag — Y if the date is a holiday, N otherwise.

In [10]:
# Read the enriched Uber NYC pickups CSV into a Pandas DataFrame and display it
source_csv = "Resources/uber_nyc_enriched.csv"
df = pd.read_csv(source_csv)
df

Unnamed: 0,pickup_dt,borough,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd,hday
0,2015-01-01 01:00:00,Bronx,152,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
1,2015-01-01 01:00:00,Brooklyn,1519,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
2,2015-01-01 01:00:00,EWR,0,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
3,2015-01-01 01:00:00,Manhattan,5258,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
4,2015-01-01 01:00:00,Queens,405,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29096,2015-06-30 23:00:00,EWR,0,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29097,2015-06-30 23:00:00,Manhattan,3828,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29098,2015-06-30 23:00:00,Queens,580,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29099,2015-06-30 23:00:00,Staten Island,0,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N


In [11]:
# Summarise the frame: one line per column with its non-null count and dtype,
# which makes columns containing missing values easy to spot
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29101 entries, 0 to 29100
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pickup_dt  29101 non-null  object 
 1   borough    26058 non-null  object 
 2   pickups    29101 non-null  int64  
 3   spd        29101 non-null  float64
 4   vsb        29101 non-null  float64
 5   temp       29101 non-null  float64
 6   dewp       29101 non-null  float64
 7   slp        29101 non-null  float64
 8   pcp01      29101 non-null  float64
 9   pcp06      29101 non-null  float64
 10  pcp24      29101 non-null  float64
 11  sd         29101 non-null  float64
 12  hday       29101 non-null  object 
dtypes: float64(9), int64(1), object(3)
memory usage: 2.9+ MB


In [12]:
# List the distinct values in the "borough" column (NaN is reported as a value too)
df['borough'].unique()

array(['Bronx', 'Brooklyn', 'EWR', 'Manhattan', 'Queens', 'Staten Island',
       nan], dtype=object)

In [13]:
# Drop the rows whose "borough" is missing. Restricting dropna to that column
# keeps the cell true to its stated purpose: without `subset`, a NaN appearing
# in any other column would silently remove those rows as well.
df = df.dropna(subset=['borough'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26058 entries, 0 to 29099
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pickup_dt  26058 non-null  object 
 1   borough    26058 non-null  object 
 2   pickups    26058 non-null  int64  
 3   spd        26058 non-null  float64
 4   vsb        26058 non-null  float64
 5   temp       26058 non-null  float64
 6   dewp       26058 non-null  float64
 7   slp        26058 non-null  float64
 8   pcp01      26058 non-null  float64
 9   pcp06      26058 non-null  float64
 10  pcp24      26058 non-null  float64
 11  sd         26058 non-null  float64
 12  hday       26058 non-null  object 
dtypes: float64(9), int64(1), object(3)
memory usage: 2.8+ MB


In [14]:
# Re-list the distinct "borough" values to confirm NaN no longer appears after the drop
df['borough'].unique()

array(['Bronx', 'Brooklyn', 'EWR', 'Manhattan', 'Queens', 'Staten Island'],
      dtype=object)

In [19]:
# Check for duplicated data: count the rows recorded for each "pickup_dt"
# timestamp and list the distinct counts that occur.
# NOTE(review): a single count only shows that every timestamp has the same
# number of rows; it does not by itself prove that each (pickup_dt, borough)
# pair is unique.
dublicated_data = df.groupby('pickup_dt')['borough'].size().unique()
dublicated_data

array([6], dtype=int64)

In [15]:
# Change the type of the "pickup_dt" column from "object" to "datetime64[ns]".
# pd.to_datetime is used instead of .astype('datetime64') because pandas 2.x
# rejects a cast to the unit-less 'datetime64' dtype with a TypeError;
# to_datetime parses the timestamp strings on both old and new pandas.
df["pickup_dt"] = pd.to_datetime(df["pickup_dt"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26058 entries, 0 to 29099
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   pickup_dt  26058 non-null  datetime64[ns]
 1   borough    26058 non-null  object        
 2   pickups    26058 non-null  int64         
 3   spd        26058 non-null  float64       
 4   vsb        26058 non-null  float64       
 5   temp       26058 non-null  float64       
 6   dewp       26058 non-null  float64       
 7   slp        26058 non-null  float64       
 8   pcp01      26058 non-null  float64       
 9   pcp06      26058 non-null  float64       
 10  pcp24      26058 non-null  float64       
 11  sd         26058 non-null  float64       
 12  hday       26058 non-null  object        
dtypes: datetime64[ns](1), float64(9), int64(1), object(2)
memory usage: 2.8+ MB


In [21]:
# Build the cleaned frame that keeps the combined "pickup_dt" timestamp,
# then write it to Resources/clean1.csv without the index column
timestamp_columns = [
    'pickup_dt', 'borough', 'pickups', 'spd', 'vsb', 'temp', 'dewp',
    'slp', 'pcp01', 'pcp06', 'pcp24', 'sd', 'hday',
]
clean_df = df[timestamp_columns]
clean_df.to_csv("Resources/clean1.csv", index=False)

In [17]:
# Split the timestamp into a calendar-date column and a time-of-day column
pickup_parts = df["pickup_dt"].dt
df["pickup_date"] = pickup_parts.date
df["pickup_time"] = pickup_parts.time

# Assemble the cleaned frame with the split date/time columns in place of "pickup_dt"
split_columns = [
    'pickup_date', 'pickup_time', 'borough', 'pickups', 'spd', 'vsb', 'temp',
    'dewp', 'slp', 'pcp01', 'pcp06', 'pcp24', 'sd', 'hday',
]
clean_df = df[split_columns]
clean_df

Unnamed: 0,pickup_date,pickup_time,borough,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd,hday
0,2015-01-01,01:00:00,Bronx,152,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
1,2015-01-01,01:00:00,Brooklyn,1519,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
2,2015-01-01,01:00:00,EWR,0,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
3,2015-01-01,01:00:00,Manhattan,5258,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
4,2015-01-01,01:00:00,Queens,405,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29095,2015-06-30,23:00:00,Brooklyn,990,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29096,2015-06-30,23:00:00,EWR,0,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29097,2015-06-30,23:00:00,Manhattan,3828,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29098,2015-06-30,23:00:00,Queens,580,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N


In [18]:
# Save the clean Data Frame to Resources/clean.csv (index column omitted)
# so it can be reused when working on other questions.
clean_df.to_csv("Resources/clean.csv", index = False)