In [1]:
# Load libraries
import pandas as pd
import os

In [2]:
# Collect the raw trip-data CSV files from the current working directory.
#
# The cleaned export written by the last cell of this notebook also ends in
# '.csv' and is saved to this same directory, so it is excluded here; otherwise
# a Restart & Run All would read the previous run's output back in as input.
# os.listdir() returns names in arbitrary order, so the list is sorted to keep
# the row order of the combined frame reproducible.
CLEANED_OUTPUT = 'cleaned_data_Citibike.csv'

all_files = sorted(
    file
    for file in os.listdir()
    if file.endswith('.csv') and file != CLEANED_OUTPUT
)
print(all_files)

['202201-citibike-tripdata.csv', '202208-citibike-tripdata.csv']


In [3]:
# Read every file in `all_files` and stack the results into one DataFrame.
#
# The station-id columns are read as text explicitly. Left to type inference,
# they come back as `object` even though the values look numeric, and the saved
# output of this cell shows a warning pointing at the read_csv line — presumably
# the mixed-type DtypeWarning (TODO confirm against the raw files). Pinning the
# dtype gives every file the same representation for these identifiers, so the
# later duplicate check compares like with like.
STATION_ID_DTYPES = {'start_station_id': str, 'end_station_id': str}

list_of_dataframes = []

for file in all_files:
    print(f"Reading file: {file}")  # Confirm which files are being read
    df = pd.read_csv(file, dtype=STATION_ID_DTYPES)
    list_of_dataframes.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all files.
combined_df = pd.concat(list_of_dataframes, axis=0, ignore_index=True)

Reading file: 202201-citibike-tripdata.csv


  df = pd.read_csv(file)


Reading file: 202208-citibike-tripdata.csv


  df = pd.read_csv(file)


In [4]:
# Inspect the combined frame: number of rows, column names, dtypes and memory usage.

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4733727 entries, 0 to 4733726
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 469.5+ MB


In [5]:
# Preview the first five rows to sanity-check how the columns were parsed.
combined_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,55262E4365A955A2,classic_bike,2022-01-18 08:23:52,2022-01-18 08:28:18,Boerum Pl\t& Pacific St,4488.09,Clinton St & Joralemon St,4605.04,40.688489,-73.99116,40.692395,-73.993379,member
1,D272F1B15D841EC0,classic_bike,2022-01-21 09:03:22,2022-01-21 09:05:44,E 12 St & Ave C,5616.08,E 10 St & Avenue A,5659.05,40.727243,-73.976831,40.727408,-73.98142,member
2,D1FCEF55EB4A807F,classic_bike,2022-01-22 14:28:32,2022-01-22 14:53:18,W 21 St & 6 Ave,6140.05,W 44 St & 11 Ave,6756.05,40.74174,-73.994156,40.762009,-73.996975,member
3,E9CBDC6A0162C068,electric_bike,2022-01-19 14:49:47,2022-01-19 14:54:02,38 St & 30 Ave,6850.01,Crescent St & 30 Ave,6958.06,40.764175,-73.91584,40.768692,-73.924957,member
4,2177A5B57326CE9B,electric_bike,2022-01-16 14:36:06,2022-01-16 14:44:06,Pacific St & Nevins St,4362.04,Clinton St & Tillary St,4748.07,40.685376,-73.983021,40.696233,-73.991421,member


In [6]:
# Clean the combined frame in a single chain and rebind the result:
#   1. parse the two timestamp columns into datetimes,
#   2. store the bike-type and rider-type labels as categoricals,
#   3. discard rows containing any missing value,
#   4. discard rows that are exact duplicates of an earlier row.
combined_df = (
    combined_df
    .assign(
        started_at=pd.to_datetime(combined_df['started_at']),
        ended_at=pd.to_datetime(combined_df['ended_at']),
    )
    .astype({'rideable_type': 'category', 'member_casual': 'category'})
    .dropna()
    .drop_duplicates()
)

In [7]:
# Re-inspect row count and dtypes after the type conversions and row drops.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4714091 entries, 0 to 4733726
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       category      
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       category      
dtypes: category(2), datetime64[ns](2), float64(4), object(5)
memory usage: 440.6+ MB


In [8]:
# Display the cleaned frame; pandas truncates the rendering to the first and last rows.
combined_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,55262E4365A955A2,classic_bike,2022-01-18 08:23:52,2022-01-18 08:28:18,Boerum Pl\t& Pacific St,4488.09,Clinton St & Joralemon St,4605.04,40.688489,-73.991160,40.692395,-73.993379,member
1,D272F1B15D841EC0,classic_bike,2022-01-21 09:03:22,2022-01-21 09:05:44,E 12 St & Ave C,5616.08,E 10 St & Avenue A,5659.05,40.727243,-73.976831,40.727408,-73.981420,member
2,D1FCEF55EB4A807F,classic_bike,2022-01-22 14:28:32,2022-01-22 14:53:18,W 21 St & 6 Ave,6140.05,W 44 St & 11 Ave,6756.05,40.741740,-73.994156,40.762009,-73.996975,member
3,E9CBDC6A0162C068,electric_bike,2022-01-19 14:49:47,2022-01-19 14:54:02,38 St & 30 Ave,6850.01,Crescent St & 30 Ave,6958.06,40.764175,-73.915840,40.768692,-73.924957,member
4,2177A5B57326CE9B,electric_bike,2022-01-16 14:36:06,2022-01-16 14:44:06,Pacific St & Nevins St,4362.04,Clinton St & Tillary St,4748.07,40.685376,-73.983021,40.696233,-73.991421,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4733722,231398EE256269B0,classic_bike,2022-08-17 17:48:50,2022-08-17 18:06:13,27 St & Hunter St,6310.06,E 54 St & 1 Ave,6608.09,40.748500,-73.941275,40.756265,-73.964179,member
4733723,2F9EC3A7F60523B1,classic_bike,2022-08-08 23:02:58,2022-08-08 23:19:10,W 22 St & 10 Ave,6306.06,W 15 St & 6 Ave,5989.02,40.746920,-74.004519,40.738046,-73.996430,member
4733724,094B17D42252E33E,classic_bike,2022-08-27 11:10:59,2022-08-27 11:14:03,27 St & Hunter St,6310.06,11 St & 43 Ave,6438.04,40.748500,-73.941275,40.751907,-73.947912,member
4733725,9CEB6714CBEC7386,classic_bike,2022-08-29 22:55:27,2022-08-29 23:13:10,Washington Pl & Broadway,5755.01,Canal St & Rutgers St,5303.08,40.729039,-73.994046,40.714275,-73.989900,member


In [9]:
# Confirm that no missing values remain anywhere in the frame.
has_missing = combined_df.isna().to_numpy().any()
print(
    "There are NaN values in the DataFrame."
    if has_missing
    else "There are no NaN values in the DataFrame."
)

There are no NaN values in the DataFrame.


In [None]:
# Save the cleaned trips as a CSV in the working directory; index=False keeps
# the row index out of the file.
combined_df.to_csv('cleaned_data_Citibike.csv', index=False)