## Cleaning data in Python

+ Using `assert` statements to verify data quality
+ Some data might appear to be numeric but actually be categorical
  + e.g. `marriage_status`: 0 = never married, 1 = married, 2 = separated, 3 = divorced
  + use `.astype('category')` to convert such a column, as described in the **Working with Categorical Data in Python** course.
  + after using `.astype('category')` on a categorical column, `describe()` gives a more meaningful summary (count, unique, top, freq) instead of numeric statistics such as the mean

In [1]:
import pandas as pd

# Load the ride-sharing CSV from the local data directory into a DataFrame
df_ride_sharing = pd.read_csv('./data/ride_sharing_new.csv')
# Last expression of the cell: display the first rows of the frame as a preview
df_ride_sharing.head()

Unnamed: 0.1,Unnamed: 0,duration,station_A_id,station_A_name,station_B_id,station_B_name,bike_id,user_type,user_birth_year,user_gender
0,0,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,5480,2,1959,Male
1,1,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,5193,2,1965,Male
2,2,8 minutes,67,San Francisco Caltrain Station 2 (Townsend St...,23,The Embarcadero at Steuart St,3652,3,1993,Male
3,3,4 minutes,16,Steuart St at Market St,28,The Embarcadero at Bryant St,1883,1,1979,Male
4,4,11 minutes,22,Howard St at Beale St,350,8th St at Brannan St,4626,2,1994,Male


In [2]:
# Print the information of ride_sharing (column names, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_ride_sharing.info()
print()  # blank line separating the two reports

# Print summary statistics of user_type column.
# user_type is read as an integer column, so describe() reports numeric
# statistics (mean, std, quartiles) even though the values are category codes.
print(df_ride_sharing['user_type'].describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25760 entries, 0 to 25759
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       25760 non-null  int64 
 1   duration         25760 non-null  object
 2   station_A_id     25760 non-null  int64 
 3   station_A_name   25760 non-null  object
 4   station_B_id     25760 non-null  int64 
 5   station_B_name   25760 non-null  object
 6   bike_id          25760 non-null  int64 
 7   user_type        25760 non-null  int64 
 8   user_birth_year  25760 non-null  int64 
 9   user_gender      25760 non-null  object
dtypes: int64(6), object(4)
memory usage: 2.0+ MB
None 

count    25760.000000
mean         2.008385
std          0.704541
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: user_type, dtype: float64


In [3]:
# Print the information of ride_sharing (column names, non-null counts, dtypes).
# DataFrame.info() prints its own report and returns None, so call it directly
# instead of handing the None result to print().
df_ride_sharing.info()
print()  # blank line separating the reports

# Print summary statistics of user_type column while it is still numeric
print(df_ride_sharing['user_type'].describe(), "\n")

# Convert user_type from integer to category.
# The result goes into a new user_type_cat column so the original integer
# column stays available for comparison.
df_ride_sharing['user_type_cat'] = df_ride_sharing['user_type'].astype('category')

# Write an assert statement confirming the change
assert df_ride_sharing['user_type_cat'].dtype == 'category', \
    "user_type_cat should have the category dtype"

# Print new summary statistics (count / unique / top / freq for a categorical)
print(df_ride_sharing['user_type_cat'].describe(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25760 entries, 0 to 25759
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       25760 non-null  int64 
 1   duration         25760 non-null  object
 2   station_A_id     25760 non-null  int64 
 3   station_A_name   25760 non-null  object
 4   station_B_id     25760 non-null  int64 
 5   station_B_name   25760 non-null  object
 6   bike_id          25760 non-null  int64 
 7   user_type        25760 non-null  int64 
 8   user_birth_year  25760 non-null  int64 
 9   user_gender      25760 non-null  object
dtypes: int64(6), object(4)
memory usage: 2.0+ MB
None 

count    25760.000000
mean         2.008385
std          0.704541
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: user_type, dtype: float64 

count     25760
unique        3
top           2
freq      12972
Name: user_type_cat,

In [4]:
# Derive both duration columns in one chained step; assign() evaluates the
# keyword arguments in order, so duration_time can read duration_trim.
#
# NOTE: str.strip('minutes') treats its argument as a *set of characters*
# (m, i, n, u, t, e, s) removed from both ends of each string, not as a
# suffix. For values shaped like "12 minutes" that leaves the leading number
# (plus the separating space), which astype(int) then parses.
df_ride_sharing = df_ride_sharing.assign(
    # Strip duration of minutes
    duration_trim=lambda frame: frame['duration'].str.strip('minutes'),
    # Convert duration to integer
    duration_time=lambda frame: frame['duration_trim'].astype(int),
)

# Guard: the conversion must have produced the default integer dtype
assert df_ride_sharing['duration_time'].dtype == 'int'

## Data range constraints

+ To follow along with the lectures, the `tire_sizes` and `ride_date` columns needed to be added to the data
  + The code that adds these columns is commented out because it only needed to be run once.


In [5]:
# Report the (rows, columns) shape of the frame at this point in the notebook
print(df_ride_sharing.shape)
# Last expression of the cell: display the column index to list the column names
df_ride_sharing.columns

(25760, 13)


Index(['Unnamed: 0', 'duration', 'station_A_id', 'station_A_name',
       'station_B_id', 'station_B_name', 'bike_id', 'user_type',
       'user_birth_year', 'user_gender', 'user_type_cat', 'duration_trim',
       'duration_time'],
      dtype='object')

In [6]:
import numpy as np
import random

# # add a tire_sizes column (because it doesn't exist in the original data) to illustrate
# tire_sizes = [26, 27, 29]
# tire_size_probabilities = 100 * np.array([0.485, 0.313, 0.202])  # these were the proportions found on the exercise data
# df_ride_sharing['tire_sizes'] = random.choices(tire_sizes, weights=tuple(tire_size_probabilities), k=df_ride_sharing.shape[0])
# # check that values follow specified distribution
# df_ride_sharing['tire_sizes'].value_counts(normalize=True)  # looks good:
# tire_sizes
# 26    0.487189
# 27    0.314946
# 29    0.197865

In [7]:
# write df with new column added
# df_ride_sharing.to_csv('./data/ride_sharing.csv')

In [8]:
# Reload the ride sharing data from the CSV that contains the tire_sizes column
df_ride_sharing = pd.read_csv('./data/ride_sharing.csv')

# Enforce the upper range limit on tire_sizes in one chained expression:
#   1. cast to integer so the values can be compared numerically
#   2. cap every value above 27 at 27
#   3. store the result as a categorical column
df_ride_sharing['tire_sizes'] = (
    df_ride_sharing['tire_sizes']
    .astype('int')
    .clip(upper=27)
    .astype('category')
)

# Show the categorical summary of the capped tire sizes
print(df_ride_sharing['tire_sizes'].describe())

count     25760
unique        2
top          27
freq      13210
Name: tire_sizes, dtype: int64


In [9]:
import datetime as dt

# generate random dates between 5 years ago and 1 year into the future
# which is roughly what the exercise data was found to be
# rand_dates = []
# today_date = dt.date.today()
# for date_row in range(df_ride_sharing.shape[0]):
#     # https://stackoverflow.com/questions/553303/how-to-generate-a-random-date-between-two-other-dates#61383231
#     rand_delta = random.randint(-5*365, 365)
#     rand_date = today_date + dt.timedelta(days=rand_delta)
#     rand_dates.append(rand_date.strftime("%Y-%m-%d"))

# df_ride_sharing['ride_date'] = rand_dates
# print(df_ride_sharing['ride_date'].iloc[:11])  # looks good

In [10]:
# write df with new column added
# df_ride_sharing.to_csv('./data/ride_sharing.csv')

In [11]:
# Convert ride_date to date
# (parse with pandas, then keep only the calendar-date part via the .dt.date
# accessor; the result is stored in a new ride_dt column)
df_ride_sharing['ride_dt'] = pd.to_datetime(df_ride_sharing['ride_date']).dt.date

# Save today's date
# (`dt` here is the datetime module imported as `dt`; the value depends on the
# day the cell is run, so the output below is not reproducible across days)
today = dt.date.today()

# Set all in the future to today's date
# (boolean-mask assignment through .loc updates only the rows dated after today)
df_ride_sharing.loc[df_ride_sharing['ride_dt'] > today, 'ride_dt'] = today

# Print maximum of ride_dt column
# (sanity check: after the cap above, no value should be later than `today`)
print(df_ride_sharing['ride_dt'].max())

2025-04-02


## Uniqueness constraints - dealing with duplicates

+ **TODO** need to add `ride_id` column to test code in next 2 cells

In [12]:
# Find duplicates
# duplicates = ride_sharing.duplicated(['ride_id'], keep=False)

# Sort your duplicated rides
# duplicated_rides = ride_sharing[duplicates].sort_values('ride_id')

# Print relevant columns of duplicated_rides
# print(duplicated_rides[['ride_id','duration','user_birth_year']])

In [13]:
# Drop complete duplicates from ride_sharing
# ride_dup = df_ride_sharing.drop_duplicates(inplace=False)

# Create statistics dictionary for aggregation function
# statistics = {'user_birth_year': 'min', 'duration': 'mean'}

# Group by ride_id and compute new statistics
# ride_unique = ride_dup.groupby('ride_id').agg(statistics).reset_index()

# Find duplicated values again
# duplicates = ride_unique.duplicated(subset = 'ride_id', keep = False)
# duplicated_rides = ride_unique[duplicates == True]

# Assert duplicates are processed
# assert duplicated_rides.shape[0] == 0

KeyError: 'ride_id'