# Cleaning Data in Python

## Data type constraints

In [None]:
# for pandas - convert a string column to int
# Strip '$' characters from both ends of every Revenue value and cast the
# cleaned text to integer in a single chained expression.
sales['Revenue'] = sales['Revenue'].str.strip('$').astype('int')

# Sanity check: the column must now report an integer dtype
assert sales['Revenue'].dtype == 'int'

In [None]:
import pandas as pd
# Load the ride-sharing data from a comma-separated file
filename = "ride_sharing_new.csv"
ride_sharing = pd.read_csv(filename, sep=',')

# Build both derived columns in one step:
#   duration_trim - 'duration' text with the characters m, i, n, u, t, e, s
#                   removed from both ends (.str.strip takes a set of
#                   characters, not a literal word)
#   duration_time - the trimmed text cast to integer
ride_sharing = ride_sharing.assign(
    duration_trim=lambda frame: frame['duration'].str.strip('minutes'),
    duration_time=lambda frame: frame['duration_trim'].astype('int'),
)

# Stop here if the conversion did not produce an integer column
assert ride_sharing['duration_time'].dtype == 'int'

# Show the original column next to the two derived ones, then the mean duration
print(ride_sharing[['duration', 'duration_trim', 'duration_time']])
print(ride_sharing['duration_time'].mean())

## Data range constraints

### How to deal with out of range data?
- Dropping data
- Setting custom minimums and maximums
- Treating as missing and imputing
- Setting custom value depending on business assumptions

In [None]:
import pandas as pd
import datetime as dt
# Inspect the rows whose average rating exceeds the maximum of 5
movies[movies['avg_rating'].gt(5)]

# Option 1 - filtering: keep only the rows rated 5 or below
movies = movies[movies['avg_rating'].le(5)]

# Option 2 - .drop(): remove the offending rows by their index labels
movies.drop(movies[movies['avg_rating'].gt(5)].index, inplace=True)

# Option 3 - capping: overwrite any rating above 5 with 5
movies.loc[movies['avg_rating'].gt(5), 'avg_rating'] = 5

# Parse the sign-up column and reduce it to plain datetime.date objects so it
# can be compared with today's date below
user_signups['subcription_date'] = pd.to_datetime(user_signups['subcription_date']).dt.date

# Reference point: any sign-up dated after today is out of range
today_date = dt.date.today()

# Drop values using filtering
# NOTE(review): this keeps rows strictly before today, while the .drop()
# variant below removes rows strictly after today, so the two options differ
# for sign-ups dated today - confirm which boundary is intended.
user_signups = user_signups[user_signups['subcription_date'] < today_date]

# Drop values using .drop()
# The column name must be a quoted string; the bare name raised NameError.
user_signups.drop(
    user_signups[user_signups['subcription_date'] > today_date].index,
    inplace=True,
)

## Uniqueness Constraints

In [None]:
# Boolean Series over all columns: True for every row that repeats an earlier
# row (the first occurrence stays False)
duplicates = height_weight.duplicated(subset=None, keep='first')

# Select only the rows flagged as duplicates
height_weight.loc[duplicates]

### How to find duplicate rows?

The .duplicated() method
- subset: List of column names to check for duplications
- keep: Which occurrence is treated as the original and left unflagged: the first ('first'), the last ('last'), or none, so that every duplicated row is flagged (False)

The .drop_duplicates() method
- subset: List of column names to check for duplications
- keep: Which occurrence to keep: the first ('first'), the last ('last'), or none, so that every duplicated row is dropped (False)
- inplace: Drop duplicated rows directly inside the DataFrame without creating a new object (True)

In [None]:
import pandas as pd
filename = "ride_sharing_new.csv"

# Columns whose combined values decide whether two rows count as duplicates
column_names = ['first_name', 'last_name', 'address']

# keep=False flags every occurrence in a duplicated group, not just the repeats
duplicates = height_weight.duplicated(subset=column_names, keep=False)

# View the flagged rows ordered by first name
height_weight.loc[duplicates].sort_values('first_name')

# Remove rows that are identical in every column, modifying the frame in place
height_weight.drop_duplicates(inplace=True)

# Collapse the remaining duplicate groups into one row each:
# the largest height and the average weight per group
summaries = {'height': 'max', 'weight': 'mean'}
height_weight = (
    height_weight
    .groupby(column_names)
    .agg(summaries)
    .reset_index()
)

In [None]:
# Load the dataset (once; the path is defined before it is used so the cell
# does not depend on a variable left over from an earlier cell)
filename = "ride_sharing_new.csv"
ride_sharing = pd.read_csv(filename, sep=',')

# Find duplicates: keep=False flags every row whose ride_id occurs more than once
duplicates = ride_sharing.duplicated(subset='ride_id', keep=False)

# Sort your duplicated rides so rows sharing a ride_id sit next to each other
duplicated_rides = ride_sharing[duplicates].sort_values('ride_id')

# Print relevant columns of duplicated_rides
print(duplicated_rides[['ride_id', 'duration', 'user_birth_year']])

In [None]:
# Remove rows of ride_sharing that are identical in every column
ride_dup = ride_sharing.drop_duplicates()

# Aggregation applied to each group of rows sharing a ride_id:
# the smallest birth year and the average duration
# NOTE(review): 'mean' presumably requires 'duration' to be numeric here - confirm
statistics = {'user_birth_year': 'min', 'duration': 'mean'}

# One row per ride_id, carrying the aggregated values
ride_unique = (
    ride_dup
    .groupby('ride_id')
    .agg(statistics)
    .reset_index()
)

# Look for ride_id values that still occur more than once
duplicates = ride_unique.duplicated(subset='ride_id', keep=False)
duplicated_rides = ride_unique[duplicates]

# No duplicated ride_id may remain after aggregation
assert len(duplicated_rides) == 0