# DATA CLEANING

### String to Int | Delete $ (Strip)

In [None]:
# Strip the currency symbol from both ends of each Revenue string,
# then cast the cleaned strings to integers.
revenue_without_symbol = sales['Revenue'].str.strip('$')
sales['Revenue'] = revenue_without_symbol
sales['Revenue'] = sales['Revenue'].astype('int')

In [None]:
# Sanity check: the cast above must have produced an integer column.
# The assert raises AssertionError if it did not.
revenue_dtype = sales['Revenue'].dtype
assert revenue_dtype == 'int'

## The assert statement

In [19]:
# A true condition makes assert a no-op, so this statement passes silently
assert 1 + 1 == 2

## Numeric or categorical?

In [None]:
# Store marriage_status as a pandas categorical instead of its current dtype
marriage_as_category = df["marriage_status"].astype('category')
df["marriage_status"] = marriage_as_category

# Summary statistics of the frame after the conversion
df.describe()

## Can future sign-ups exist?

In [None]:
# Import the datetime module; only calendar dates are needed for this check
import datetime as dt

# Capture today's date once so every comparison uses the same reference value
today_date = dt.date.today()

# Select the rows whose subscription date lies after today.
# NOTE(review): the comparison assumes 'subscription_date' already holds
# datetime.date objects - confirm the column has been converted first.
user_signups[user_signups['subscription_date'] > today_date]

### DROP VALUES (TWO WAYS)

In [None]:
# Two ways to remove rows whose avg_rating is above the maximum of 5.

# Drop values using filtering: keep only the rows at or below 5.
# Rows with a missing avg_rating are also removed here, because a
# comparison against NaN evaluates to False.
movies = movies[movies['avg_rating'] <= 5]

# Drop values using .drop(): collect the index labels of the offending rows
# and remove them in place.
movies.drop(movies[movies['avg_rating'] > 5].index, inplace=True)

## Check deleting values

In [None]:
# Assert results: after dropping, no rating may exceed the upper bound of 5
assert movies['avg_rating'].max() <= 5

## Third Way - Replace Values

In [None]:
# Convert avg_rating > 5 to 5: cap out-of-range ratings at the maximum
# instead of dropping the rows
movies.loc[movies['avg_rating'] > 5, 'avg_rating'] = 5

# Cleaning Date

## Convert to Date

In [None]:
# Convert to date: parse the column with pandas, then keep only the
# calendar-date part (datetime.date objects) via the .dt accessor
user_signups['subscription_date'] = pd.to_datetime(user_signups['subscription_date']).dt.date

In [None]:
# Reference date for the comparisons below. The filters use the name
# `today_date`, so it is defined here to keep the cell self-contained.
today_date = dt.date.today()
today = today_date  # original name of this variable, kept for compatibility

# Drop the data
# Drop values using filtering: keep only sign-ups dated today or earlier
user_signups = user_signups[user_signups['subscription_date'] <= today_date]

# Drop values using .drop(): remove, in place, the rows dated after today
user_signups.drop(user_signups[user_signups['subscription_date'] > today_date].index, inplace=True)

In [None]:
# Replace (rather than drop) future dates: any subscription_date later than
# today_date is overwritten with today_date
is_future_signup = user_signups['subscription_date'] > today_date
user_signups.loc[is_future_signup, 'subscription_date'] = today_date

## Duplicate

In [None]:
# Boolean Series flagging rows that repeat an earlier row in every column
# (subset=None and keep='first' are the pandas defaults, spelled out here)
duplicates = height_weight.duplicated(subset=None, keep='first')
print(duplicates)

True, False

In [None]:
# Use the boolean `duplicates` mask to display the duplicated rows.
# Indexing with the string 'duplicates' would look up a column of that name
# instead of applying the mask.
height_weight[duplicates]

### Check duplicated (3 columns)

In [None]:
# Columns that together identify a record for the duplicate check
column_names = ['first_name', 'last_name', 'address']

# keep=False flags every member of a duplicated group, not just the repeats
duplicates = height_weight.duplicated(keep=False, subset=column_names)

In [None]:
# Display the flagged rows ordered by first name so that members of the
# same duplicated group appear next to each other
duplicated_rows = height_weight[duplicates]
duplicated_rows.sort_values(by='first_name')

### Drop complete duplicates

In [None]:
# Remove rows that are identical across all columns, keeping the first
# occurrence (the pandas default, written out explicitly); modifies
# height_weight in place
height_weight.drop_duplicates(keep='first', inplace=True)

### A statistical measure to combine each set of duplicated values

#### The groupby and agg() methods

In [None]:
# Group by column names and produce statistical summaries
column_names = ['first_name', 'last_name', 'address']

# One aggregation per measured column: collapse each group of duplicates to
# its maximum height and mean weight
summaries = {'height': 'max', 'weight': 'mean'}

# reset_index() turns the group keys back into ordinary columns
height_weight = height_weight.groupby(by=column_names).agg(summaries).reset_index()

In [None]:
# Re-run the duplicate check on the aggregated frame; if the aggregation
# worked, the selection below shows no rows
duplicates = height_weight.duplicated(keep=False, subset=column_names)
remaining_duplicates = height_weight[duplicates]
remaining_duplicates.sort_values(by='first_name')

# Membership constraints

### Finding inconsistent categories (not in category)

In [None]:
# Blood types that occur in study_data but are absent from the reference
# `categories` table (set difference)
inconsistent_categories = set(study_data['blood_type']).difference(categories['blood_type'])
print(inconsistent_categories)

In [None]:
# Boolean mask: True where the row's blood type is one of the values
# collected in inconsistent_categories
blood_types = study_data['blood_type']
inconsistent_rows = blood_types.isin(inconsistent_categories)

# Display the offending rows
study_data[inconsistent_rows]

### Dropping inconsistent categories

In [None]:
# Invert the inconsistency mask to keep only the rows with a valid category
rows_to_keep = ~inconsistent_rows
consistent_data = study_data[rows_to_keep]

### Value consistency

In [None]:
# Capitalize: upper-case every value so that differently-cased spellings
# collapse into one category
marriage_status['marriage_status'] = marriage_status['marriage_status'].str.upper()
marriage_status['marriage_status'].value_counts()
# UNMARRIED MARRIED

# Lowercase: the same normalization, using lower case instead
marriage_status['marriage_status'] = marriage_status['marriage_status'].str.lower()
marriage_status['marriage_status'].value_counts()

### Leading and trailing spaces: 'married ', ' married', 'unmarried ', ' unmarried'.

In [None]:
# Get marriage status column
marriage_status = demographics['marriage_status']

# Count the occurrences of each distinct value
marriage_status.value_counts()

In [None]:
# Strip leading and trailing whitespace from every value.
# Assign the result back to the column: assigning it to `demographics`
# itself would replace the whole DataFrame with a single Series.
demographics['marriage_status'] = demographics['marriage_status'].str.strip()
demographics['marriage_status'].value_counts()

### CREATE CATEGORY COLUMN

### Create categories out of data: income_group column from income column.

In [None]:
# Using cut() - create category ranges and names.
# Three bins need four edges; np.inf leaves the top bin open-ended.
ranges = [0, 200000, 500000, np.inf]
group_names = ['0-200K', '200K-500K', '500K+']

# Create income group column
demographics['income_group'] = pd.cut(demographics['household_income'], bins=ranges,
                                      labels=group_names)

# Show the new category next to the value it was derived from
demographics[['income_group', 'household_income']]

### Map categories to fewer ones: reducing categories in a categorical column. The operating_system column is: 'Microsoft', 'MacOS', 'IOS', 'Android', 'Linux'. The operating_system column should become: 'DesktopOS', 'MobileOS'.

In [None]:
# Create mapping dictionary and replace.
# The target labels must be spelled identically (letter "O", not digit "0"),
# otherwise the column ends up with more than the two intended categories.
mapping = {
    'Microsoft': 'DesktopOS', 'MacOS': 'DesktopOS', 'Linux': 'DesktopOS',
    'IOS': 'MobileOS', 'Android': 'MobileOS',
}
devices['operating_system'] = devices['operating_system'].replace(mapping)

# Check which categories remain after the mapping
devices['operating_system'].unique()
array(['DesktopOS', 'MobileOS'], dtype=object)

In [None]:
# Show the raw category values of both columns before cleaning
for column in ('dest_region', 'dest_size'):
    print(airlines[column].unique())

# Normalize dest_region: lower-case every value, then expand "eur" to "europe"
airlines['dest_region'] = (
    airlines['dest_region']
    .str.lower()
    .replace({'eur': 'europe'})
)

# Trim surrounding white space from dest_size
trimmed_size = airlines['dest_size'].str.strip()
airlines['dest_size'] = trimmed_size

# Print both columns again to confirm the changes took effect
for column in ('dest_size', 'dest_region'):
    print(airlines[column].unique())

### example

In [None]:
# Bin edges and matching labels for the waiting-time categories
label_ranges = [0, 60, 180, np.inf]
label_names = ['short', 'medium', 'long']

# Bucket wait_min into the labelled ranges to build the wait_type column
wait_minutes = airlines['wait_min']
airlines['wait_type'] = pd.cut(wait_minutes, labels=label_names, bins=label_ranges)

# Map every day name to its kind of day (weekday vs. weekend)
mappings = {
    **dict.fromkeys(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'], 'weekday'
    ),
    **dict.fromkeys(['Saturday', 'Sunday'], 'weekend'),
}

# Derive day_week from the day column and show the resulting frame
day_kind = airlines['day'].replace(mappings)
airlines['day_week'] = day_kind
print(airlines)

In [None]:
        id        day        airline        destination    dest_region  ...     cleanliness         safety        satisfaction  wait_type day_week
0     1351    Tuesday    UNITED INTL             KANSAI           Asia  ...           Clean        Neutral      Very satisfied     medium  weekday
1      373     Friday         ALASKA  SAN JOSE DEL CABO  Canada/Mexico  ...           Clean      Very safe      Very satisfied     medium  weekday
2     2820   Thursday          DELTA        LOS ANGELES        West US  ...         Average  Somewhat safe             Neutral     medium  weekday
3     1157    Tuesday      SOUTHWEST        LOS ANGELES        West US  ...           Clean      Very safe  Somewhat satsified       long  weekday
4     2992  Wednesday       AMERICAN              MIAMI        East US  ...  Somewhat clean      Very safe  Somewhat satsified       long  weekday
...    ...        ...            ...                ...            ...  ...             ...            ...                 ...        ...      ...
2804  1475    Tuesday         ALASKA       NEW YORK-JFK        East US  ...  Somewhat clean        Neutral  Somewhat satsified       long  weekday
2805  2222   Thursday      SOUTHWEST            PHOENIX        West US  ...           Clean      Very safe      Very satisfied     medium  weekday
2806  2684     Friday         UNITED            ORLANDO        East US  ...           Clean      Very safe      Very satisfied     medium  weekday
2807  2549    Tuesday        JETBLUE         LONG BEACH        West US  ...           Clean  Somewhat safe      Very satisfied     medium  weekday
2808  2162   Saturday  CHINA EASTERN            QINGDAO           Asia  ...           Clean      Very safe  Somewhat satsified       long  weekend

[2477 rows x 14 columns]