In [1]:
import sys

# Make the project root importable so `src.*` modules resolve from this notebook.
PROJECT_ROOT = '../'  # or the relative path from your notebook to your project root
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [2]:
import pandas as pd
from src.transform import DataCleaner

In [3]:
# Load the combined CSV from the processed-data folder into a DataFrame.
combined_csv_path = "../data/processed/combined.csv"
df = pd.read_csv(combined_csv_path)

In [4]:
# Print a summary of the loaded frame (index range, columns, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Date of sale              53 non-null     object
 1   Time of sale              53 non-null     object
 2   Date of listing           53 non-null     object
 3   Bundle                    53 non-null     object
 4   Buyer                     53 non-null     object
 5   Brand                     53 non-null     object
 6   Description               53 non-null     object
 7   Size                      53 non-null     object
 8   Item price                53 non-null     object
 9   Buyer shipping cost       53 non-null     object
 10  Total                     53 non-null     object
 11  USPS Cost                 53 non-null     object
 12  Depop fee                 53 non-null     object
 13  Depop Payments fee        53 non-null     object
 14  Boosting fee              53

In [5]:
# Show every distinct value in the State column. In the recorded output the
# column mixes two-letter codes with full names (e.g. 'UT' and 'Utah').
state_column = df['State']
states = state_column.unique()
print(states)

['IA' 'Utah' 'AZ' 'CA' 'UT' 'OH' 'MS' 'TX' 'MA' 'California' 'IL' 'MO'
 'KS' 'NV' 'PA' 'NJ' 'Washington' 'CO' 'CT' 'DE' 'FL' 'Kansas' 'RI' 'WV'
 'NY' 'KY' 'NC' 'IN']


In [6]:
# Clean your data.
# The steps run in the order written; each call is chained on the result of the
# previous one, and get_data() supplies the final frame.
cleaner = DataCleaner(df)

# Stage 1: remove sensitive columns and normalize state names.
normalized = cleaner.drop_sensitive().normalize_states()

# Stage 2: convert date and numeric columns to proper types.
typed = normalized.convert_dates().convert_numerics()

# Stage 3: fill gaps (sales tax by state, zero-filled columns, remaining missing values).
filled = typed.fill_sales_tax_by_state().fill_columns_with_zero().fill_missing()

# Stage 4: drop duplicate rows and pull out the resulting DataFrame.
cleaned_df = filled.drop_duplicates().get_data()

# Preview and/or save
preview = cleaned_df.head()
print(preview)
cleaned_df.to_csv("../data/processed/cleaned.csv", index=False)


Finished dropping sensitive columns.
Finished normalizing state names.
Finished converting date columns to datetime.
Finished converting numeric columns to float.
Filled missing 'US Sales tax' with state sales tax rates.
Finished filling refund columns with zero
Finished filling missing values.
Finished dropping duplicates.
  Date of sale Time of sale Date of listing Bundle           Brand  \
0   2024-07-01      7:16 PM      2024-06-16     No           Other   
1   2024-08-05     10:39 PM      2024-06-16     No         Starter   
2   2024-08-06      3:13 AM      2024-07-21     No           Other   
3   2024-08-07      4:42 AM      2024-07-21     No  Hard Rock Cafe   
4   2024-08-08      2:05 AM      2024-08-07     No           Other   

                                         Description Size  Item price  \
0  Limited NYC 2023 Dragon Ball Daima Special Pan...   XL        24.0   
1  90s Vintage Starter Denver Broncos Superbowl X...    L        25.0   
2  NYPD shirt\n\nSize: Medium\nMea

In [7]:
# Pandas to validate cleaned data: reload the CSV written by the cleaning cell
# and inspect the first rows, the schema, and the per-column null counts.
# NOTE(review): this rebinds `df`, which earlier cells use for the raw combined
# data; re-running those cells afterwards will operate on the cleaned frame.
df = pd.read_csv("../data/processed/cleaned.csv")
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()
print(df.isnull().sum())

  Date of sale Time of sale Date of listing Bundle           Brand  \
0   2024-07-01      7:16 PM      2024-06-16     No           Other   
1   2024-08-05     10:39 PM      2024-06-16     No         Starter   
2   2024-08-06      3:13 AM      2024-07-21     No           Other   
3   2024-08-07      4:42 AM      2024-07-21     No  Hard Rock Cafe   
4   2024-08-08      2:05 AM      2024-08-07     No           Other   

                                         Description Size  Item price  \
0  Limited NYC 2023 Dragon Ball Daima Special Pan...   XL        24.0   
1  90s Vintage Starter Denver Broncos Superbowl X...    L        25.0   
2  NYPD shirt\n\nSize: Medium\nMeasurements: 20 W...    M        10.0   
3  Hard Rock Cafe grey and white shirt\n\nSize: L...    L         8.0   
4                         Polo shirt\n\nSize: Medium    M         6.0   

   Buyer shipping cost  Total  ...  Boosting fee  Payment type     Category  \
0                 6.29  31.97  ...           0.0        STRIP