In [2]:
import pandas as pd
import json
import hashlib

# Load dataset
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
df = pd.read_csv(url)

# Basic preview
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


(   total_bill   tip     sex smoker  day    time  size
 0       16.99  1.01  Female     No  Sun  Dinner     2
 1       10.34  1.66    Male     No  Sun  Dinner     3
 2       21.01  3.50    Male     No  Sun  Dinner     3
 3       23.68  3.31    Male     No  Sun  Dinner     2
 4       24.59  3.61  Female     No  Sun  Dinner     4,
 None)

In [4]:
# Basic data quality checks

# 1) Check for missing values
missing_summary = df.isnull().sum()

# 2) Validate numeric columns are >= 0
numeric_cols = ["total_bill", "tip", "size"]
numeric_validity = {col: (df[col] >= 0).all() for col in numeric_cols}

# 3) Validate categorical column values
valid_sex_values = set(["Male", "Female"])
sex_column_valid = df["sex"].isin(valid_sex_values).all()

# Print results
print("Missing Values per Column:")
print(missing_summary, "\n")

print("Numeric Columns Validity (>= 0):")
print(numeric_validity, "\n")

print("Sex Column Valid:", sex_column_valid)


Missing Values per Column:
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64 

Numeric Columns Validity (>= 0):
{'total_bill': np.True_, 'tip': np.True_, 'size': np.True_} 

Sex Column Valid: True


In [5]:
import json, hashlib

lineage_record = {
    "source_url": url,
    "rows": len(df),
    "columns": list(df.columns),
    "hash": hashlib.md5(df.to_json().encode()).hexdigest()
}

with open("lineage_log.json", "w") as f:
    json.dump(lineage_record, f, indent=2)
print("Lineage logged.")


Lineage logged.


In [6]:
clean_df = df.dropna().drop_duplicates()
clean_df.to_csv("clean_tips_data.csv", index=False)
print("Clean dataset exported.")


Clean dataset exported.
