# Python for DE - Transforming Data

In [1]:
print(
    "------------------------------------------------------------------------------"
)
print("Use standard python libraries to do the transformations")
print(
    "------------------------------------------------------------------------------"
)

------------------------------------------------------------------------------
Use standard python libraries to do the transformations
------------------------------------------------------------------------------


In [2]:
# Question: How do you read data from a CSV file at ./data/sample_data.csv into a list of dictionaries?
import csv
input_csv = "sample_data.csv"

with open(input_csv, "r", encoding="utf-8") as f:
    csv_reader = csv.DictReader(f)
    
    # Ensure headers exist
    headers = csv_reader.fieldnames
    if headers is None:
        raise ValueError("CSV file has no headers!")

    data = [row for row in csv_reader]

data

[{'Customer_ID': '1',
  'Customer_Name': 'Henry Jones',
  'Age': '32',
  'Gender': 'Male',
  'Purchase_Amount': '1080000.66',
  'Purchase_Date': '2023-08-15'},
 {'Customer_ID': '2',
  'Customer_Name': 'Emma Rodriguez',
  'Age': '24',
  'Gender': 'Male',
  'Purchase_Amount': '62.4',
  'Purchase_Date': '2024-04-16'},
 {'Customer_ID': '3',
  'Customer_Name': 'Frank Martinez',
  'Age': '20',
  'Gender': 'Female',
  'Purchase_Amount': '443.47',
  'Purchase_Date': '2024-05-16'},
 {'Customer_ID': '4',
  'Customer_Name': 'Alice Rodriguez',
  'Age': '62',
  'Gender': 'Female',
  'Purchase_Amount': '729.69',
  'Purchase_Date': '2024-01-05'},
 {'Customer_ID': '5',
  'Customer_Name': 'Frank Miller',
  'Age': '33',
  'Gender': 'Female',
  'Purchase_Amount': '651.2',
  'Purchase_Date': '2024-05-23'},
 {'Customer_ID': '6',
  'Customer_Name': 'Emma Garcia',
  'Age': '22',
  'Gender': 'Female',
  'Purchase_Amount': '477.56',
  'Purchase_Date': '2023-09-02'},
 {'Customer_ID': '7',
  'Customer_Name': 'Gr

In [3]:
# Question: How do you remove duplicate rows based on customer ID?
seen_ids = set()
filtered_data = []

for row in data:
    if row["Customer_ID"] not in seen_ids:
        seen_ids.add(row["Customer_ID"])
        filtered_data.append(row)

filtered_data

[{'Customer_ID': '1',
  'Customer_Name': 'Henry Jones',
  'Age': '32',
  'Gender': 'Male',
  'Purchase_Amount': '1080000.66',
  'Purchase_Date': '2023-08-15'},
 {'Customer_ID': '2',
  'Customer_Name': 'Emma Rodriguez',
  'Age': '24',
  'Gender': 'Male',
  'Purchase_Amount': '62.4',
  'Purchase_Date': '2024-04-16'},
 {'Customer_ID': '3',
  'Customer_Name': 'Frank Martinez',
  'Age': '20',
  'Gender': 'Female',
  'Purchase_Amount': '443.47',
  'Purchase_Date': '2024-05-16'},
 {'Customer_ID': '4',
  'Customer_Name': 'Alice Rodriguez',
  'Age': '62',
  'Gender': 'Female',
  'Purchase_Amount': '729.69',
  'Purchase_Date': '2024-01-05'},
 {'Customer_ID': '5',
  'Customer_Name': 'Frank Miller',
  'Age': '33',
  'Gender': 'Female',
  'Purchase_Amount': '651.2',
  'Purchase_Date': '2024-05-23'},
 {'Customer_ID': '6',
  'Customer_Name': 'Emma Garcia',
  'Age': '22',
  'Gender': 'Female',
  'Purchase_Amount': '477.56',
  'Purchase_Date': '2023-09-02'},
 {'Customer_ID': '7',
  'Customer_Name': 'Gr

In [4]:
# Question: How do you handle missing values by replacing them with 0?
for row in data:
    for key in row.keys():
        if row[key] in ['', None]:  # Handles both empty strings and None values
            row[key] = 0
            print(f"Edited row: {row}")

Edited row: {'Customer_ID': '99', 'Customer_Name': 'Alice Johnson', 'Age': 0, 'Gender': 'Female', 'Purchase_Amount': '781.83', 'Purchase_Date': '2023-06-04'}
Edited row: {'Customer_ID': '100', 'Customer_Name': 'Jack Garcia', 'Age': 0, 'Gender': 'Female', 'Purchase_Amount': '269.64', 'Purchase_Date': '2024-03-08'}


In [5]:
# Question: How do you remove outliers such as age > 100 or purchase amount > 1000?
data = [row for row in data if int(row['Age']) <= 100 and float(row['Purchase_Amount']) <= 1000]

In [6]:
# Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male)?
for row in data:
    if row['Gender'] == 'Female':
        row['Gender'] = 0
    else:
        row['Gender'] = 1

data[:5]

[{'Customer_ID': '2',
  'Customer_Name': 'Emma Rodriguez',
  'Age': '24',
  'Gender': 1,
  'Purchase_Amount': '62.4',
  'Purchase_Date': '2024-04-16'},
 {'Customer_ID': '3',
  'Customer_Name': 'Frank Martinez',
  'Age': '20',
  'Gender': 0,
  'Purchase_Amount': '443.47',
  'Purchase_Date': '2024-05-16'},
 {'Customer_ID': '4',
  'Customer_Name': 'Alice Rodriguez',
  'Age': '62',
  'Gender': 0,
  'Purchase_Amount': '729.69',
  'Purchase_Date': '2024-01-05'},
 {'Customer_ID': '5',
  'Customer_Name': 'Frank Miller',
  'Age': '33',
  'Gender': 0,
  'Purchase_Amount': '651.2',
  'Purchase_Date': '2024-05-23'},
 {'Customer_ID': '6',
  'Customer_Name': 'Emma Garcia',
  'Age': '22',
  'Gender': 0,
  'Purchase_Amount': '477.56',
  'Purchase_Date': '2023-09-02'}]

In [7]:
# Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns?
for row in data:
    name_parts = row['Customer_Name'].split()
    
    row['First_Name'] = name_parts[0]  # Always exists
    row['Last_Name'] = ' '.join(name_parts[1:]) if len(name_parts) > 1 else ''  # Join remaining parts or set empty
    
    del row['Customer_Name']

data[:3]

[{'Customer_ID': '2',
  'Age': '24',
  'Gender': 1,
  'Purchase_Amount': '62.4',
  'Purchase_Date': '2024-04-16',
  'First_Name': 'Emma',
  'Last_Name': 'Rodriguez'},
 {'Customer_ID': '3',
  'Age': '20',
  'Gender': 0,
  'Purchase_Amount': '443.47',
  'Purchase_Date': '2024-05-16',
  'First_Name': 'Frank',
  'Last_Name': 'Martinez'},
 {'Customer_ID': '4',
  'Age': '62',
  'Gender': 0,
  'Purchase_Amount': '729.69',
  'Purchase_Date': '2024-01-05',
  'First_Name': 'Alice',
  'Last_Name': 'Rodriguez'}]

In [8]:
# Question: How do you calculate the total purchase amount by Gender?
amount_by_gender = {
                    'male':0.0, 
                   'female':0.0
                   }
for row in data:
    purchase_amount = float(row['Purchase_Amount'])  # Convert once
    if row['Gender'] == 1: # male
        amount_by_gender['male'] += purchase_amount
    else:
        amount_by_gender['female'] += purchase_amount

print(f"Purchase Amount by Gender: {amount_by_gender}")

Purchase Amount by Gender: {'male': 26924.760000000002, 'female': 31251.170000000002}


In [9]:
# Question: How do you calculate the average purchase amount by Age group?
# assume age_groups is the grouping we want
# hint: Why do we convert to float?
age_groups = {"18-30": [], "31-40": [], "41-50": [], "51-60": [], "61-70": []}
for key in age_groups.keys():
    start_age, end_age = float(key.split('-')[0]), float(key.split('-')[1]) 
    for row in data:
        age = float(row['Age'])
        purchase_amount = float(row['Purchase_Amount'])
        # Check if the customer's age is within the range
        if start_age <= age <= end_age:
            age_groups[key].append(purchase_amount)  # Add purchase amount to the group

# Step 2: Calculate the average purchase amount for each age group
for key, purchases in age_groups.items():
    if purchases:  # Check if the list is not empty
        avg_purchase_amount = sum(purchases) / len(purchases)
        age_groups[key] = avg_purchase_amount
    else:
        age_groups[key] = 0  # No customers in this age group

print(age_groups)

{'18-30': 580.6584848484849, '31-40': 574.7383333333332, '41-50': 493.946, '51-60': 489.58444444444444, '61-70': 557.875}


In [10]:
# Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group?
your_total_purchase_amount_by_gender = amount_by_gender # your results should be assigned to this variable
average_purchase_by_age_group = age_groups # your results should be assigned to this variable

print(f"Total purchase amount by Gender: {your_total_purchase_amount_by_gender}")
print(f"Average purchase amount by Age group: {average_purchase_by_age_group}")

Total purchase amount by Gender: {'male': 26924.760000000002, 'female': 31251.170000000002}
Average purchase amount by Age group: {'18-30': 580.6584848484849, '31-40': 574.7383333333332, '41-50': 493.946, '51-60': 489.58444444444444, '61-70': 557.875}


In [11]:
print(
    "------------------------------------------------------------------------------"
)
print("Use DuckDB to do the transformations")
print(
    "------------------------------------------------------------------------------"
)

------------------------------------------------------------------------------
Use DuckDB to do the transformations
------------------------------------------------------------------------------


In [44]:
# Question: How do you connect to DuckDB and load data from a CSV file into a DuckDB table?
# Connect to DuckDB and load data
import csv
import duckdb

# CSV file path
input_csv = "sample_data.csv"

# Connect to DuckDB
duckdb_conn = duckdb.connect(database=":memory:", read_only=False)

duckdb_conn.execute("""
                    CREATE TABLE IF NOT EXISTS 
                    data
                    (Customer_ID INTEGER,
                    Customer_Name VARCHAR,
                    Age INTEGER,
                    Gender VARCHAR,
                    Purchase_amount FLOAT,
                    Purchase_date DATE
                    )
                    """)
duckdb_conn.execute(f"COPY data FROM {input_csv} WITH HEADER CSV")

duckdb_conn.execute("SELECT * FROM data").fetchall()

[(1, 'Henry Jones', 32, 'Male', 1080000.625, datetime.date(2023, 8, 15)),
 (2,
  'Emma Rodriguez',
  24,
  'Male',
  62.400001525878906,
  datetime.date(2024, 4, 16)),
 (3,
  'Frank Martinez',
  20,
  'Female',
  443.4700012207031,
  datetime.date(2024, 5, 16)),
 (4,
  'Alice Rodriguez',
  62,
  'Female',
  729.6900024414062,
  datetime.date(2024, 1, 5)),
 (5,
  'Frank Miller',
  33,
  'Female',
  651.2000122070312,
  datetime.date(2024, 5, 23)),
 (6,
  'Emma Garcia',
  22,
  'Female',
  477.55999755859375,
  datetime.date(2023, 9, 2)),
 (7,
  'Grace Jones',
  53,
  'Female',
  29.719999313354492,
  datetime.date(2023, 10, 1)),
 (8, 'Henry Smith', 41, 'Male', 291.2799987792969, datetime.date(2024, 2, 1)),
 (9,
  'Emma Jones',
  50,
  'Female',
  575.6900024414062,
  datetime.date(2023, 12, 11)),
 (10,
  'Ivy Martinez',
  44,
  'Female',
  30.959999084472656,
  datetime.date(2024, 5, 11)),
 (11,
  'Grace Miller',
  29,
  'Male',
  321.4200134277344,
  datetime.date(2023, 10, 25)),
 (12,

In [45]:
# Question: How do you remove duplicate rows based on customer ID in DuckDB?
duckdb_conn.execute("CREATE TABLE IF NOT EXISTS data_unique AS SELECT DISTINCT * FROM data")

duckdb_conn.execute("SELECT * FROM data_unique").fetchall()

[(29,
  'Charlie Garcia',
  42,
  'Female',
  696.1900024414062,
  datetime.date(2023, 7, 26)),
 (53,
  'Alice Miller',
  31,
  'Female',
  396.19000244140625,
  datetime.date(2023, 9, 17)),
 (78,
  'Charlie Miller',
  24,
  'Male',
  510.70001220703125,
  datetime.date(2024, 2, 4)),
 (87,
  'Henry Johnson',
  27,
  'Female',
  586.8200073242188,
  datetime.date(2023, 8, 2)),
 (100,
  'Jack Garcia',
  None,
  'Female',
  269.6400146484375,
  datetime.date(2024, 3, 8)),
 (13,
  'Charlie Jones',
  24,
  'Female',
  330.8699951171875,
  datetime.date(2023, 11, 11)),
 (35,
  'Grace Williams',
  58,
  'Male',
  139.00999450683594,
  datetime.date(2024, 4, 23)),
 (37, 'Ivy Miller', 37, 'Male', 787.0599975585938, datetime.date(2024, 5, 4)),
 (60,
  'David Davis',
  42,
  'Male',
  698.8200073242188,
  datetime.date(2023, 11, 21)),
 (65,
  'Bob Garcia',
  32,
  'Female',
  726.8699951171875,
  datetime.date(2024, 1, 6)),
 (66,
  'Alice Davis',
  68,
  'Male',
  377.1099853515625,
  datetime.da

In [46]:
# Question: How do you handle missing values by replacing them with 0 in DuckDB?
duckdb_conn.execute("""
                    CREATE TABLE data_unique_cleaned AS
                    SELECT
                        Customer_ID, 
                        Customer_Name,
                        CAST(COALESCE(CAST(Age AS INT), 0) AS INT) AS Age,
                        Gender,
                        CAST(COALESCE(CAST(Purchase_amount AS FLOAT), 0.0) AS FLOAT) AS Purchase_amount,
                        Purchase_date
                    FROM data_unique;
                    """)

<duckdb.duckdb.DuckDBPyConnection at 0x215feab1af0>

In [53]:
# Question: How do you remove outliers (e.g., age > 100 or purchase amount > 1000) in DuckDB?
duckdb_conn.execute("""
                    CREATE TABLE IF NOT EXISTS data_cleaned_outliers AS
                    SELECT
                    *
                    FROM data_unique_cleaned
                    WHERE age <= 100
                    AND purchase_amount <= 1000;
                    """)

duckdb_conn.execute("SELECT * FROM data_cleaned_outliers LIMIT 5").fetchall()

[(29,
  'Charlie Garcia',
  42,
  '0',
  696.1900024414062,
  datetime.date(2023, 7, 26)),
 (53, 'Alice Miller', 31, '0', 396.19000244140625, datetime.date(2023, 9, 17)),
 (78,
  'Charlie Miller',
  24,
  '1',
  510.70001220703125,
  datetime.date(2024, 2, 4)),
 (87, 'Henry Johnson', 27, '0', 586.8200073242188, datetime.date(2023, 8, 2)),
 (100, 'Jack Garcia', 0, '0', 269.6400146484375, datetime.date(2024, 3, 8))]

In [54]:
# Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male) in DuckDB?
query_1 = """
        UPDATE data_cleaned_outliers
        SET Gender = 0
        WHERE Gender = 'Female';
            """

query_2 = """
        UPDATE data_cleaned_outliers
        SET Gender = 1
        WHERE Gender = 'Male';
            """

duckdb_conn.execute(query_1)
duckdb_conn.execute(query_2)

duckdb_conn.execute("SELECT * FROM data_cleaned_outliers LIMIT 5").fetchall()

[(29,
  'Charlie Garcia',
  42,
  '0',
  696.1900024414062,
  datetime.date(2023, 7, 26)),
 (53, 'Alice Miller', 31, '0', 396.19000244140625, datetime.date(2023, 9, 17)),
 (78,
  'Charlie Miller',
  24,
  '1',
  510.70001220703125,
  datetime.date(2024, 2, 4)),
 (87, 'Henry Johnson', 27, '0', 586.8200073242188, datetime.date(2023, 8, 2)),
 (100, 'Jack Garcia', 0, '0', 269.6400146484375, datetime.date(2024, 3, 8))]

In [55]:
# Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns in DuckDB?
#duckdb_conn.execute("INSERT INTO data_cleaned_outliers VALUES (110, 'Joyan', 25, 1, 580.9, '2024-03-31')")
query = """
        SELECT
        Customer_ID, 
        split_part(Customer_Name,' ', 1) as First_Name,
        CASE
        WHEN split_part(Customer_Name,' ', 2) = ''
        THEN NULL
        ELSE split_part(Customer_Name,' ', 2) 
        END as Last_Name,
        Age,
        Gender,
        Purchase_amount,
        Purchase_date
        FROM 
        data_cleaned_outliers
        """
duckdb_conn.execute(f"CREATE TABLE IF NOT EXISTS data_transformed AS {query}")

<duckdb.duckdb.DuckDBPyConnection at 0x215feab1af0>

In [56]:
# Question: How do you calculate the total purchase amount by Gender in DuckDB?
query = """ 
        SELECT
        Gender,
        SUM(Purchase_amount) as total_purchase_amount
        FROM data_transformed
        GROUP BY gender
        """
duckdb_conn.execute(query).fetchall()

[('0', 28215.77996635437), ('1', 1104600.5150527954)]

In [57]:
# Question: How do you calculate the average purchase amount by Age group in DuckDB?
average_purchase_by_age_group = duckdb_conn.execute(
    "SELECT CASE \
                                             WHEN Age BETWEEN 18 AND 30 THEN '18-30' \
                                             WHEN Age BETWEEN 31 AND 40 THEN '31-40' \
                                             WHEN Age BETWEEN 41 AND 50 THEN '41-50' \
                                             WHEN Age BETWEEN 51 AND 60 THEN '51-60' \
                                             ELSE '61-70' END AS Age_Group, \
                                             AVG(Purchase_Amount) AS Average_Purchase_Amount \
                                             FROM data_transformed \
                                             GROUP BY Age_Group"
).fetchall()
average_purchase_by_age_group

[('41-50', 493.946000289917),
 ('51-60', 494.51882250168745),
 ('18-30', 570.8131050899111),
 ('61-70', 533.576874256134),
 ('31-40', 60524.43027750651)]