# Data Processing
### Goal: Transform raw data into a relational SQLite database with at least two linked tables

##### *Author - Srishti sahu*

In [22]:
import pandas as pd
import sqlite3
from pathlib import Path

# Load the raw export exactly as it is stored on disk.
raw_path = Path("../data/raw.csv")
df_raw = pd.read_csv(raw_path)

# Columns of the raw file that are discarded, and the names given (by position)
# to the six columns that remain after the drop.
unused_columns = ["Unnamed: 0", "Unnamed: 7", "Unnamed: 8"]
cleaned_column_names = [
    "artist",
    "painting_name",
    "colourfulness",
    "brightness",
    "size",
    "people_near",
]

df_cleaned = df_raw.drop(columns=unused_columns)
df_cleaned.columns = cleaned_column_names

# Surrogate key: 1-based running number placed as the first column.
n_paintings = len(df_cleaned)
df_cleaned.insert(loc=0, column="painting_id", value=range(1, n_paintings + 1))

# Code -> description pairs for every coded column of the paintings data,
# grouped by the attribute they belong to. Brightness values are stored
# spelled out, so their code and description are the same string.
_codes_by_attribute = {
    "Size": [("S", "Small"), ("M", "Medium"), ("L", "Large")],
    "Colourfulness": [("L", "Low"), ("M", "Medium"), ("H", "High")],
    "Brightness": [("Bright", "Bright"), ("Moderate", "Moderate"), ("Dull", "Dull")],
}

# Flatten to one (attribute_type, code, description) row per code,
# keeping the attribute order given above.
attribute_entries = [
    (attribute_type, code, description)
    for attribute_type, code_pairs in _codes_by_attribute.items()
    for code, description in code_pairs
]

df_attributes = pd.DataFrame(
    data=attribute_entries,
    columns=["attribute_type", "code", "description"],
)

# Build the SQLite database holding the cleaned paintings and the attribute
# lookup table.
# NOTE(review): the schema declares no FOREIGN KEY, so the link between the two
# tables is logical only — the codes in paintings.size / colourfulness /
# brightness are meant to match attributes(attribute_type, code).
db_path = Path("../data/database.db")
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Drop first so that re-running the cell rebuilds both tables from scratch
    # instead of failing on CREATE TABLE or appending duplicate rows.
    cursor.execute("DROP TABLE IF EXISTS paintings;")
    cursor.execute("DROP TABLE IF EXISTS attributes;")

    cursor.execute("""
CREATE TABLE paintings (
    painting_id INTEGER PRIMARY KEY,
    artist TEXT,
    painting_name TEXT,
    colourfulness TEXT,
    brightness TEXT,
    size TEXT,
    people_near INTEGER
);
""")

    cursor.execute("""
CREATE TABLE attributes (
    attribute_type TEXT,
    code TEXT,
    description TEXT,
    PRIMARY KEY (attribute_type, code)
);
""")

    # "append" inserts into the tables created above, so the declared column
    # types and primary keys are the ones that end up in the database.
    df_cleaned.to_sql("paintings", conn, if_exists="append", index=False)
    df_attributes.to_sql("attributes", conn, if_exists="append", index=False)

    conn.commit()
finally:
    # Close the connection even when a statement above raises; otherwise the
    # kernel keeps the database file open until it is restarted.
    conn.close()

# Report what was loaded, one message per line.
summary_lines = (
    "Database created successfully!",
    f"Total paintings: {len(df_cleaned)}",
    f"Total attribute mappings: {len(df_attributes)}",
)
print(*summary_lines, sep="\n")

Database created successfully!
Total paintings: 75
Total attribute mappings: 9


s10 - Preview of the cleaned paintings data and the attribute lookup table

In [23]:
# Show where the database was written, then the first rows of the paintings
# data followed by the full attribute lookup table.
print("✅ Database with schema created at:", db_path)
display(df_cleaned.head(), df_attributes)

✅ Database with schema created at: ../data/database.db


Unnamed: 0,painting_id,artist,painting_name,colourfulness,brightness,size,people_near
0,1,Paolo veronese,Scorn,M,Dull,L,1
1,2,Paris Bordone,A pair of lovers,M,Bright,M,1
2,3,Paolo veronese,The family of Darius before Alexander,H,Moderate,L,5
3,4,Jacopo Tintoretto,The origin of Milky Way,H,Bright,L,3
4,5,Jacopo Tintoretto,Saint George and the dragon,H,Moderate,M,2


Unnamed: 0,attribute_type,code,description
0,Size,S,Small
1,Size,M,Medium
2,Size,L,Large
3,Colourfulness,L,Low
4,Colourfulness,M,Medium
5,Colourfulness,H,High
6,Brightness,Bright,Bright
7,Brightness,Moderate,Moderate
8,Brightness,Dull,Dull
