Créer les dimensions

In [1]:
import sqlite3
import pandas as pd

# Connexion DB
conn = sqlite3.connect("../data/processed/airbnb_analysis.db")

# Charger dataset propre
df = pd.read_csv("../data/processed/listings_clean.csv")

# ---------------------------
# 1️⃣ Dimension City
# ---------------------------
dim_city = df[["city"]].drop_duplicates().reset_index(drop=True)
dim_city["city_id"] = dim_city.index + 1

dim_city.to_sql("dim_city", conn, if_exists="replace", index=False)

# ---------------------------
# 2️⃣ Dimension Room Type
# ---------------------------
dim_room = df[["room_type"]].drop_duplicates().reset_index(drop=True)
dim_room["room_type_id"] = dim_room.index + 1

dim_room.to_sql("dim_room_type", conn, if_exists="replace", index=False)

# ---------------------------
# 3️⃣ Dimension Neighbourhood
# ---------------------------
dim_neigh = df[["neighbourhood", "neighbourhood_group"]].drop_duplicates().reset_index(drop=True)
dim_neigh["neighbourhood_id"] = dim_neigh.index + 1

dim_neigh.to_sql("dim_neighbourhood", conn, if_exists="replace", index=False)

print("Dimensions créées.")


Dimensions créées.


Créer la table FACT

In [2]:
# Merge pour récupérer les ID

df = df.merge(dim_city, on="city")
df = df.merge(dim_room, on="room_type")
df = df.merge(dim_neigh, on=["neighbourhood", "neighbourhood_group"])

fact_listings = df[[
    "id",
    "city_id",
    "room_type_id",
    "neighbourhood_id",
    "price",
    "availability_365",
    "minimum_nights",
    "number_of_reviews",
    "latitude",
    "longitude"
]]

fact_listings.rename(columns={"id": "listing_id"}, inplace=True)

fact_listings.to_sql("fact_listings", conn, if_exists="replace", index=False)

print("Table fact_listings créée.")


Table fact_listings créée.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fact_listings.rename(columns={"id": "listing_id"}, inplace=True)


In [3]:
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)


Unnamed: 0,name
0,listings
1,dim_city
2,dim_room_type
3,dim_neighbourhood
4,fact_listings


Vérifier que la base + schéma étoile sont OK

1) Lister les tables existantes

In [4]:
import sqlite3
import pandas as pd

conn = sqlite3.connect("../data/processed/airbnb_analysis.db")

pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", conn)


Unnamed: 0,name
0,dim_city
1,dim_neighbourhood
2,dim_room_type
3,fact_listings
4,listings


Vérifier le nombre de lignes de chaque table

In [5]:
tables = ["listings", "dim_city", "dim_room_type", "dim_neighbourhood", "fact_listings"]

for t in tables:
    try:
        n = pd.read_sql(f"SELECT COUNT(*) AS n FROM {t};", conn)["n"][0]
        print(t, "=>", n)
    except Exception as e:
        print(t, "=> NOT FOUND / ERROR:", e)


listings => 33887
dim_city => 2
dim_room_type => 4
dim_neighbourhood => 199
fact_listings => 33887


3) Vérifier une jointure fact ↔ dims

Afficher les colonnes exactes de dim_city

In [7]:
import pandas as pd

pd.read_sql("PRAGMA table_info(dim_city);", conn)


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,city,TEXT,0,,0
1,1,city_id,INTEGER,0,,0


Afficher aussi les colonnes de dim_room_type

In [8]:
pd.read_sql("PRAGMA table_info(dim_room_type);", conn)


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,room_type,TEXT,0,,0
1,1,room_type_id,INTEGER,0,,0


In [9]:
query = """
SELECT 
    f.price,
    c.city,
    r.room_type
FROM fact_listings f
JOIN dim_city c 
    ON f.city_id = c.city_id
JOIN dim_room_type r 
    ON f.room_type_id = r.room_type_id
LIMIT 10;
"""

pd.read_sql(query, conn)


Unnamed: 0,price,city,room_type
0,157.0,Madrid,Entire home/apt
1,143.0,Madrid,Entire home/apt
2,65.0,Madrid,Private room
3,116.0,Madrid,Entire home/apt
4,79.0,Madrid,Entire home/apt
5,300.0,Madrid,Entire home/apt
6,166.0,Madrid,Entire home/apt
7,45.0,Madrid,Private room
8,24.0,Madrid,Private room
9,90.0,Madrid,Private room
