In [None]:
from ipywidgets import interact 
import ipywidgets as widgets
from sqlalchemy import create_engine

%load_ext sql

In [None]:
engine = create_engine('postgresql://postgres:pgadmin@localhost/postgres')
%sql postgresql://postgres:pgadmin@localhost/postgres

In [None]:
%config SqlMagic.autocommit=False

In [None]:
%%sql 
COMMIT;
DROP DATABASE IF EXISTS airbnb_icmc_2 WITH (FORCE);
COMMIT;
CREATE DATABASE airbnb_icmc_2
    WITH OWNER = postgres
    ENCODING = 'UTF8';
COMMIT;

In [None]:
%config SqlMagic.autocommit=True

In [None]:
engine = create_engine('postgresql://postgres:pgadmin@localhost/airbnb_icmc_2')
%sql postgresql://postgres:pgadmin@localhost/airbnb_icmc_2

In [None]:
%%sql
DROP TABLE IF EXISTS Listings, Reviews, Calendar CASCADE;

# Inserindo dados nas tabelas do airbnb

In [None]:
%%sql
DROP TYPE IF EXISTS bool_sigla CASCADE;
CREATE TYPE bool_sigla AS ENUM {'t', 'f'}

In [None]:
%%sql
DROP TABLE IF EXISTS Listings CASCADE;
CREATE TABLE Listings (
    id BIGINT PRIMARY KEY,
    listing_url TEXT UNIQUE NOT NULL,
    scrape_id BIGINT NOT NULL, -- erro ao ser unique
    last_scraped DATE,
    source TEXT,
    name TEXT,
    description TEXT,
    neighborhood_overview TEXT,
    picture_url TEXT,
    host_id BIGINT NOT NULL, -- erro ao ser unique
    host_url TEXT NOT NULL, -- erro ao ser unique
    host_name TEXT,
    host_since DATE,
    host_location TEXT,
    host_about TEXT,
    host_response_time TEXT,
    host_response_rate TEXT,
    host_acceptance_rate TEXT,
    host_is_superhost bool_sigla,
    host_thumbnail_url TEXT,
    host_picture_url TEXT,
    host_neighbourhood TEXT,
    host_listings_count INT,
    host_total_listings_count INT,
    host_verifications TEXT,
    host_has_profile_pic BOOLEAN,
    host_identity_verified BOOLEAN,
    neighbourhood TEXT,
    neighbourhood_cleansed TEXT,
    neighbourhood_group_cleansed TEXT,
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    property_type TEXT,
    room_type TEXT,
    accommodates INT,
    bathrooms INT,
    bathrooms_text TEXT,
    bedrooms INT,
    beds INT,
    amenities TEXT,
    price TEXT,
    minimum_nights INT,
    maximum_nights INT,
    minimum_minimum_nights INT,
    maximum_minimum_nights INT,
    minimum_maximum_nights INT,
    maximum_maximum_nights INT,
    minimum_nights_avg_ntm DOUBLE PRECISION,
    maximum_nights_avg_ntm DOUBLE PRECISION,
    calendar_updated TEXT,
    has_availability bool_sigla,
    availability_30 INT,
    availability_60 INT,
    availability_90 INT,
    availability_365 INT,
    calendar_last_scraped DATE,
    number_of_reviews INT,
    number_of_reviews_ltm INT,
    number_of_reviews_l30d INT,
    first_review DATE,
    last_review DATE,
    review_scores_rating DOUBLE PRECISION,
    review_scores_accuracy DOUBLE PRECISION,
    review_scores_cleanliness DOUBLE PRECISION,
    review_scores_checkin DOUBLE PRECISION,
    review_scores_communication DOUBLE PRECISION,
    review_scores_location DOUBLE PRECISION,
    review_scores_value DOUBLE PRECISION,
    license TEXT,
    instant_bookable bool_sigla,
    calculated_host_listings_count INT,
    calculated_host_listings_count_entire_homes INT,
    calculated_host_listings_count_private_rooms INT,
    calculated_host_listings_count_shared_rooms INT,
    reviews_per_month DOUBLE PRECISION
);

In [None]:
%%sql
DROP TABLE IF EXISTS Reviews CASCADE;
CREATE TABLE Reviews (
    id BIGINT NOT NULL,
    listing_id BIGINT,
    date DATE,
    reviewer_id BIGINT,
    reviewer_name TEXT,
    comments TEXT
);

In [None]:
%%sql
DROP TABLE IF EXISTS Calendar CASCADE;
CREATE TABLE Calendar (
    listing_id BIGINT,
    date DATE,
    available bool_sigla,
    price TEXT,
    adjusted_price TEXT,
    minimum_nights INT,
    maximum_nights INT,
    PRIMARY KEY (listing_id, date),
    FOREIGN KEY (listing_id) REFERENCES Listings(id)
);

In [None]:
!psql -c "\copy Listings FROM './listings.csv' WITH (FORMAT CSV, DELIMITER ',', ENCODING 'UTF8', NULL '', QUOTE '\"', HEADER true);" -U postgres -d airbnb_icmc_2

In [None]:
!psql -c "\copy Reviews FROM './reviews.csv' WITH (FORMAT CSV, DELIMITER ',', ENCODING 'UTF8', NULL '', QUOTE '\"', HEADER true);" -U postgres -d airbnb_icmc_2

In [None]:
!psql -c "\copy Calendar FROM './calendar.csv' WITH (FORMAT CSV, DELIMITER ',', ENCODING 'UTF8', NULL '', QUOTE '\"', HEADER true);" -U postgres -d airbnb_icmc_2

# Normalizando as tabelas do airbnb

In [151]:
%%sql
DROP TABLE IF EXISTS Listings_norm CASCADE;
CREATE TABLE Listings_norm AS (
    SELECT
        id,
        listing_url,
        name,
        description,
        neighborhood_overview,
        picture_url,
        neighbourhood,
        neighbourhood_cleansed,
        neighbourhood_group_cleansed,
        latitude,
        longitude,
        property_type,
        room_type,
        accommodates,
        bathrooms,
        bathrooms_text,
        bedrooms,
        beds,
        price,
        minimum_nights,
        maximum_nights,
        minimum_minimum_nights,
        maximum_minimum_nights,
        minimum_maximum_nights,
        maximum_maximum_nights,
        minimum_nights_avg_ntm,
        maximum_nights_avg_ntm,
        calendar_updated,
        has_availability,
        availability_30,
        availability_60,
        availability_90,
        availability_365,
        calendar_last_scraped,
        number_of_reviews,
        number_of_reviews_ltm,
        number_of_reviews_l30d,
        first_review,
        last_review,
        review_scores_rating,
        review_scores_accuracy,
        review_scores_cleanliness,
        review_scores_checkin,
        review_scores_communication,
        review_scores_location,
        review_scores_value,
        license,
        instant_bookable,
        reviews_per_month
    FROM Listings
)
WITH NO DATA;

ALTER TABLE Listings_norm
    ADD PRIMARY KEY (id),
    ADD CONSTRAINT listing_url_unique UNIQUE (listing_url),
    ALTER COLUMN listing_url SET NOT NULL
;

INSERT INTO Listings_norm
    SELECT 
        id,
        listing_url,
        name,
        description,
        neighborhood_overview,
        picture_url,
        neighbourhood,
        neighbourhood_cleansed,
        neighbourhood_group_cleansed,
        latitude,
        longitude,
        property_type,
        room_type,
        accommodates,
        bathrooms,
        bathrooms_text,
        bedrooms,
        beds,
        price,
        minimum_nights,
        maximum_nights,
        minimum_minimum_nights,
        maximum_minimum_nights,
        minimum_maximum_nights,
        maximum_maximum_nights,
        minimum_nights_avg_ntm,
        maximum_nights_avg_ntm,
        calendar_updated,
        has_availability,
        availability_30,
        availability_60,
        availability_90,
        availability_365,
        calendar_last_scraped,
        number_of_reviews,
        number_of_reviews_ltm,
        number_of_reviews_l30d,
        first_review,
        last_review,
        review_scores_rating,
        review_scores_accuracy,
        review_scores_cleanliness,
        review_scores_checkin,
        review_scores_communication,
        review_scores_location,
        review_scores_value,
        license,
        instant_bookable,
        reviews_per_month
    FROM Listings
;

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
Done.
Done.
Done.
36008 rows affected.


[]

In [129]:
%%sql
SELECT *
FROM Listings_norm
LIMIT 3

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
3 rows affected.


id,listing_url,name,description,neighborhood_overview,picture_url,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,reviews_per_month
17878,https://www.airbnb.com/rooms/17878,Condo in Rio de Janeiro · ★4.70 · 2 bedrooms · 2 beds · 1 bath,,"This is the one of the bests spots in Rio. Because of the large balcony and proximity to the beach, it has huge advantages in the current situation.",https://a0.muscache.com/pictures/65320518/30698f38_original.jpg,"Rio de Janeiro, Brazil",Copacabana,,-22.96599,-43.1794,Entire condo,Entire home/apt,5,,1 bath,,2,"$1,357.00",5,28,5,5,28,28,5.0,28.0,,t,5,7,14,269,2023-12-27,311,29,4,2010-07-15,2023-12-22,4.7,4.77,4.65,4.83,4.91,4.77,4.67,,f,1.9
25026,https://www.airbnb.com/rooms/25026,Rental unit in Rio de Janeiro · ★4.72 · 1 bedroom · 1 bed · 1 bath,,"Copacabana is a lively neighborhood and the apartment is located very close to an area in Copa full of bars, cafes and restaurants at Rua Bolivar and Domingos Ferreira. Copacabana never sleeps, there is always movement and it's a great mix of all kinds of people.",https://a0.muscache.com/pictures/a745aa21-b8dd-4959-a040-eb8e6e6f07ee.jpg,"Rio de Janeiro, Brazil",Copacabana,,-22.97735,-43.19105,Entire rental unit,Entire home/apt,3,,1 bath,,1,$865.00,2,60,2,4,60,60,2.2,60.0,,t,3,18,48,228,2023-12-27,275,29,2,2010-06-07,2023-12-03,4.72,4.7,4.79,4.81,4.92,4.84,4.6,,f,1.67
35764,https://www.airbnb.com/rooms/35764,Loft in Rio de Janeiro · ★4.90 · 1 bedroom · 1 bed · 1.5 baths,,"Our guests will experience living with a local peole ""Carioca"" in a very friendly building with 24 hours a day security with all kind of stores, banks, transports, restaurants.",https://a0.muscache.com/pictures/23782972/1d3e55b0_original.jpg,"Rio de Janeiro, Brazil",Copacabana,,-22.98107,-43.19136,Entire loft,Entire home/apt,2,,1.5 baths,,1,$373.00,3,15,1,6,7,15,3.1,14.7,,t,4,9,12,62,2023-12-27,454,36,2,2010-10-03,2023-12-17,4.9,4.93,4.93,4.97,4.95,4.94,4.89,,f,2.82


In [154]:
%%sql
DROP TABLE IF EXISTS Scrape CASCADE;
CREATE TABLE Scrape AS (
    SELECT
        scrape_id,
        source,
        last_scraped,
        calendar_last_scraped
    FROM Listings
)
WITH NO DATA;

-- existe apenas 1 scrape id, entao perderiamos muita informacao
-- se colocarmos restricao de chave primaria
ALTER TABLE Scrape
    ALTER COLUMN scrape_id SET NOT NULL
;

INSERT INTO Scrape
    SELECT
        scrape_id,
        source,
        last_scraped,
        calendar_last_scraped
    FROM Listings
; 

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
Done.
Done.
Done.
36008 rows affected.


[]

In [153]:
%%sql
SELECT *
FROM Scrape
LIMIT 3

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
1 rows affected.


scrape_id,source,last_scraped,calendar_last_scraped
20231226034138,city scrape,2023-12-27,2023-12-27


In [None]:
-- Dados multivalorados, utilizar funcoes

%%sql
DROP TABLE IF EXISTS Amenities CASCADE;
CREATE TABLE Amenities AS (
    SELECT
        listing_id,
        name_amenity,
        quantity
    FROM Listings
);

In [None]:
%%sql
SELECT *
FROM Amenities
LIMIT 5

In [156]:
-- Estamos utilizando a restricao de chave para nao obtermos dados distorcidos nas consultas,
-- dessa forma, estamos descartando as tuplas que possuem host_id que ja foi inserido

%%sql
DROP TABLE IF EXISTS Host CASCADE;
CREATE TABLE Host AS (
    SELECT
        host_id,
        host_url,
        host_name,
        host_since,
        host_location,
        host_about,
        host_response_time,
        host_response_rate,
        host_acceptance_rate,
        host_is_superhost,
        host_thumbnail_url,
        host_picture_url,
        host_neighbourhood,
        host_listings_count,
        host_total_listings_count,
        host_has_profile_pic,
        host_identity_verified,
        calculated_host_listings_count,
        calculated_host_listings_count_entire_homes,
        calculated_host_listings_count_private_rooms,
        calculated_host_listings_count_shared_rooms
    FROM
        Listings
)
WITH NO DATA;

ALTER TABLE Host
    ADD PRIMARY KEY (host_id),
    ALTER COLUMN host_url SET NOT NULL
;

INSERT INTO Host
    SELECT
        host_id,
        host_url,
        host_name,
        host_since,
        host_location,
        host_about,
        host_response_time,
        host_response_rate,
        host_acceptance_rate,
        host_is_superhost,
        host_thumbnail_url,
        host_picture_url,
        host_neighbourhood,
        host_listings_count,
        host_total_listings_count,
        host_has_profile_pic,
        host_identity_verified,
        calculated_host_listings_count,
        calculated_host_listings_count_entire_homes,
        calculated_host_listings_count_private_rooms,
        calculated_host_listings_count_shared_rooms
    FROM
        Listings
ON CONFLICT (host_id) DO NOTHING
;

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
Done.
Done.
Done.
21980 rows affected.


[]

In [157]:
%%sql
SELECT *
FROM Host
LIMIT 3

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
3 rows affected.


host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
68997,https://www.airbnb.com/users/show/68997,Matthias,2010-01-08,"Rio de Janeiro, Brazil","I am a journalist/writer. Lived in NYC for 15 years. I am now based in Rio and published 3 volumes of travel stories on AMAZ0N: ""The World Is My Oyster"". If you have never been to Rio, check out the first story, and you'll get an idea. Apart from Rio, you'll find 29 other travel stories from all around the globe.",within an hour,100%,96%,t,https://a0.muscache.com/im/pictures/user/67b13cea-8c11-49c0-a08d-7f42c330676e.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/67b13cea-8c11-49c0-a08d-7f42c330676e.jpg?aki_policy=profile_x_medium,Copacabana,2,5,True,True,1,1,0,0
102840,https://www.airbnb.com/users/show/102840,Viviane,2010-04-03,"Rio de Janeiro, Brazil","Hi guys, Viviane is a commercial photographer, an avid world traveler, (a former photographer for Airbnb) and an Airbnb superhost. And a free lance photographer for other wonderful clients. She loves life and meeting people. We work together in providing the best accommodation to people and we are firm believers of enjoying the moment as a prime attitude towards life!",within an hour,100%,80%,t,https://a0.muscache.com/im/pictures/user/315ddc81-bea3-4bf0-8fc7-be197a6541ff.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/315ddc81-bea3-4bf0-8fc7-be197a6541ff.jpg?aki_policy=profile_x_medium,Copacabana,1,5,True,True,1,1,0,0
153691,https://www.airbnb.com/users/show/153691,Patricia Miranda & Paulo,2010-06-27,"Rio de Janeiro, Brazil","Hello, We are Patricia Miranda and Paulo. We are a couple who love to meet new people, new cultures, we both are very easy going persons, We are retired after working for several years in tourism and an international airline company. We also used do host in our own residence International students from all over the world. We are gay friendly and everybody is welcome! !",within an hour,100%,98%,t,https://a0.muscache.com/im/users/153691/profile_pic/1277774787/original.jpg?aki_policy=profile_small,https://a0.muscache.com/im/users/153691/profile_pic/1277774787/original.jpg?aki_policy=profile_x_medium,Copacabana,1,2,True,True,1,1,0,0


In [None]:
-- multivalorado
%%sql
DROP TABLE IF EXISTS Host_verifications CASCADE;
CREATE TABLE Host_verifications AS (
    host_id,
    verification_field
);

In [158]:
-- Estamos utilizando a restricao de chave para nao obtermos dados distorcidos nas consultas,
-- dessa forma, estamos descartando as tuplas que possuem reviewer_id que ja foi inserido

%%sql
DROP TABLE IF EXISTS Reviewer CASCADE;
CREATE TABLE Reviewer AS (
    SELECT
        reviewer_id,
        reviewer_name
    FROM
        Reviews
)
WITH NO DATA;

ALTER TABLE Reviewer
    ADD PRIMARY KEY (reviewer_id)
;

INSERT INTO Reviewer
    SELECT
        reviewer_id,
        reviewer_name
    FROM
        Reviews
ON CONFLICT (reviewer_id) DO NOTHING
;

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
Done.
Done.
Done.
563587 rows affected.


[]

In [139]:
%%sql
SELECT *
FROM Reviewer
LIMIT 3

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
3 rows affected.


reviewer_id,reviewer_name
135370,Tia
2598011,Evelyn
2465739,Camila


In [159]:
-- Estamos utilizando a restricao de chave para nao obtermos dados distorcidos nas consultas,
-- dessa forma, estamos descartando as tuplas que possuem id que ja foi inserido

%%sql
DROP TABLE IF EXISTS Reviews_norm CASCADE;
CREATE TABLE Reviews_norm AS (
    SELECT
        id,
        listing_id,
        reviewer_id,
        date,
        comments
    FROM
        Reviews
)
WITH NO DATA;

ALTER TABLE Reviews_norm
    ADD PRIMARY KEY (id),
    ALTER COLUMN listing_id SET NOT NULL,
    ALTER COLUMN reviewer_id SET NOT NULL
;

INSERT INTO Reviews_norm
    SELECT
        id,
        listing_id,
        reviewer_id,
        date,
        comments
    FROM
        Reviews
ON CONFLICT (id) DO NOTHING
;

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
Done.
Done.
Done.
25706 rows affected.


[]

In [None]:
%%sql
SELECT *
FROM Reviews_norm
LIMIT 5

In [142]:
%%sql
DROP TABLE IF EXISTS Calendar_norm CASCADE;
CREATE TABLE Calendar_norm AS (
    SELECT
        listing_id,
        date,
        available,
        price,
        adjusted_price
    FROM
        Calendar
)
WITH NO DATA;

ALTER TABLE Calendar_norm
    ADD PRIMARY KEY (listing_id, date)
;

INSERT INTO Calendar_norm
    SELECT
        listing_id,
        date,
        available,
        price,
        adjusted_price
    FROM
        Calendar
;


 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
Done.
Done.
Done.
13145595 rows affected.


[]

In [143]:
%%sql
SELECT *
FROM Calendar_norm
LIMIT 3

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
3 rows affected.


listing_id,date,available,price,adjusted_price
17878,2023-12-27,f,$350.00,
17878,2023-12-28,f,$350.00,
17878,2023-12-29,f,$350.00,


In [150]:
%%sql
DROP TABLE IF EXISTS Info_listings CASCADE;
CREATE TABLE Info_listings AS (
    SELECT
        listing_id,
        minimum_nights,
        maximum_nights
    FROM
        Calendar
)
WITH NO DATA;

ALTER TABLE Info_listings
    ADD CONSTRAINT listing_id_unique UNIQUE (listing_id)
;

INSERT INTO Info_listings
    SELECT
        listing_id,
        minimum_nights,
        maximum_nights
    FROM
        Calendar
ON CONFLICT (listing_id) DO NOTHING
;

 * postgresql://postgres:***@localhost/airbnb_icmc_2
   postgresql://postgres:***@localhost/postgres
Done.
Done.
Done.
36008 rows affected.


[]

In [None]:
%%sql
SELECT *
FROM Info_listings
LIMIT 5