# TravelTide

## Imports

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text
from datetime import datetime
from collections import defaultdict

## SQL Connection

#### Cohort Definition and Filtering

In [2]:
# Deine Verbindungszeichenfolge
connection_string = "postgresql://Test:bQNxVzJL4g6u@ep-noisy-flower-846766.us-east-2.aws.neon.tech/TravelTide"

# Engine erstellen
engine = create_engine(connection_string)

In [3]:
query = """
-- Aktive Nutzer mit mehr als 7 Sessions seit 4. Januar 2023
WITH valid_users AS (
  SELECT user_id
  FROM sessions
  WHERE session_start >= '2023-01-04'
  GROUP BY user_id
  HAVING COUNT(session_id) >= 3
)

SELECT *
FROM sessions
WHERE session_start >= '2023-01-04'
  AND user_id IN (SELECT user_id FROM valid_users);
"""
df_sessions = pd.read_sql_query(query, engine)

## Sessions

In [4]:
user_ids = df_sessions['user_id'].unique().tolist()
trip_ids = df_sessions['trip_id'].dropna().unique().tolist()

In [5]:
df_sessions["booking_made"] = df_sessions["trip_id"].notna().astype(int)

In [6]:
df_sessions["session_duration_min"] = ((df_sessions["session_end"] - df_sessions["session_start"]).dt.total_seconds() / 60).round(2)

## Users

In [7]:
def get_users_in_chunks(user_ids, engine, chunk_size=10000):
    all_users = []

    for i in range(0, len(user_ids), chunk_size):
        chunk = user_ids[i:i+chunk_size]
        query = text("""
            SELECT * FROM users
            WHERE user_id = ANY(:user_ids)
        """)
        df_chunk = pd.read_sql_query(query, engine, params={"user_ids": chunk})
        all_users.append(df_chunk)

    return pd.concat(all_users, ignore_index=True)

# Beispielnutzung:
df_users = get_users_in_chunks(user_ids, engine)

In [8]:
# Users Age
today = pd.to_datetime("today")
df_users["age"] = (today - pd.to_datetime(df_users["birthdate"])).dt.days // 365

In [29]:
# Alters-Buckets definieren
bins = [0, 17, 24, 34, 44, 54, 64, 120]
labels = [
    '0-17', 
    '18-24', 
    '25-34', 
    '35-44',
    '45-54',
    '55-64',
    '65+'
]

# Neue Spalte: Altersgruppe
df_users['age_bucket'] = pd.cut(df_users['age'], bins=bins, labels=labels)

In [10]:
df_users["home_country"] = df_users["home_country"].str.title()

In [11]:
df_users["home_city"] = df_users["home_city"].str.title()

In [34]:
df_users

Unnamed: 0,user_id,birthdate,gender,married,has_children,home_country,home_city,home_airport,home_airport_lat,home_airport_lon,sign_up_date,age,age_bucket
0,440,1967-01-26,M,False,False,Usa,Long Beach,LGB,33.818,-118.151,2021-04-17,58,55-64
1,564,1986-07-06,F,False,False,Usa,New York,LGA,40.777,-73.872,2021-04-19,38,35-44
2,1269,1991-08-21,F,False,False,Canada,Montreal,YMX,45.680,-74.039,2021-05-11,33,25-34
3,1279,1966-11-15,F,True,True,Usa,San Antonio,SAT,29.534,-98.470,2021-05-11,58,55-64
4,4145,1965-10-12,M,True,True,Canada,Quebec,YQB,46.788,-71.398,2021-06-02,59,55-64
...,...,...,...,...,...,...,...,...,...,...,...,...,...
611211,687002,1967-09-14,F,False,False,Usa,Chicago,MDW,41.786,-87.752,2023-03-23,57,55-64
611212,687342,2000-12-14,M,False,False,Canada,Calgary,YYC,51.114,-114.020,2023-03-23,24,18-24
611213,689837,2004-12-06,M,False,False,Usa,Atlanta,ATL,33.640,-84.427,2023-03-24,20,18-24
611214,690058,1994-07-19,F,False,True,Usa,Colorado Springs,COS,38.806,-104.700,2023-03-24,30,25-34


## Flights

In [12]:
def get_flight_in_chunks(trip_ids, engine, chunk_size=10000):
    all_flights = []

    for i in range(0, len(trip_ids), chunk_size):
        chunk = trip_ids[i:i+chunk_size]
        query = text("""
            SELECT * FROM flights
            WHERE trip_id = ANY(:trip_ids)
        """)
        df_chunk = pd.read_sql_query(query, engine, params={"trip_ids": chunk})
        all_flights.append(df_chunk)

    return pd.concat(all_flights, ignore_index=True)

# Beispielnutzung:
df_flights = get_flight_in_chunks(trip_ids, engine)

In [13]:
df_flights["destination"] = df_flights["destination"].str.title()

## Hotels

In [14]:
def get_hotel_in_chunks(trip_ids, engine, chunk_size=10000):
    all_hotels = []

    for i in range(0, len(trip_ids), chunk_size):
        chunk = trip_ids[i:i+chunk_size]
        query = text("""
            SELECT * FROM hotels
            WHERE trip_id = ANY(:trip_ids)
        """)
        df_chunk = pd.read_sql_query(query, engine, params={"trip_ids": chunk})
        all_hotels.append(df_chunk)

    return pd.concat(all_hotels, ignore_index=True)

# Beispielnutzung:
df_hotels = get_hotel_in_chunks(trip_ids, engine)

In [15]:
# Splittet an ' - ', max 1 Split
df_hotels[['hotel_name', 'location']] = df_hotels['hotel_name'].str.split(' - ', n=1, expand=True)

In [16]:
df_hotels["location"] = df_hotels["location"].str.title()

In [17]:
df_hotels = df_hotels.loc[df_hotels["nights"] >= 0].copy()

## Aggregation Tables

#### Sessions

In [35]:
df_sessions.head(1)

Unnamed: 0,session_id,user_id,trip_id,session_start,session_end,flight_discount,hotel_discount,flight_discount_amount,hotel_discount_amount,flight_booked,hotel_booked,page_clicks,cancellation,booking_made,session_duration_min
0,73956-cfd4601ebfea4c198cd738d43cdc848f,73956,,2023-03-29 12:26:00,2023-03-29 12:27:07,True,False,0.1,,False,False,9,False,0,1.12


In [49]:
# Annahme: Dein Session-DataFrame heißt df_sessions
# Wichtige Spalten: "user_id", "session_id", "session_duration", "booking_made" (o.Ä.)

session_agg = (
    df_sessions.groupby("user_id")
    .agg(
        session_count=("session_id", "count"),           # Wieviele Sessions
        booking_count=("trip_id", "count"),               # Wieviele Buchungen
        cancellations=("cancellation", lambda x: (x == True).sum()),         # Wieviele Stornierungen
        avg_session_duration_min=("session_duration_min", "mean"),  # Durchschnittliche Session-Dauer
        booking_conversion_rate=("booking_made", "mean")  # Anteil der Sessions mit Buchung
    )
    .reset_index()
)

# Schön runden
session_agg["avg_session_duration_min"] = session_agg["avg_session_duration_min"].round(2)
session_agg["booking_conversion_rate"] = ((session_agg["booking_conversion_rate"])*100).round(2)
# Neue Spalte: Storno-Rate
session_agg["cancellation_rate"] = ((session_agg["cancellations"] / session_agg["booking_count"])*100).round(2)

# Achtung auf Division durch 0:
session_agg["cancellation_rate"] = session_agg["cancellation_rate"].fillna(0)

session_agg.head()

Unnamed: 0,user_id,session_count,booking_count,cancellations,avg_session_duration_min,booking_conversion_rate,cancellation_rate
0,55,4,3,1,19.46,75.0,33.33
1,57,3,1,0,1.19,33.33,0.0
2,62,3,1,0,2.12,33.33,0.0
3,71,3,2,0,2.07,66.67,0.0
4,85,3,0,0,1.02,0.0,0.0


#### Flights 

In [20]:
# Merge flights und sessions, um user_id zu bekommen
df_flights_merged = df_flights.merge(df_sessions[["trip_id", "user_id"]], on="trip_id", how="left")

In [21]:
flight_agg = df_flights_merged.groupby("user_id").agg(
    flight_booking_count=("trip_id", "count"),
    total_flight_spent=("base_fare_usd", "sum"),
    avg_seats_booked=("seats", "mean")
).reset_index()

flight_agg["total_flight_spent"] = flight_agg["total_flight_spent"].round(2)
flight_agg["avg_seats_booked"] = flight_agg["avg_seats_booked"].round(2)

flight_agg.head()

Unnamed: 0,user_id,flight_booking_count,total_flight_spent,avg_seats_booked
0,55,3,4141.16,1.67
1,57,1,184.81,1.0
2,62,1,753.95,1.0
3,71,2,767.51,1.0
4,88,3,2878.04,1.33


#### Hotels

In [27]:
df_hotels_merged = df_hotels.merge(df_sessions[["trip_id", "user_id"]], on="trip_id", how="left")

In [28]:
hotel_agg = df_hotels_merged.groupby("user_id").agg(
    hotels_booking_count=("trip_id", "count"),
    total_hotel_spent=("hotel_per_room_usd", "sum"),
    avg_nights_booked=("nights", "mean")
).reset_index()

hotel_agg["avg_nights_booked"] = hotel_agg["avg_nights_booked"].round(2)
hotel_agg["total_hotel_spent"] = hotel_agg["total_hotel_spent"].round(2)

hotel_agg.head()

Unnamed: 0,user_id,hotels_booking_count,total_hotel_spent,avg_nights_booked
0,55,1,308.0,4.0
1,57,1,97.0,0.0
2,62,1,94.0,1.0
3,71,1,68.0,9.0
4,88,4,1462.0,2.5


### .CSV speichern

In [50]:
# Beispiel: DataFrame speichern
df_sessions.to_csv('df_sessions.csv', index=False)
df_flights.to_csv('df_flights.csv', index=False)
df_hotels.to_csv('df_hotels.csv', index=False)
df_users.to_csv('df_users.csv', index=False)

# Wenn du schon aggregierte User-Features hast:
session_agg.to_csv('sessions_features.csv', index=False)
flight_agg.to_csv('flights_features.csv', index=False)
hotel_agg.to_csv('hotels_features.csv', index=False)