In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import os

# Import the tpot regressor
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Locations are resolved relative to the directory the notebook is started from
PROJECT_DIRECTORY = os.getcwd()
DATA_DIRECTORY = os.path.join(PROJECT_DIRECTORY, 'data')
# Training data lives directly inside the data directory
DATA_CSV = os.path.join(PROJECT_DIRECTORY, 'data', "summer_training_data.csv")

In [3]:
# Load the training data from the CSV path held in DATA_CSV into a DataFrame
data = pd.read_csv(DATA_CSV)

In [4]:
## Convert to dates
data['travel_date'] = pd.to_datetime(data['travel_date'], format='%Y/%m/%d')
data['booking_date'] = pd.to_datetime(data['booking_date'], format='%Y/%m/%d')

## Drop what we don't need
# NOTE(review): rows with missing values are removed *before* the unused
# columns are dropped, so a row whose only gap is in one of the discarded
# columns is lost as well. Kept as-is to leave the training set unchanged;
# confirm this is intended.
unused_columns = [
    'accomodation',
    'accom_location',
    'destination',
    'accom_id',
    'departure_airport',
    'accom_type',
    'accom_board_basis',
]
data = data.dropna().drop(columns=unused_columns)

## Add Weeks, important for seasonality
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0; the ISO
# calendar accessor yields the same week numbers. It returns a nullable
# UInt32 column, so cast back to int64 (safe: NaT rows were dropped above)
# to keep the frame purely numeric when it is later converted to a NumPy array.
data['travel_week'] = data['travel_date'].dt.isocalendar().week.astype('int64')
data['booking_week'] = data['booking_date'].dt.isocalendar().week.astype('int64')

# The raw dates are no longer needed once the week numbers are extracted
data = data.drop(columns=['travel_date', 'booking_date'])

## Re-Index
# Rows were removed by dropna, so rebuild a contiguous 0..n-1 index
data = data.reset_index(drop=True)

In [5]:
# Pull the target column out of the frame (pop removes it from `data` in place)
y = np.array(data.pop('price_per_person'))

# Every remaining column is used as a feature
X = np.array(data)

# Hold out 33% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Settings for the TPOT regressor, collected in one place so they are easy to tune
tpot_settings = {
    'scoring': 'r2',
    'max_time_mins': 480,
    'n_jobs': -1,
    'verbosity': 2,
    'cv': 5,
    'generations': 100,
    'population_size': 100,
    'random_state': 42,
    'warm_start': True,
}

# Create the tpot object from those settings
tpot = TPOTRegressor(**tpot_settings)

In [None]:
# Fit the tpot model on the training data.
# The `tpot` object was created with max_time_mins = 480, so this cell may
# run for a long time before returning.
tpot.fit(X_train, y_train)