# Optimal Price Detection

In [None]:
import pandas as pd
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### Understanding the data

In [1]:
datac_path = 'data.csv'
benchmark_data_path = 'benchmark_data.csv'

datac = pd.read_csv(datac_path)
benchmark_data = pd.read_csv(benchmark_data_path)

datac_info = datac.head(), datac.describe(), datac.dtypes
benchmark_data_info = benchmark_data.head(), benchmark_data.describe(), benchmark_data.dtypes

datac_info, benchmark_data_info

((   Long  Lat    Size  Light               Price_Booking1  \
  0     5  -54  medium      0  (25.82068271676904, 0.7519)   
  1    15   34   large      1   (60.5614418412558, 0.0441)   
  2    52   33   small      1  (64.77841886497515, 0.0978)   
  3    36  -32   small      0  (43.51319658191266, 0.5933)   
  4    29   49  medium      0  (9.023792361018867, 0.9172)   
  
                   Price_Booking2                Price_Booking3  \
  0   (77.29030734504285, 0.1281)   (26.42977169141242, 0.7555)   
  1   (44.09864114154084, 0.1872)  (12.864575260713796, 0.8316)   
  2   (63.79016149960564, 0.1055)  (4.2689974099545775, 0.9454)   
  3  (20.583513389202647, 0.8548)  (51.335440281471314, 0.5031)   
  4   (34.06932873577466, 0.6672)  (42.200420426573835, 0.5574)   
  
                  Price_Booking4                Price_Booking5  \
  0  (62.99594808649998, 0.2545)  (29.367952575023835, 0.7252)   
  1  (22.15931742842302, 0.6385)   (66.75211626816896, 0.0236)   
  2   (99.117023899159

### Cleaning the data

In [2]:
def parse_price_booking(data, column_name):
    """ Parse the price and booking rate from the tuple stored as a string in the specified column. """
    data[f"{column_name}_price"], data[f"{column_name}_booking_rate"] = zip(
        *data[column_name].apply(ast.literal_eval))
    return data

datac = parse_price_booking(datac, 'Price_Booking1')
benchmark_data = parse_price_booking(benchmark_data, 'Price_Booking1')

parsed_datac = datac[['Price_Booking1', 'Price_Booking1_price', 'Price_Booking1_booking_rate']].head()
parsed_benchmark_data = benchmark_data[['Price_Booking1', 'Price_Booking1_price', 'Price_Booking1_booking_rate']].head()

parsed_datac, parsed_benchmark_data

(                Price_Booking1  Price_Booking1_price  \
 0  (25.82068271676904, 0.7519)             25.820683   
 1   (60.5614418412558, 0.0441)             60.561442   
 2  (64.77841886497515, 0.0978)             64.778419   
 3  (43.51319658191266, 0.5933)             43.513197   
 4  (9.023792361018867, 0.9172)              9.023792   
 
    Price_Booking1_booking_rate  
 0                       0.7519  
 1                       0.0441  
 2                       0.0978  
 3                       0.5933  
 4                       0.9172  ,
                  Price_Booking1  Price_Booking1_price  \
 0   (44.07422740778878, 0.2244)             44.074227   
 1  (27.403085749044997, 0.6838)             27.403086   
 2   (32.49922734531039, 0.6368)             32.499227   
 3   (92.82272915578096, 0.0011)             92.822729   
 4   (49.80177382910645, 0.1483)             49.801774   
 
    Price_Booking1_booking_rate  
 0                       0.2244  
 1                       0.6838  

In [3]:
def parse_price_booking(data, week_numbers):
    """ Parse the price and booking rate from the tuple stored as a string for specified week columns. """
    new_columns = {}

    for week in week_numbers:
        column_name = f'Price_Booking{week}'
        try:
            if column_name in data.columns:
                prices, booking_rates = zip(*data[column_name].apply(lambda x: tuple(map(float, ast.literal_eval(x)))))
                new_columns[f"{column_name}_price"] = prices
                new_columns[f"{column_name}_booking_rate"] = booking_rates
        except ValueError as e:
            print(f"Error parsing {column_name}: {e}")

    new_data = pd.DataFrame(new_columns)
    return pd.concat([data, new_data], axis=1)

datac = pd.read_csv('datac.csv')
benchmark_data = pd.read_csv('benchmark_data.csv')
week_numbers = range(1, 521)  

datac = parse_price_booking(datac, week_numbers)
benchmark_data = parse_price_booking(benchmark_data, week_numbers)

datac.to_csv('updated_datac.csv', index=False)
benchmark_data.to_csv('updated_benchmark_data.csv', index=False)

### Modeling

In [4]:
df = pd.read_csv('updated_datac.csv')

def parse_tuples(data, columns):
    new_cols = {}
    for column in columns:
        if column in data.columns:
            prices, rates = zip(*data[column].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else (None, None)))
            new_cols[column + '_price'] = pd.to_numeric(prices, errors='coerce')
            new_cols[column + '_rate'] = pd.to_numeric(rates, errors='coerce')
    return pd.DataFrame(new_cols)

price_booking_cols = [col for col in df.columns if 'Price_Booking' in col]
new_columns_df = parse_tuples(df, price_booking_cols)

df.drop(price_booking_cols, axis=1, inplace=True)
df = pd.concat([df, new_columns_df], axis=1)

size_mapping = {'small': 1, 'medium': 2, 'large': 3}
df['Size'] = df['Size'].map(size_mapping)

scaler = MinMaxScaler()
numerical_features = ['Long', 'Lat', 'Size', 'Light']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

df.fillna(df.median(), inplace=True)

target = 'Price_Booking1_price'

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

X_train = torch.tensor(train_df.drop(target, axis=1).values).float()
y_train = torch.tensor(train_df[target].values).float()
X_test = torch.tensor(test_df.drop(target, axis=1).values).float()
y_test = torch.tensor(test_df[target].values).float()

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: torch.Size([8000, 3123])
y_train shape: torch.Size([8000])
X_test shape: torch.Size([2000, 3123])
y_test shape: torch.Size([2000])


In [5]:
# Define the neural network class
class DenseNet(nn.Module):
    def __init__(self, input_features, hidden_units, output_features):
        super(DenseNet, self).__init__()
        self.layer1 = nn.Linear(input_features, hidden_units)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(hidden_units, output_features)

    def forward(self, x):
        x = self.relu1(self.layer1(x))
        x = self.relu2(self.layer2(x))
        x = self.output_layer(x)
        return x

input_features = 3123
hidden_units = 128
output_features = 1
model = DenseNet(input_features, hidden_units, output_features)

# Preprocess the data to handle NaN values
X_train[X_train != X_train] = 0  
X_test[X_test != X_test] = 0     


# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Define training parameters
epochs = 100
batch_size = 64

# Create data loaders for training data
train_data = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)


for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.unsqueeze(1))
        if torch.isnan(loss):
            print(f'NaN loss detected at epoch {epoch + 1}, batch {i+1}')
            continue
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    if epoch % 10 == 0:
        print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')

model.eval()
with torch.no_grad():
    predictions = model(X_test)
    test_loss = criterion(predictions, y_test.unsqueeze(1))
print(f'Test Loss: {test_loss.item()}')

Epoch 1, Loss: 887.4035815429687
Epoch 11, Loss: 791.3966025390625
Epoch 21, Loss: 786.673716796875
Epoch 31, Loss: 773.5852888183593
Epoch 41, Loss: 762.0447426757812
Epoch 51, Loss: 755.707796875
Epoch 61, Loss: 738.5658662109375
Epoch 71, Loss: 726.3125808105468
Epoch 81, Loss: 599.1912939453125
Epoch 91, Loss: 198.05922399902343
Test Loss: 189.18678283691406


The training loss is decreasing over epochs, indicating that the model is learning. The test loss at the end gives an indication of how well the model generalizes to unseen data. Lower test loss generally indicates better generalization performance.

In [16]:
# Extract predicted prices
predicted_prices = predictions.numpy().flatten()

# Save the predicted prices to a new CSV file
with open('output_benchmark_data.csv', 'w') as file:
    for price in predicted_prices:
        file.write(f'{price}\n')


In [20]:
from sklearn.model_selection import KFold
import numpy as np

def calculate_mse(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        predictions = model(X_test)
        mse = criterion(predictions, y_test.unsqueeze(1))
    return mse.item()


# Set the number of folds
n_splits = 5

# Initialize the KFold object
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize the list to store MSE values
mse_values = []

# Iterate through the folds
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}...")

    # Split the data into training and validation sets
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

    # Convert the data to tensors
    X_fold_train_tensor = torch.tensor(X_fold_train).float()
    X_fold_val_tensor = torch.tensor(X_fold_val).float()
    y_fold_train_tensor = torch.tensor(y_fold_train).float()
    y_fold_val_tensor = torch.tensor(y_fold_val).float()

    # Train the model on the current fold
    model.train()
    optimizer.zero_grad()
    for epoch in range(epochs):
        outputs = model(X_fold_train_tensor)
        loss = criterion(outputs, y_fold_train_tensor.unsqueeze(1))
        loss.backward()
        optimizer.step()

    # Evaluate the model on the current fold
    mse = calculate_mse(model, X_fold_val_tensor, y_fold_val_tensor)
    mse_values.append(mse)

# Calculate the average MSE
average_mse = np.mean(mse_values)
print(f"Average MSE: {average_mse}")

Fold 1...


  X_fold_train_tensor = torch.tensor(X_fold_train).float()
  X_fold_val_tensor = torch.tensor(X_fold_val).float()
  y_fold_train_tensor = torch.tensor(y_fold_train).float()
  y_fold_val_tensor = torch.tensor(y_fold_val).float()


Fold 2...
Fold 3...
Fold 4...
Fold 5...
Fold 6...
Fold 7...
Fold 8...
Fold 9...
Fold 10...
Average MSE: 830.2636474609375


I don't know why MSE is so high 🥲(obviously something is wrong with model prediction)

### Another solution

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import re

# 1. Data Preprocessing
# Load the benchmark_data.csv file
df = pd.read_csv('benchmark_data.csv')

# Handle the non-numeric values in the 'Size' column
df['Size'] = df['Size'].replace('small', 1).replace('medium', 2).replace('large', 3).astype(float)

# Convert the relevant columns to the correct data types
df['Long'] = df['Long'].astype(int)
df['Lat'] = df['Lat'].astype(int)
df['Size'] = df['Size'].astype(float)
df['Light'] = df['Light'].astype(int)

for i in range(1, 521):
    col_name = f'Price_Booking{i}'
    if df[col_name].dtype == 'object':
        df[col_name] = df[col_name].apply(lambda x: float(re.findall(r'-?\d+\.?\d*', str(x))[0]) if isinstance(x, str) else x)

# Select the relevant features
X = df[['Long', 'Lat', 'Size', 'Light'] + [f'Price_Booking{i}' for i in range(1, 521)]]

# Calculate the revenue as the maximum price among all bookings
revenue_cols = [f'Price_Booking{i}' for i in range(1, 521)]
X['revenue'] = X[revenue_cols].max(axis=1)
y = X['revenue']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X.drop('revenue', axis=1), y, test_size=0.2, random_state=42)

# 2. Model Development
# Define the neural network model
model = Sequential()
model.add(Dense(128, input_dim=X.shape[1]-1, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=1)

# 3. Optimal Price Prediction
# Use the trained model to predict the optimal price for the first week of 2024
X_2024 = df[['Long', 'Lat', 'Size', 'Light'] + [f'Price_Booking{i}' for i in range(1, 521)]]
optimal_prices = model.predict(X_2024).flatten()

# 4. Output Generation
# Create the output_benchmark_data.csv file
np.savetxt('output_benchmark_data1.csv', optimal_prices, delimiter=',', fmt='%.2f')

2024-04-26 23:22:47.141442: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  df['Size'] = df['Size'].replace('small', 1).replace('medium', 2).replace('large', 3).astype(float)


Epoch 1/100


  X['revenue'] = X[revenue_cols].max(axis=1)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2170.7627 - val_loss: 167.4310
Epoch 2/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 452.8838 - val_loss: 303.5526
Epoch 3/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 332.3837 - val_loss: 387.9330
Epoch 4/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 310.5214 - val_loss: 181.7452
Epoch 5/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 278.2221 - val_loss: 259.0750
Epoch 6/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 255.7085 - val_loss: 420.9577
Epoch 7/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 243.5995 - val_loss: 303.6208
Epoch 8/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 256.0605 - val_loss: 114.3390
Epoch 9/100
[1m50/50[0m 

I like the prevous solution more but I am just living this one here)

To be honest I am very curious about your solution and the correct predictions