In [None]:
import pandas as pd

import numpy as np
from tqdm import tqdm

from numba import jit
import time

import os

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the frame that is going to be cleaned. Any tabular source works here
# (CSV, HDF, ...); the rest of the notebook only relies on the name `traindf`.
traindf = pd.read_csv(filepath_or_buffer="./data/test_set_VU_DM.csv")

# For quick experiments, work on the first 100k rows only:
# traindf = traindf[:100000]

In [None]:
# Overlay the price distribution (prices below 1000 only) of searches that
# include a Saturday night (green) on top of / beneath those that do not.
price_below_1000 = traindf["price_usd"] < 1000
with_saturday_night = traindf["srch_saturday_night_bool"] == 1
without_saturday_night = traindf["srch_saturday_night_bool"] == 0

traindf.loc[with_saturday_night & price_below_1000, "price_usd"].hist(bins=100, color='DarkGreen')
traindf.loc[without_saturday_night & price_below_1000, "price_usd"].hist(bins=100)

In [None]:
@jit(nopython=True, parallel=True)
def calculate_mean_per_propid(unique_propids,
                              col_propids,
                              col_price,
                              col_mean,
                              col_std,
                              col_no_hotels,
                              col_correct_price,
                              col_nights,
                              col_new_mean,
                              col_new_std,
                              cut_off):
    """Fill per-property price statistics in place and try a per-night correction.

    For every id in ``unique_propids`` the rows of ``col_propids`` equal to that
    id are selected. The mean/std of ``col_price`` over those rows is written to
    ``col_mean``/``col_std`` (and, as a starting value, to
    ``col_new_mean``/``col_new_std``); the row count goes to ``col_no_hotels``.

    When the std exceeds ``cut_off`` the prices of those rows are divided by
    ``col_nights`` and stored in ``col_correct_price``. If that makes the std
    larger, the original prices are restored ("worse"); otherwise the new
    mean/std are recorded ("better").

    All ``col_*`` output arrays are modified in place; nothing is returned.
    Progress and a final summary are printed.
    """
    n_props = len(unique_propids)

    n_worse = 0
    n_better = 0

    print("Cutoff: ", cut_off, "\nChanged everything to numpy arrays, starting now with processing data")

    for pos in range(n_props):
        propid = unique_propids[pos]

        # progress report every 1000 properties
        if pos % 1000 == 0:
            print(cut_off, ": ", pos, "/", n_props)

        rows = np.where(col_propids == propid)[0]

        raw_prices = col_price[rows]
        raw_mean = np.mean(raw_prices)
        raw_std = np.std(raw_prices)

        # statistics of the untouched prices
        col_mean[rows] = raw_mean
        col_std[rows] = raw_std
        col_no_hotels[rows] = rows.size

        # "after" statistics start out equal to the "before" ones
        col_new_mean[rows] = raw_mean
        col_new_std[rows] = raw_std

        # only properties whose spread exceeds the cut-off get corrected
        if not (raw_std > cut_off):
            continue

        col_correct_price[rows] = col_price[rows] / col_nights[rows]
        per_night_prices = col_correct_price[rows]
        per_night_std = np.std(per_night_prices)

        if per_night_std > raw_std:
            # rollback: dividing by the number of nights made things worse
            n_worse += 1
            col_correct_price[rows] = col_price[rows]
            continue

        col_new_mean[rows] = np.mean(per_night_prices)
        col_new_std[rows] = per_night_std
        n_better += 1

    print("Better: ", n_better, "\nWorse: ", n_worse)
    print("Std before: ", np.mean(col_std), "\nStd after: ", np.mean(col_new_std))


In [None]:
def compute_numba(df, cut_off):
    """Add per-``prop_id`` price statistics and a corrected price to ``df``.

    The output buffers are allocated here as plain, writable NumPy arrays,
    filled in place by ``calculate_mean_per_propid`` and only afterwards
    attached to the frame. This avoids depending on ``Series.to_numpy()``
    handing back a writable view of the frame's data, which pandas does not
    guarantee (with Copy-on-Write the returned array is read-only).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``prop_id``, ``price_usd`` and
        ``srch_length_of_stay``. It is not modified.
    cut_off : number
        Forwarded to ``calculate_mean_per_propid``; only properties whose
        price std exceeds it are considered for correction.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``avg_price_propid``,
        ``std_avg_price_propid``, ``amount_hotels``,
        ``avg_price_propid_after``, ``std_avg_price_propid_after`` and
        ``price_correction``.
    """
    time_start = time.time()

    n_rows = len(df)

    # Output buffers, NaN until the kernel writes to them.
    avg_price_propid = np.full(n_rows, np.nan)
    std_avg_price_propid = np.full(n_rows, np.nan)
    amount_hotels = np.full(n_rows, np.nan)
    avg_price_propid_after = np.full(n_rows, np.nan)
    std_avg_price_propid_after = np.full(n_rows, np.nan)

    # The corrected price starts as an independent float copy of the raw price.
    price_correction = df["price_usd"].to_numpy(dtype=np.float64, copy=True)

    # get unique propids and put them into numpy array
    propids = np.asarray(df["prop_id"].unique())

    print("Appended columns.")

    calculate_mean_per_propid(propids,
                              df['prop_id'].to_numpy(),
                              df['price_usd'].to_numpy(),
                              avg_price_propid,
                              std_avg_price_propid,
                              amount_hotels,
                              price_correction,
                              df['srch_length_of_stay'].to_numpy(),
                              avg_price_propid_after,
                              std_avg_price_propid_after,
                              cut_off)

    # Attach the results; assign() returns a new frame, so the caller's frame
    # stays untouched and re-running on an already processed frame overwrites
    # the columns instead of duplicating them.
    df = df.assign(avg_price_propid=avg_price_propid,
                   std_avg_price_propid=std_avg_price_propid,
                   amount_hotels=amount_hotels,
                   avg_price_propid_after=avg_price_propid_after,
                   std_avg_price_propid_after=std_avg_price_propid_after,
                   price_correction=price_correction)

    print("This took: ", time.time() - time_start)

    display(df.head())
    return df

In [None]:
# Run the per-property price correction. With a cut-off of 0 every property
# whose price std is positive is a candidate for the per-night correction.
cutoff = 0
df = compute_numba(df=traindf, cut_off=cutoff)

In [None]:
# Columns needed to judge the per-property price correction.
price_check_columns = ['prop_id', 'amount_hotels', 'avg_price_propid', 'price_usd', 'std_avg_price_propid', 'srch_length_of_stay', 'price_correction', 'avg_price_propid_after', 'std_avg_price_propid_after']

# Spot-check a single property ...
display(df.loc[df['prop_id'] == 37304, price_check_columns])

# ... and summarise the same columns over all rows.
display(df[price_check_columns].describe())

# Total rows vs. rows where amount_hotels was filled in. notna() returns a
# boolean mask as long as the column itself, so its True values have to be
# counted; len() of the mask would just repeat the row count.
print(len(df['amount_hotels']), df['amount_hotels'].notna().sum())

In [None]:

print("Std before: ", df['std_avg_price_propid'].mean())
print("Std after: ", df['std_avg_price_propid_after'].mean())

# Both histograms use the same row filter: rows whose std *before* the
# correction is below 2000.
std_before_below_2000 = df['std_avg_price_propid'] < 2000

df.loc[std_before_below_2000, 'std_avg_price_propid'].hist(bins=100)

# second histogram goes on its own figure
plt.figure()
df.loc[std_before_below_2000, 'std_avg_price_propid_after'].hist(bins=100)

In [None]:
# Rows whose std is still above 10000 after the correction.
std_after_above_10000 = df['std_avg_price_propid_after'] > 10000
display(df.loc[std_after_above_10000, ['prop_id', 'std_avg_price_propid', 'std_avg_price_propid_after']])

# Look at property 13878: first every column, then only the price-related ones.
is_prop_13878 = df['prop_id'] == 13878
display(df[is_prop_13878])

price_columns_13878 = ['prop_id', 'amount_hotels', 'avg_price_propid', 'price_usd', 'std_avg_price_propid', 'srch_length_of_stay', 'price_correction', 'avg_price_propid_after', 'std_avg_price_propid_after']
display(df.loc[df['prop_id'] == 13878, price_columns_13878])

In [None]:
# Persist the frame with the corrected prices (written to the working directory).
df.to_hdf(path_or_buf="corrected_price_testset_rollback.hdf", key='df')

In [None]:
@jit(nopython=True, parallel=True)
def calculate_mean_per_srch_id(unique_srchids,
                               col_srchids,
                               col_price,
                               col_mean_per_srch_id):
    """Write the mean price of every search into ``col_mean_per_srch_id``.

    For each id in ``unique_srchids`` the rows of ``col_srchids`` equal to that
    id are selected and the mean of ``col_price`` over those rows is stored at
    the same positions of ``col_mean_per_srch_id``. The output array is
    modified in place; nothing is returned. Progress is printed every 1000 ids.
    """
    n_searches = len(unique_srchids)

    for pos in range(n_searches):
        if pos % 1000 == 0:
            print(pos, "/", n_searches)

        rows = np.where(col_srchids == unique_srchids[pos])[0]
        col_mean_per_srch_id[rows] = np.mean(col_price[rows])


In [None]:
def compute_mean_numba(df):
    """Add the mean corrected price per ``srch_id`` as ``avg_price_srchid``.

    The output buffer is allocated here as a plain, writable NumPy array,
    filled in place by ``calculate_mean_per_srch_id`` and attached to the
    frame afterwards. This avoids depending on ``Series.to_numpy()`` handing
    back a writable view of the frame's data, which pandas does not guarantee
    (with Copy-on-Write the returned array is read-only).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``srch_id`` and ``price_correction``. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added column ``avg_price_srchid``.
    """
    time_start = time.time()

    # Output buffer, NaN until the kernel writes to it.
    avg_price_srchid = np.full(len(df), np.nan)

    # get unique search ids and put them into numpy array
    srchids = np.asarray(df["srch_id"].unique())

    print("Appended columns.")

    calculate_mean_per_srch_id(srchids,
                               df['srch_id'].to_numpy(),
                               df['price_correction'].to_numpy(),
                               avg_price_srchid)

    # assign() returns a new frame, so the caller's frame stays untouched.
    df = df.assign(avg_price_srchid=avg_price_srchid)

    print("This took: ", time.time() - time_start)

    display(df.head())
    return df

In [None]:
# Reload a frame with corrected prices from disk and add the per-search mean.
# NOTE(review): this reads ./data/corrected_price_rollback.hdf, which is not the
# file name written by the earlier to_hdf cell - confirm this is the intended input.
df = pd.read_hdf(path_or_buf="./data/corrected_price_rollback.hdf")

df2 = compute_mean_numba(df=df)

In [None]:
# Show the freshly computed mean price per search.
avg_price_per_search = df2["avg_price_srchid"]
display(avg_price_per_search)

In [None]:
# Persist the frame including the per-search mean price (written to the working directory).
df2.to_hdf(path_or_buf="corrected_price_rollback_and_avg.hdf", key='df2')