In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt

In [7]:
# Inspect which columns the training frame currently carries.
# NOTE(review): `df` is only assigned in a later cell, so on a fresh kernel
# this cell has to run after the data has been loaded.
column_index = df.columns
print(column_index)

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv', 'click_bool',
       'gross_bookings_usd', 'booking_bool', 'bool_visitor_hist',
       'visitor_hist_starrating_prop_id_mean',
       'visitor_hist_starrating_prop_id_median',
       'visitor_hist_adr_usd_prop_id_mean',
       'v

In [25]:
# Load the cleaned train/test sets and stack them so the feature-engineering
# cells can compute every derived column once, identically for both splits.
DATA_DIR = "/Users/eva/Documents/Study/Y1S2/DMT/assignment2"  # single place to repoint the data location

df = pd.read_csv(f"{DATA_DIR}/cleaned_training_set_VU_DM.csv")
df_test = pd.read_csv(f"{DATA_DIR}/cleaned_test_set_VU_DM.csv")

# ignore_index=True gives the stacked frame a unique 0..n-1 index. Keeping the
# per-file labels would make every test-row label collide with a train-row
# label, and label-aligned operations on a duplicated index are ambiguous or
# raise. Row order is unchanged (train first, then test), so positional
# slicing by len(df) still recovers the two splits.
combined_df = pd.concat([df, df_test], ignore_index=True)

In [26]:
# Numeric columns for which grouped aggregate features are derived.
features = [
    'prop_log_historical_price',
    'price_usd',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    'srch_room_count',
    'srch_query_affinity_score',
    'orig_destination_distance',
    'comp1_rate_percent_diff',
    'comp2_rate_percent_diff',
]


# Grouped aggregates: for every feature, attach the mean and median of that
# feature over all rows (train + test stacked) that share the grouping key.
# Only mean and median are computed; no standard deviation is produced.
group_keys = ['prop_id']
agg_funcs = ['mean', 'median']

grouped = combined_df.groupby(['prop_id'])
# Summary table with one row per prop_id; the loop below does not consume it.
stats = grouped[features].agg(agg_funcs)

# Remember the split sizes so the stacked frame can be cut apart again later.
original_train_len = len(df)
original_test_len = len(df_test)

for id_col in group_keys:
    # Group by the same key that is written into the column name. Previously
    # the grouping was fixed to prop_id regardless of id_col, so adding a
    # second key would have produced mislabeled duplicates of the prop_id
    # aggregates.
    by_key = combined_df.groupby(id_col)
    for feature in features:
        for stat in agg_funcs:
            column_name = f'{feature}_{id_col}_{stat}'
            # transform broadcasts the group statistic back onto every row.
            combined_df[column_name] = by_key[feature].transform(stat)

In [27]:
# Occupancy: total guests (adults + children) divided by rooms searched for.
total_guests = combined_df['srch_adults_count'] + combined_df['srch_children_count']
combined_df['people_per_room'] = total_guests / combined_df['srch_room_count']

# Interaction: people per room with location score
location_score = combined_df['prop_location_score1']
combined_df['people_location_interaction'] = combined_df['people_per_room'] * location_score

# Visitor's historical average daily rate relative to the listed price.
# NOTE(review): price_usd is not guarded against zero here, unlike the
# location-score denominator below; the later null check reports NaN in this
# column, presumably from rows where both values are 0 -- confirm.
combined_df['hist_price_interaction'] = combined_df['visitor_hist_adr_usd'] / combined_df['price_usd']

# Star rating multiplied by review score.
combined_df['rating_review_interaction'] = combined_df['prop_starrating'] * combined_df['prop_review_score']

# A location score of exactly 0 cannot be used as a denominator, so it is
# replaced by half of the smallest strictly positive score.
non_zero_min_half = location_score[location_score > 0].min() / 2
safe_location_score = np.where(location_score == 0, non_zero_min_half, location_score)
combined_df['price_location_ratio'] = combined_df['price_usd'] / safe_location_score

In [28]:
# Price normalised by party size: per adult, and per person (adults + children).
price = combined_df['price_usd']
adults = combined_df['srch_adults_count']
children = combined_df['srch_children_count']
combined_df['price_per_adult'] = price / adults
combined_df['price_per_person'] = price / (adults + children)

# Exponentiate the log historical price (assumes a natural log -- TODO confirm).
combined_df['prop_historical_price'] = np.exp(combined_df['prop_log_historical_price'])

# prop_clicked_prob: 10 raised to the power of "srch_query_affinity_score".
combined_df['prop_clicked_prob'] = 10 ** combined_df['srch_query_affinity_score']

# Boolean flag: True when the search includes at least one child.
combined_df['child_bool'] = children > 0


In [30]:
# Price relative to review score. Show the score distribution first.
review_score = combined_df['prop_review_score']
print(review_score.value_counts())

# Zero scores cannot be divided by; substitute one fifth of the smallest
# strictly positive review score.
non_zero_min = review_score[review_score > 0].min() / 5
review_denominator = np.where(review_score == 0, non_zero_min, review_score)
combined_df['price_review_ratio'] = combined_df['price_usd'] / review_denominator

# Same treatment for the star rating.
star_rating = combined_df['prop_starrating']
print(star_rating.value_counts())
# Despite its name, this holds one fifth (not one half) of the smallest
# positive star rating, and it overwrites the value computed for the
# location score in the earlier cell.
non_zero_min_half = star_rating[star_rating > 0].min() / 5
star_denominator = np.where(star_rating == 0, non_zero_min_half, star_rating)
combined_df['price_starrating_ratio'] = combined_df['price_usd'] / star_denominator

prop_review_score
4.0    3115351
4.5    3093746
3.5    1536219
3.0     750811
0.0     482116
5.0     472194
2.5     287427
2.0     124191
1.5      28706
1.0      26769
Name: count, dtype: int64
prop_starrating
3    3897419
4    3045848
2    1788994
5     812961
0     337794
1      34514
Name: count, dtype: int64


In [31]:
# Competitor columns (comp3 percent diff and everything for comp4..comp8)
# that are discarded from the stacked frame.
features_to_remove = [
    'comp3_rate_percent_diff',
    'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff',
    'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff',
    'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff',
    'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff',
    'comp8_rate', 'comp8_inv', 'comp8_rate_percent_diff',
]

# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time (or on input that already lacks some of these columns) raises
# KeyError because the labels are no longer present. When every column exists
# the result is identical to a plain drop.
combined_df = combined_df.drop(columns=features_to_remove, errors='ignore')

In [32]:
# Within-search percentile ranks: where does each row sit among the other
# rows of the same srch_id for the given column.
rank_targets = {
    'price_rank_percentile': 'price_usd',
    'location1_rank_percentile': 'prop_location_score1',
}
for new_col, source_col in rank_targets.items():
    combined_df[new_col] = combined_df.groupby('srch_id')[source_col].rank(pct=True)



In [33]:
# Percentile rank of the per-person price among rows of the same search.
per_search = combined_df.groupby('srch_id')
combined_df['price_per_person_rank_percentile'] = per_search['price_per_person'].rank(pct=True)

In [34]:
# Cut the stacked frame back into its two splits by position: the first
# original_train_len rows are train, the next original_test_len rows are test.
train_end = original_train_len
test_end = train_end + original_test_len
df = combined_df.iloc[:train_end]
df_test = combined_df.iloc[train_end:test_end]

In [18]:
# (rows, columns) of the feature-engineered test split.
test_shape = df_test.shape
print(test_shape)

(4959183, 69)


In [35]:
# List the training columns that contain at least one missing value.
train_has_null = df.isnull().any()
print(df.columns[train_has_null].tolist())

['hist_price_interaction']


In [36]:
# List the test columns that contain at least one missing value.
test_has_null = df_test.isnull().any()
print(df_test.columns[test_has_null].tolist())


['position', 'click_bool', 'gross_bookings_usd', 'booking_bool', 'hist_price_interaction']


In [37]:
# Remove the four outcome columns from the test split; the null check on
# df_test lists all of them as containing missing values.
test_only_missing = ['position', 'gross_bookings_usd', 'click_bool', 'booking_bool']
df_test = df_test.drop(columns=test_only_missing)

In [39]:
# Persist the feature-engineered test split as a single CSV (index not written).
test_output_path = '/Users/eva/Documents/Study/Y1S2/DMT/assignment2/best_feature_engineered_test_set_VU_DM.csv'
df_test.to_csv(test_output_path, index=False)

In [38]:

# Rows per output file; adjustable.
chunk_size = 500000

# Write the training split as consecutive CSV files of at most chunk_size
# rows each. The file suffix is the zero-based chunk number.
chunk_starts = range(0, len(df), chunk_size)
for i in chunk_starts:
    chunk = df.iloc[i:i + chunk_size]
    chunk.to_csv(
        f'/Users/eva/Documents/Study/Y1S2/DMT/assignment2/best_feature_engineered_training_chunk_{i//chunk_size}.csv',
        index=False,
    )

In [None]:
# Print the (rows, columns) shape of the test split, df_test.
final_test_shape = df_test.shape
print(final_test_shape)

In [5]:
# Re-read the cleaned test CSV and print its shape.
cleaned_test_path = '/Users/eva/Documents/Study/Y1S2/DMT/assignment2/cleaned_test_set_VU_DM.csv'
df_cleaned_test = pd.read_csv(cleaned_test_path)
print(df_cleaned_test.shape)

(4959183, 51)
