In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_users = pd.read_csv("./airbnb-recruiting-new-user-bookings/train_users_2.csv.zip")
test_users = pd.read_csv("./airbnb-recruiting-new-user-bookings/test_users.csv.zip")

In [3]:
# 將 'timestamp_first_active' 和 'date_account_created' 列轉換為 datetime 類型
train_users['timestamp_first_active'] = pd.to_datetime(train_users['timestamp_first_active'], format='%Y%m%d%H%M%S')
train_users['date_account_created'] = pd.to_datetime(train_users['date_account_created'], format='%Y-%m-%d')

# 計算每個用戶的註冊後第一次操作的時間與註冊時間之間的時間差
train_users['time_lag'] = (train_users['timestamp_first_active'] - train_users['date_account_created']).dt.seconds


In [4]:
# 刪除無用的特徵
train_users.drop(['id', 'date_account_created', 'timestamp_first_active'], axis=1, inplace=True)

In [5]:
# 將目標變數提取出來
y_train = train_users['country_destination']

In [6]:
# 將訓練集和測試集合併起來
all_users = pd.concat([train_users.drop(['country_destination'], axis=1), test_users])

In [7]:
# 日期和時間特徵提取
all_users['date_account_created'] = pd.to_datetime(all_users['date_account_created'])
all_users['timestamp_first_active'] = pd.to_datetime((all_users.timestamp_first_active // 1000000), format='%Y%m%d')
all_users['dac_year'] = all_users['date_account_created'].dt.year
all_users['dac_month'] = all_users['date_account_created'].dt.month
all_users['dac_day'] = all_users['date_account_created'].dt.day
all_users['tfa_year'] = all_users['timestamp_first_active'].dt.year
all_users['tfa_month'] = all_users['timestamp_first_active'].dt.month
all_users['tfa_day'] = all_users['timestamp_first_active'].dt.day

In [8]:
# 將 'first_browser' 列轉換為數值類型
all_users['first_browser'], _ = pd.factorize(all_users['first_browser'])

# 計算地理位置相關特徵
all_users['distance'] = np.sqrt((all_users['signup_flow'] - all_users['first_browser']) ** 2)


In [9]:
# One-Hot 編碼或者 Label Encoding 編碼
all_users['gender'] = all_users['gender'].replace('-unknown-', np.nan)
all_users['gender'] = all_users['gender'].fillna(all_users['gender'].mode()[0])
all_users['gender'] = pd.factorize(all_users['gender'])[0]
all_users['signup_method'] = pd.factorize(all_users['signup_method'])[0]
all_users['language'] = pd.factorize(all_users['language'])[0]
all_users['affiliate_channel'] = pd.factorize(all_users['affiliate_channel'])[0]
all_users['affiliate_provider'] = pd.factorize(all_users['affiliate_provider'])[0]
all_users['first_affiliate_tracked'] = pd.factorize(all_users['first_affiliate_tracked'].fillna('missed'))[0]
all_users['signup_app'] = pd.factorize(all_users['signup_app'])[0]
all_users['first_device_type'] = pd.factorize(all_users['first_device_type'])[0]
all_users['first_browser'] = pd.factorize(all_users['first_browser'])[0]

In [10]:
# 計算統計量
all_users['age'] = all_users['age'].fillna(all_users['age'].mean())

In [11]:
# 特徵交叉
all_users['age_distance'] = all_users['age'] * all_users['distance']

In [12]:
all_users = all_users.drop(['id','date_first_booking','date_account_created', 'timestamp_first_active'], axis=1)
all_users = all_users.fillna(-1)
all_users

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,time_lag,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,distance,age_distance
0,0,47.14531,0,0,0,0,0,0,0,0,0,16375.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.000000
1,1,38.00000,0,0,0,1,1,0,0,0,0,64089.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.000000
2,0,56.00000,1,3,0,0,0,0,0,1,1,83567.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,112.000000
3,0,42.00000,0,0,0,0,0,0,0,0,2,21689.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,84.000000
4,0,41.00000,1,0,0,0,0,0,0,0,0,22265.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62091,0,31.00000,1,0,0,0,0,0,0,1,1,-1.0,2014.0,9.0,30.0,2014.0,9.0,30.0,1.0,31.000000
62092,0,47.14531,1,23,7,0,0,0,3,7,4,-1.0,2014.0,9.0,30.0,2014.0,9.0,30.0,19.0,895.760898
62093,0,47.14531,1,0,2,0,0,3,0,1,2,-1.0,2014.0,9.0,30.0,2014.0,9.0,30.0,2.0,94.290621
62094,0,47.14531,1,0,0,5,1,1,0,0,3,-1.0,2014.0,9.0,30.0,2014.0,9.0,30.0,3.0,141.435931


In [13]:
# 降維方法
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
all_users_pca = pca.fit_transform(all_users)

In [14]:
# 將訓練集和測試集分開
X_train = all_users_pca[:len(y_train)]
X_test = all_users_pca[len(y_train):]

# 檢查特徵矩陣的大小
print("X_train.shape: ", X_train.shape)
print("y_train.shape: ", y_train.shape)
print("X_test.shape: ", X_test.shape)


X_train.shape:  (213451, 5)
y_train.shape:  (213451,)
X_test.shape:  (62096, 5)


In [15]:
# 將 NumPy 數組轉換為 DataFrame 對象
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)
y_train_df = pd.DataFrame(y_train)

# 將 DataFrame 對象保存為 CSV 文件
X_train_df.to_csv('X_train.csv', index=False)
X_test_df.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)