In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer


In [3]:
df = pd.read_excel("../../99_data/加工データ/df_for_classification.xlsx")

In [6]:
df_airbnb = df[df['is_airbnb'] == True]
df_hotel = df[df['is_airbnb'] == False]

# 特徴量とターゲット変数を分離
X = df_hotel.drop(['hotel_rank', 'address', 'num_of_supply', 'area', 'day_of_week', 'check_in'], axis=1)
Y = df_hotel['hotel_rank']

# NaN値を処理するためにSimpleImputerを使用
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# データを8:2の割合で分割
X_train, X_test, Y_train, Y_test = train_test_split(X_imputed, Y, test_size=0.2, random_state=42)

# 結果の確認
print(X_test.shape)
print(Y_train.shape)

# 各アルゴリズムでの訓練と評価を行う関数
def evaluate_model(model, X_train, X_test, Y_train, Y_test):
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    return rmse, model

# モデルのリスト
models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    # ("LightGBM", lgb.LGBMClassifier(random_state=42)),
    ("k-NN", KNeighborsClassifier()),
    # ("SVM", SVC(kernel='rbf', C=1, gamma='scale', random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Naive Bayes", GaussianNB())
]

# 各モデルの評価
results = []
for name, model in models:
    rmse, trained_model = evaluate_model(model, X_train, X_test, Y_train, Y_test)
    results.append((name, rmse, trained_model))
    print(f"{name} RMSE: {rmse}")
    print(f"Trained Model: {trained_model}\n")

# 最もRMSEが小さいモデルを選択
best_model_details = min(results, key=lambda x: x[1])
best_model, best_rmse, best_trained_model = best_model_details

print(f"Best Model: {best_model} with RMSE: {best_rmse}")
print(f"Best Trained Model Details: {best_trained_model}")

# df_airbnbの特徴量を準備
X_airbnb = df_airbnb.drop(['hotel_rank', 'address', 'num_of_supply', 'area', 'day_of_week', 'check_in'], axis=1)

# NaN値を処理するためにSimpleImputerを使用（同じインスタンスを使用して一貫性を保つ）
X_airbnb_imputed = imputer.transform(X_airbnb)

# 最良のモデルで予測
df_airbnb['hotel_rank'] = best_trained_model.predict(X_airbnb_imputed)

# 結果を表示
print(df_airbnb[['hotel_rank']])


(57618, 7)
(230468,)
Decision Tree RMSE: 0.5469075951791296
Trained Model: DecisionTreeClassifier(random_state=42)

Random Forest RMSE: 0.4644874065032304
Trained Model: RandomForestClassifier(random_state=42)

k-NN RMSE: 1.121788228933509
Trained Model: KNeighborsClassifier()

Logistic Regression RMSE: 1.304927553288214
Trained Model: LogisticRegression(max_iter=1000, random_state=42)

Naive Bayes RMSE: 1.2511302120861956
Trained Model: GaussianNB()

Best Model: Random Forest with RMSE: 0.4644874065032304
Best Trained Model Details: RandomForestClassifier(random_state=42)
      hotel_rank
0            3.0
1            3.0
2            3.0
3            3.0
4            3.0
...          ...
7750         0.0
7751         0.0
7752         0.0
7753         5.0
7754         5.0

[7755 rows x 1 columns]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airbnb['hotel_rank'] = best_trained_model.predict(X_airbnb_imputed)


In [10]:
# df_airbnbとdf_hotelを縦に結合
df_combined = pd.concat([df_airbnb, df_hotel], axis=0)

# エクセルファイルとして出力
df_combined.to_excel("../99_data/加工データ/df_classified_DecisionTree.xlsx", index=False)