In [None]:
from itertools import islice
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
from tqdm import tqdm 
import sys
from datetime import datetime, timedelta, time
from scipy.optimize import curve_fit
from scipy import stats
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns
import random
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Align the working directory with the one the .py scripts use:
# when the kernel starts inside a directory named "notebook", move one level up.
from pathlib import Path
if Path.cwd().name == "notebook":
    os.chdir("..")

# Append the parent of the current working directory to sys.path
sys.path.append(os.path.dirname(os.getcwd()))

# Use Meiryo as the default Matplotlib font (intended for Windows)
plt.rcParams['font.family'] = 'Meiryo'


# pandas display settings: show up to 500 rows / 500 columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.min_rows', 500)
pd.set_option('display.max_columns', 500)

# Project-local modules (imported after the chdir / sys.path changes above)
from utils.point_history_utils import open_point_history_per_shop, aggregate_date, replace_nan, set_dtype
from RS_filliing_rate.RS_fillingrate_test import plot_recycle_period, chi_squared_statistic, exp_func, power_law, KS_statistic, calc_recycle_period


# Fix the random seeds of NumPy and the stdlib `random` module
seed = 0
np.random.seed(seed)
random.seed(seed)


In [None]:
# Use Hiragino Sans as the default Matplotlib font, but only on macOS.
# Running this unconditionally would overwrite the font chosen in the setup
# cell on every platform, so "Run All" on Windows would end up with a font
# that is not installed there.
if sys.platform == 'darwin':
    plt.rcParams['font.family'] = 'Hiragino Sans'

In [None]:


# Add calendar-based features
def add_date_features(df):
    """
    Return a copy of ``df`` with calendar features derived from the '年月日' column.

    Added columns: 'year', 'day_of_week' (English day name), and sine/cosine
    encodings of the day of month (period 31) and of the month (period 12).
    The intermediate 'day' and 'month' columns are removed again, and the
    input frame itself is left untouched.
    """
    enriched = df.copy()
    stamp = enriched["年月日"].dt

    enriched["month"] = stamp.month
    enriched["day"] = stamp.day
    enriched["year"] = stamp.year
    enriched["day_of_week"] = stamp.day_name()

    # Cyclic encoding: map each unit onto the unit circle, then discard the raw value.
    for unit, period in (("day", 31), ("month", 12)):
        angle = enriched[unit] / period * 2 * np.pi
        enriched[unit + "_sin"] = np.sin(angle)
        enriched[unit + "_cos"] = np.cos(angle)
        enriched = enriched.drop(columns=[unit])

    return enriched


def set_previous_data(df, features, days=28, years=0):
    """
    Add lagged copies of ``features`` taken from the same shop a fixed period earlier.

    For every feature a column named ``<feature>_before_<N>days`` or
    ``<feature>_before_<N>years`` is added. It holds the value of that feature
    on the row with the same 'super' and 'shop_name_1' whose '年月日' lies the
    given period earlier. Rows without such an earlier row get NaN.

    Only one of ``days`` / ``years`` is used: when ``years`` is greater than 0
    it takes precedence and ``days`` is ignored.

    The input frame is not modified; a new frame is returned.

    args:
        df: DataFrame with the columns '年月日' (datetime), 'super',
            'shop_name_1' and every column named in ``features``
        features: list of column names to look up
        days: lag in days (default 28); used when ``years`` is 0
        years: lag in calendar years (default 0)
    return:
        df: new DataFrame with the lag columns appended

    Note: the lookup is a left merge on ('super', 'shop_name_1', date), so
    duplicated keys in ``df`` multiply rows in the result.
    """
    # Compute the lookup date for each row.
    if years > 0:
        # Calendar-aware and vectorized (Feb 29 maps to Feb 28 of the earlier year).
        previous_dates = df['年月日'] - pd.DateOffset(years=years)
        time_label = str(years) + 'years'
    else:
        previous_dates = df['年月日'] - pd.Timedelta(days=days)
        time_label = str(days) + 'days'

    # assign() returns a new frame, so the caller's frame never receives the
    # temporary 'date_previous' helper column.
    df = df.assign(date_previous=previous_dates)

    merge_keys = ['super', 'shop_name_1', 'date_previous']
    for feature in features:
        new_feature = feature + '_before_' + time_label
        # Relabel each row's own date as 'date_previous' so it can be matched
        # against the lookup date of later rows.
        lagged = df[['年月日', 'super', 'shop_name_1', feature]].rename(
            columns={'年月日': 'date_previous', feature: new_feature}
        )
        df = df.merge(lagged, on=merge_keys, how='left')

    # The helper column is no longer needed.
    return df.drop(columns='date_previous')



# Load the per-shop daily point history and run it through the preparation helpers.
df = (
    pd.read_csv('data/input/point_history_per_shop_date.csv', encoding='utf-8')
    .pipe(set_dtype)
    .pipe(replace_nan)
    .pipe(add_date_features)
)

# Cap filling rates above 1 at exactly 1.
over_full = df["filling_rate"] > 1
df.loc[over_full, "filling_rate"] = 1

# Lag features: same shop 28 days earlier, then one year earlier.
lag_features = ['amount_kg', 'filling_rate']
for lag_kwargs in ({'days': 28}, {'years': 1}):
    df = set_previous_data(df, lag_features, **lag_kwargs)

In [None]:
# Show the first 30 rows of the prepared frame.
df.head(30)

In [None]:

# Columns that are not used as inputs of the filling_rate model
columns_to_drop = [
    'shop_id', 'shop_name', 'shop_id_1', 'リサイクル分類ID', '支店ID',
    'store_opening_time', 'store_closing_time', 'rps_opening_time', 'rps_closing_time',
    '年月日', 'interval_compared_to_next',
    'amount', 'amount_kg', 'point', 'total_point', 'total_amount', 'coin',
    'interval_compared_to_previous', 'total_amount_kg_per_day',
    '合計全天日射量(MJ/㎡)', '降雪量合計(cm)', '降水量の合計(mm)', '日照時間(時間)',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:


# One-hot encode the categorical columns
categorical_features = ['prefectures', 'municipality','shop_name_1','super', '天気', 'day_of_week']
df = pd.get_dummies(data=df, columns=categorical_features)

# Target is filling_rate; every remaining column is a feature
y = df['filling_rate']
X = df.drop(columns='filling_rate')

In [None]:
# Split the data into training and testing sets
# (20% of the rows go to the test set; random_state is fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Regression report helper
def evaluate_the_predictions(y_test, y_pred):
    """
    Print MSE, MAE, RMSE and R2 for the given predictions, followed by the
    first ten actual values and the first ten predicted values.

    ``y_test`` must support ``[:10].values`` (e.g. a pandas Series);
    ``y_pred`` only needs to support slicing. Nothing is returned.
    """
    mse = mean_squared_error(y_test, y_pred)
    report = [
        ('Mean Squared Error (MSE)', mse),
        ('Mean Absolute Error (MAE)', mean_absolute_error(y_test, y_pred)),
        ('Root Mean Squared Error (RMSE)', np.sqrt(mse)),
        ('R-squared (R2)', r2_score(y_test, y_pred)),
    ]
    for label, value in report:
        print(f'{label}: {value}')

    # Side-by-side sanity check of the first few values
    print("actual")
    print(y_test[:10].values)
    print("pred")
    print(y_pred[:10])


In [None]:
# Train the LightGBM model
#
# Preprocessing experiments that were tried and dropped because accuracy got worse:
#   * imputing '平均雲量(10分比)' with the median (SimpleImputer) or filling its NaNs with 0
#   * MinMaxScaler normalisation of the weather columns
#     ('平均気温(℃)', '最高気温(℃)', '最低気温(℃)', '平均風速(m/s)',
#      '平均湿度(％)', '平均現地気圧(hPa)', '平均雲量(10分比)')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold out part of the training rows for early stopping.
# Validating on the very rows the model is fitted on means the monitored loss
# is the training loss, which keeps improving; early stopping then cannot
# detect overfitting and best_iteration carries no information.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

lgb_params = {
    'objective': 'regression',
    'boosting_type': 'goss',
    'seed': 0,
    'early_stopping_rounds' : 1000,
     'num_iterations' : 10000,
     'learning_rate' : 0.02,
     'max_depth': 8,
    # 'bagging_freq': 10,  # how often bagging is performed
    # 'bagging_fraction': 0.6,  # fraction of rows used for bagging
    # 'feature_fraction': 0.6,  # fraction of features sampled per tree
}

train_data = lgb.Dataset(X_fit, label=y_fit)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

model = lgb.train(lgb_params, train_data, valid_sets=[valid_data])

# Predict on the untouched test set with the best iteration found on the validation set
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Evaluate the model
evaluate_the_predictions(y_test, y_pred)

In [None]:
# Random forest variant
# Re-create the train/test split (same parameters as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline: mean-impute missing values, then fit the forest
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', RandomForestRegressor(
        n_estimators=100,  # number of trees; 200 made little difference
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',  # library default is None
        random_state=42,
        verbose=1,
    )),
])

# # Cross-validation (slow, so it is recommended to keep this commented out)
# cv = 5
# mse_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_squared_error')
# mae_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_absolute_error')
# r2_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')

# # Print the cross-validation results
# print(f'Cross-Validated Mean Squared Error (MSE): {-np.mean(mse_scores)}')
# print(f'Cross-Validated Mean Absolute Error (MAE): {-np.mean(mae_scores)}')
# print(f'Cross-Validated R-squared (R2): {np.mean(r2_scores)}')


# Fit on the training rows and predict the test rows
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Evaluate the model
evaluate_the_predictions(y_test, y_pred)

In [None]:
# R2 of the predictions being plotted. It is computed here because no earlier
# cell defines a global `r2` (evaluate_the_predictions only prints it), so the
# title would otherwise fail with NameError on a fresh kernel.
r2 = r2_score(y_test, y_pred)

plt.figure(figsize=(5, 4))
sns.scatterplot(x=y_test, y=y_pred)
plt.title('決定係数: {}'.format(round(r2, 2)))
plt.xlabel('充填率（正解値）')
plt.ylabel('充填率（予測値）')

# Plot a line representing perfect predictions
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', lw=2, linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# Split-count importance of every feature in the LightGBM model
feature_names = X_train.columns
feature_importances = model.feature_importance(importance_type='split')

# Pair names with importances, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart (very tall figure so that every feature gets a row)
fig, ax = plt.subplots(figsize=(5, 100))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('予測における重要度')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

- ランダムフォレスト（※欠損値の補完が必要）
- HistGradientBoostingRegressor（※ワンホットエンコーディング不要、欠損値の補完も不要）
- XGBoost
- サポートベクターマシン
- ニューラルネットワーク
- リッジ回帰 (Ridge Regression) とラッソ回帰 (Lasso Regression)

# amount_kg予測

In [None]:
# Rebuild the feature frame from the CSV before preparing the amount_kg model.
# The filling_rate section above already removed columns from `df` (including
# 'amount_kg' itself) and one-hot encoded it, so dropping / encoding the same
# columns again would raise KeyError when the notebook is run top to bottom.
df = pd.read_csv('data/input/point_history_per_shop_date.csv', encoding='utf-8')
df = set_dtype(df)
df = replace_nan(df)
df = add_date_features(df)
df.loc[df["filling_rate"] > 1, "filling_rate"] = 1
df = set_previous_data(df, ['amount_kg', 'filling_rate'], days=28)
df = set_previous_data(df, ['amount_kg', 'filling_rate'], years=1)

# Drop unnecessary columns ('amount_kg' is kept as the target; 'filling_rate' is removed)
columns_to_drop = [
    'shop_id', 'shop_name', 'shop_id_1', 'リサイクル分類ID', '支店ID',
    'store_opening_time', 'store_closing_time', 'rps_opening_time', 'rps_closing_time',
    '年月日', 'interval_compared_to_next',
    'amount', 'point', 'total_point', 'total_amount', 'coin',
    'interval_compared_to_previous', 'total_amount_kg_per_day',
    'store_latitude', 'store_longitude',
    '合計全天日射量(MJ/㎡)', '降雪量合計(cm)', '降水量の合計(mm)', '日照時間(時間)',
    'filling_rate',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# One-hot encode the categorical columns
categorical_features = ['prefectures', 'municipality','shop_name_1','super', '天気', 'day_of_week']
df = pd.get_dummies(data=df, columns=categorical_features)

# Target is amount_kg; every remaining column is a feature
y = df['amount_kg']
X = df.drop(columns='amount_kg')

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold out part of the training rows for early stopping.
# Using the training rows themselves as the validation set makes early
# stopping monitor the training loss, which cannot reveal overfitting.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Train the LightGBM model
lgb_params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'seed': 0,
    'early_stopping_rounds' : 1000,
     'num_iterations' : 10000,
     'learning_rate' : 0.02,
     'max_depth': 8,
}
train_data = lgb.Dataset(X_fit, label=y_fit)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
model = lgb.train(lgb_params, train_data, valid_sets=[valid_data])

# Predict on the untouched test set with the best iteration found on the validation set
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Evaluate the predictions
# Calculate and print evaluation metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

# First ten actual and predicted values for a quick visual check
print("actual")
print(y_test[:10].values)
print("pred")
print(y_pred[:10])

In [None]:
# Actual vs. predicted amount_kg on the test set.
# The axis labels name amount_kg: this section predicts amount_kg, not the
# filling rate, so the labels copied from the filling_rate plot were wrong.
plt.figure(figsize=(5, 4))
sns.scatterplot(x=y_test, y=y_pred)
plt.title('決定係数: {}'.format(round(r2_score(y_test, y_pred), 2)))
plt.xlabel('amount_kg（正解値）')
plt.ylabel('amount_kg（予測値）')

# Plot a line representing perfect predictions
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', lw=2, linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# Split-count importance of every feature in the LightGBM model
feature_names = X_train.columns
feature_importances = model.feature_importance(importance_type='split')

# Pair names with importances, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart (very tall figure so that every feature gets a row)
fig, ax = plt.subplots(figsize=(5, 100))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('予測における重要度')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

In [None]:
# Reload the per-shop daily point history and rebuild the prepared frame from scratch.
df = (
    pd.read_csv('data/input/point_history_per_shop_date.csv', encoding='utf-8')
    .pipe(set_dtype)
    .pipe(replace_nan)
    .pipe(add_date_features)
)

# Cap filling rates above 1 at exactly 1.
over_full = df["filling_rate"] > 1
df.loc[over_full, "filling_rate"] = 1

# Lag features: same shop 28 days earlier, then one year earlier.
lag_features = ['amount_kg', 'filling_rate']
for lag_kwargs in ({'days': 28}, {'years': 1}):
    df = set_previous_data(df, lag_features, **lag_kwargs)

In [None]:
# Dates of the rows whose amount_kg exceeds 3000, counted per month-day (MM-DD)
heavy_dates = df.loc[df['amount_kg'] > 3000, '年月日']
heavy_dates.dt.strftime('%m-%d').value_counts()