In [None]:
from itertools import islice
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
from tqdm import tqdm 
import sys
from datetime import datetime, timedelta, time
from scipy.optimize import curve_fit
from scipy import stats
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns

# Match the working directory used by the .py scripts: if the kernel was
# started inside a directory named "notebook", move up one level.
from pathlib import Path
launched_from_notebook_dir = Path.cwd().name == "notebook"
if launched_from_notebook_dir:
    os.chdir("..")

# Append the parent of the (possibly updated) working directory to sys.path
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

# Windows: use Meiryo as Matplotlib's default font
plt.rcParams['font.family'] = 'Meiryo'


# pandas display settings: allow up to 500 rows / columns before truncating
for display_option in ('display.max_rows', 'display.min_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# 自作モジュール
from utils.point_history_utils import open_point_history_per_shop, aggregate_date, replace_nan, set_dtype
from RS_filliing_rate.RS_fillingrate_test import plot_recycle_period, chi_squared_statistic, exp_func, power_law, KS_statistic, calc_recycle_period


In [None]:


# Add date-derived features
def add_date_features(df, date_col="年月日"):
    """Add the year and cyclical day/month encodings of a datetime column.

    The columns ``year``, ``day_sin``, ``day_cos``, ``month_sin`` and
    ``month_cos`` are written to ``df``. The frame is modified in place and
    the same object is returned, so both ``add_date_features(df)`` and
    ``df = add_date_features(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col``; that column must support the ``.dt``
        accessor (i.e. be datetime-like).
    date_col : str, default "年月日"
        Name of the datetime column the features are derived from.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the new columns added.
    """
    days_per_cycle = 31     # day-of-month is scaled against the longest month
    months_per_cycle = 12   # month-of-year cycle

    dates = df[date_col].dt
    day = dates.day
    month = dates.month
    df["year"] = dates.year

    day_angle = day / days_per_cycle * 2 * np.pi
    df["day_sin"] = np.sin(day_angle)
    df["day_cos"] = np.cos(day_angle)

    # The month must be scaled by 12, not 31: with a divisor of 31 the twelve
    # months only cover part of the circle, so December and January end up far
    # apart and the encoding is not cyclical.
    month_angle = month / months_per_cycle * 2 * np.pi
    df["month_sin"] = np.sin(month_angle)
    df["month_cos"] = np.cos(month_angle)

    # The previous implementation wrote scratch "day"/"month" columns and then
    # dropped them, which also removed any pre-existing columns of those
    # names. Keep that output schema for existing callers.
    df.drop(columns=["day", "month"], inplace=True, errors="ignore")

    return df

# Load the per-shop, per-date point history and run it through the project
# helpers (set_dtype, replace_nan) followed by the date feature step.
df = pd.read_csv('data/input/point_history_per_shop_date.csv', encoding='utf-8')
df = add_date_features(replace_nan(set_dtype(df)))

# Cap the target at 1: any filling_rate above 1 is replaced by 1
df["filling_rate"] = df["filling_rate"].clip(upper=1)

# Weekday name; must be derived before the date column is dropped below
df['day_of_week'] = df['年月日'].dt.day_name()

# Columns removed before modelling. Names missing from the frame are skipped
# because of errors='ignore'.
# NOTE(review): 'total_amount_kg_per_day' is listed twice and 'store_' looks
# like a truncated column name - confirm the intended entries.
columns_to_drop = [
    'series_id', 'shop_id', 'shop_name', 'shop_id_1', 'リサイクル分類ID', '支店ID',
    'store_opening_time', 'store_closing_time', 'rps_opening_time', 'rps_closing_time',
    '年月日', 'interval_compared_to_next',
    'amount', 'amount_kg', 'point', 'total_point', 'total_amount', 'coin',
    'interval_compared_to_previous', 'total_amount_kg_per_day',
    'store_latitude', 'store_longitude', 'total_amount_kg_per_day', 'store_',
]
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Display the first rows of df as a quick visual check
df.head()

In [None]:


# One-hot encode the categorical columns; get_dummies replaces each listed
# column with one indicator column per distinct value.
categorical_columns = ['prefectures', 'municipality','shop_name_1','super', '天気', 'day_of_week']
df = pd.get_dummies(df, columns=categorical_columns)

# Target vector and feature matrix (every column except the target)
y = df['filling_rate']
X = df.drop(columns='filling_rate')

In [None]:
# Display the first rows of df as a quick visual check
df.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a LightGBM regressor for a fixed number of boosting rounds
param = {'num_leaves': 31, 'objective': 'regression'}
num_round = 100
train_data = lgb.Dataset(X_train, label=y_train)
bst = lgb.train(param, train_data, num_round)

# Predict on the held-out rows.
# NOTE(review): no validation set or early stopping is configured above, so
# confirm what bst.best_iteration holds at this point.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Regression metrics on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

# Side-by-side look at the first ten actual and predicted values
print("actual")
print(y_test.iloc[:10].to_numpy())
print("pred")
print(y_pred[:10])

In [None]:
# Scatter of actual vs. predicted values on the held-out rows
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')

# Dashed red diagonal: where points would fall if predictions were perfect
actual_min, actual_max = y_test.min(), y_test.max()
ax.plot([actual_min, actual_max], [actual_min, actual_max], color='red', lw=2, linestyle='--')
plt.show()

In [None]:
# 1. Get the feature importances from the trained booster
#    (importance_type='split')
feature_names = X_train.columns
feature_importances = bst.feature_importance(importance_type='split')

# 2-3. Pair each feature name with its importance and sort from most to
#      least important
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# 4. Horizontal bar chart of the importances, drawn on a very tall figure
#    (10 x 100 inches); presumably sized for the many one-hot columns
fig, ax = plt.subplots(figsize=(10, 100))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()