# 機械学習をPythonで実践する-19　　～ 特徴量エンジニアリング ～

In [1]:
%load_ext autoreload
%autoreload 2
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder, LabelEncoder, OneHotEncoder
# # import statsmodels.api as sma
from sklearn.model_selection import train_test_split ,cross_val_score, KFold, RepeatedKFold,StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, log_loss, confusion_matrix,ConfusionMatrixDisplay, \
accuracy_score, precision_score, recall_score,precision_recall_curve,f1_score,roc_curve,auc,get_scorer_names,roc_auc_score
from sklearn.compose import ColumnTransformer
# from sklearn import tree
# from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from lightGBM_cv import lightGBM_classifier_cv_func
from category_encoders import TargetEncoder

%matplotlib inline
import matplotlib.pyplot as plt


## 日付データからの特徴量生成
下記のKaggleのデータを使ってやってみる。  
https://www.kaggle.com/competitions/bike-sharing-demand/data

### - Pandasの場合

In [17]:
# Load the bike-sharing CSV into a pandas DataFrame and preview the first rows.
df = pd.read_csv('../Python/sample_data/ML_sample/bike_share.csv')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [18]:
# Check the dtype of each column, in particular `datetime`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


datetime列は文字列になっている。

In [19]:
# Convert the `datetime` column from strings to pandas datetime values.
df['datetime'] = pd.to_datetime(df['datetime'])

In [20]:
# Re-inspect the column dtypes after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB


datetimeに変換された。

In [21]:
# Derive calendar features from the `datetime` column via the pandas `.dt` accessor.

# Calendar year
df['year'] = df['datetime'].dt.year

# Month
df['month'] = df['datetime'].dt.month

# Day of the year
df['dayofyear'] = df['datetime'].dt.dayofyear

# Day of the week (0 = Monday ... 6 = Sunday)
df['dayofweek'] = df['datetime'].dt.dayofweek

# Week number of the year, taken from the ISO calendar
df['weekofyear'] = df['datetime'].dt.isocalendar().week

# Quarter of the year
df['quarter'] = df['datetime'].dt.quarter

# Whether the date falls in a leap year
df['is_leap'] = df['datetime'].dt.is_leap_year

In [23]:
# Preview the frame with the newly added date-derived columns.
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,dayofyear,dayofweek,weekofyear,quater,is_leap
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,5,52,1,False
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,5,52,1,False
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,5,52,1,False
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,5,52,1,False
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,5,52,1,False


2011/1/1にもかかわらず、weekofyearが52週目となっているが、これはisocalendar()がISO 8601の週番号（月曜始まりで、その年最初の木曜日を含む週を第1週とする）を返すため。  
この日は土曜日なので、まだ前年（2010年）の第52週としてカウントされている。

### - Polarsの場合

In [2]:
# Reload the same CSV with Polars (rebinding `df`) and preview the first rows.
df = pl.read_csv('../Python/sample_data/ML_sample/bike_share.csv')
df.head()

datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
str,i64,i64,i64,i64,f64,f64,i64,f64,i64,i64,i64
"""2011-01-01 00:…",1,0,0,1,9.84,14.395,81,0.0,3,13,16
"""2011-01-01 01:…",1,0,0,1,9.02,13.635,80,0.0,8,32,40
"""2011-01-01 02:…",1,0,0,1,9.02,13.635,80,0.0,5,27,32
"""2011-01-01 03:…",1,0,0,1,9.84,14.395,75,0.0,3,10,13
"""2011-01-01 04:…",1,0,0,1,9.84,14.395,75,0.0,0,1,1


strとして読み込まれてしまっているので、datetimeに変換する必要あり。  
（事前に型が分かっているのであれば、pl.read_csvの段階でdtypesを指定してしまうのが楽かも。）

In [3]:
# Parse the string `datetime` column into a Polars Datetime column.
# NOTE(review): `df` is rebound here, so re-running this cell would presumably
# apply `.str` to an already-parsed column — confirm before re-executing.
df = df.with_columns(pl.col('datetime').str.strptime(pl.Datetime))
df.head()

datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime[μs],i64,i64,i64,i64,f64,f64,i64,f64,i64,i64,i64
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [5]:
# Add date-derived feature columns in a single `with_columns` call and preview
# the result. The result is only displayed; `df` itself is not reassigned.
df.with_columns([
    # Calendar year
    pl.col('datetime').dt.year().alias('year'),
    # Month
    pl.col('datetime').dt.month().alias('month'),
    # Day of the year
    pl.col('datetime').dt.ordinal_day().alias('day_of_year'),
    # Day of the week (1 = Monday ... 7 = Sunday); note this differs from pandas
    pl.col('datetime').dt.weekday().alias('day_of_week'),
    # Week number of the year
    pl.col('datetime').dt.week().alias('week_of_year'),
    # Quarter of the year
    pl.col('datetime').dt.quarter().alias('quarter'),
    # Whether the date falls in a leap year
    pl.col('datetime').dt.is_leap_year().alias('is_leap'),
]).head()

datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day_of_year,day_of_week,week_of_year,quarter,is_leap
datetime[μs],i64,i64,i64,i64,f64,f64,i64,f64,i64,i64,i64,i32,u32,u32,u32,u32,u32,bool
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,6,52,1,False
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,6,52,1,False
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,6,52,1,False
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,6,52,1,False
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,6,52,1,False


Polarsの方がまとめて処理を書けるので楽かも。曜日の番号はpandasのdayofweekが0(月)～6(日)、Polarsのweekday()が1(月)～7(日)と異なるので注意。