# **LGBM**

In [1]:
### 사용할 라이브러리
import pandas as pd ### 데이터 분석을 하기 위한 파이썬 라이브러리
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### 데이터 전처리

In [2]:
df=pd.read_csv("/content/drive/MyDrive/mosquito_final_air_data.csv")

In [3]:
df= df.drop(['날짜','요일','모기 활동 지수(수변부)','모기 발생 단계(수변부)','모기 발생 수준(수변부)','모기 발생 난이도(수변부)',
 '모기 활동 지수(주거지)','모기 발생 단계(주거지)','모기 발생 수준(주거지)','모기 발생 난이도(주거지)',
 '모기 활동 지수(공원)','모기 발생 단계(공원)','모기 발생 수준(공원)','모기 발생 난이도(공원)',
 '포집량','모기','기타','모기 발생 단계(서울시 평균)','모기 발생 수준(서울시 평균)',
 '모기 발생 난이도(서울시 평균)','지점','지점명','Unnamed: 0'],axis=1)

In [4]:
#컬럼명 수정
df.rename(columns={
 '평균 기온(℃)':'AT', #Average temperature
 '최저 기온(℃)':'LT', #Lowest temperature
 '최고 기온(℃)':'MT', #Maximum temperature
 '모기 활동 지수(서울시 평균)':'MAI', #Mosquito activity index
 '평균 풍속(m/s)':'AWS', #average wind speed
 '평균 전운량(1/10)':'AOE', #Average amount of electricity
 '평균 지면온도(°C)':'AGT', #average ground temperature
 '평균 5cm 지중온도(°C)':'5UT',#5cm average underground temperature
 '0.5m 지중온도(°C)':'0.5UT',#0.5 m underground temperature
 'Month':'Month',
 '일강수량(mm)':'DP', #daily precipitation
 '평균 상대습도(%)':'ARH', #Average relative humidity
 '평균 증기압(hPa)':'AVP',#average vapor pressure
 '평균 현지기압(hPa)':'MLAP', #mean local atmospheric pressure
 '평균 해면기압(hPa)':'ASSP', #average sea surface pressure
 '합계 일사량(MJ/m2)':'TSR', #total solar radiation
 '미세먼지(㎍/㎥)':'FD',#Fine dust
 '초미세먼지(㎍/㎥)':'UFD', #Ultra fine dust
 '이산화질소농도(ppm)':'NDC', #Nitrogen dioxide concentration
 '일산화탄소농도(ppm)':'CMC', #carbon monoxide concentration
 '아황산가스농도(ppm)':'SDC'#Sulfur dioxide concentration
 },inplace=True)


In [5]:
#기온의 값은 기상청에서 찾은 값으로 채움
df['LT'] = df['LT'].fillna(27.2)
df['MT'] = df['MT'].fillna(23.4)
#대기 데이터의 null값은 많지 않으므로 dropna
df = df.dropna()
df.isnull().sum()

Unnamed: 0,0
AT,0
LT,0
MT,0
MAI,0
AWS,0
AOE,0
AGT,0
5UT,0
0.5UT,0
Month,0


In [6]:
numerical_columns = ['AT', 'LT', 'MT', 'AWS', 'AOE','AGT', '5UT', '0.5UT', 'Month', 'DP',
 'ARH', 'AVP', 'MLAP', 'ASSP','TSR','FD', 'UFD','NDC','CMC','SDC']

#### 모델링

In [8]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

lgbm = LGBMRegressor()

feature_cols = ['AT', 'LT', 'MT', 'AWS', 'AOE',
 'AGT', '5UT', '0.5UT', 'Month', 'DP',
 'ARH', 'AVP', 'MLAP', 'ASSP',
 'TSR', 'FD', 'UFD', 'NDC', 'CMC', 'SDC']

target_col = ['MAI']

X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df[target_col], test_size=0.2, random_state=42)

X_test = X_test[feature_cols]

lgbm.fit(X_train, y_train)

# 테스트 데이터 예측
pred_test = lgbm.predict(X_test)

y_test = y_test.values.ravel()
pred_test = pred_test.ravel()

pred = lgbm.predict(X_test)

mae = mean_absolute_error(y_test, pred)
mae

### R square
print(lgbm.score(X_train, y_train)) # training set
print(lgbm.score(X_test, y_test)) # test set

# RMSE
from sklearn.metrics import mean_squared_error
from math import sqrt

# training set
pred_train = lgbm.predict(X_train)
print(sqrt(mean_squared_error(y_train, pred_train)))

# test set
print(sqrt(mean_squared_error(y_test, pred_test)))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3286
[LightGBM] [Info] Number of data points in the train set: 2012, number of used features: 20
[LightGBM] [Info] Start training from score 142.120080
0.9798477417182748
0.7766739453208051
33.521512897339136
107.47219125361858
