In [1]:
# SDA Term Project
# 60201673 박건우, 60201674 박상재, 60191556 이재훈

# Import Module
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Font for Korean
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

# Data Load
data1 = 'seoulAirQuality.csv'
air = pd.read_csv(data1 , encoding = 'CP949')

In [2]:
# Data Preprocessing
import numpy as np

# Rename Columns
air.rename(columns = {"일시" : "date", "구분" : "district", "미세먼지(PM10)" : "PM10", "초미세먼지(PM25)" : "PM25"}, inplace = True)

# 'date' 열이 날짜 형식이 아니라면 변환
air['date'] = pd.to_datetime(air['date'])

# 'month'와 'day' 열이 없다면 'date' 열을 기반으로 생성
air['month'] = air['date'].apply(lambda x: x.month)
air['day'] = air['date'].apply(lambda x: x.day)

# 'month'와 'day' 열의 데이터 타입이 숫자가 아니거나 결측값이 있다면 변환 또는 처리
air[['month', 'day']] = air[['month', 'day']].apply(pd.to_numeric, errors='coerce')

# 'season' 열 생성
conditions_season = [
    (air['month'] >= 3) & (air['month'] <= 5),
    (air['month'] >= 6) & (air['month'] <= 8),
    (air['month'] >= 9) & (air['month'] <= 11),
    (air['month'] == 12) | (air['month'] == 1) | (air['month'] == 2)
]

choices_season = ['spring', 'summer', 'autumn', 'winter']

air['season'] = np.select(conditions_season, choices_season, default=np.nan)
air

Unnamed: 0,date,district,PM10,PM25,month,day,season
0,2021-12-31 23:00:00,평균,21.0,9.0,12,31,winter
1,2021-12-31 23:00:00,강남구,21.0,9.0,12,31,winter
2,2021-12-31 23:00:00,강동구,25.0,7.0,12,31,winter
3,2021-12-31 23:00:00,강북구,23.0,12.0,12,31,winter
4,2021-12-31 23:00:00,강서구,28.0,9.0,12,31,winter
...,...,...,...,...,...,...,...
456137,2020-01-01 00:00:00,용산구,12.0,13.0,1,1,winter
456138,2020-01-01 00:00:00,은평구,18.0,12.0,1,1,winter
456139,2020-01-01 00:00:00,종로구,20.0,15.0,1,1,winter
456140,2020-01-01 00:00:00,중구,22.0,18.0,1,1,winter


In [3]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 계절 범주를 더미 변수로 변환
df_dummies = pd.get_dummies(air['season'], dummy_na = True)

# 더미 변수들을 데이터프레임에 추가
air = pd.concat([air, df_dummies], axis=1)

# 모델 구축
# 기준 범주인 '가을'은 모델에 이미 포함되어 있으므로 명시하지 않음.
model = ols('PM10 ~ spring + summer + autumn + winter', data=air).fit()

# 결과 요약
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   PM10   R-squared:                       0.064
Model:                            OLS   Adj. R-squared:                  0.064
Method:                 Least Squares   F-statistic:                     7678.
Date:                Sun, 10 Dec 2023   Prob (F-statistic):               0.00
Time:                        16:07:39   Log-Likelihood:            -2.2141e+06
No. Observations:              446550   AIC:                         4.428e+06
Df Residuals:                  446545   BIC:                         4.428e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   8.334e+11   1.18e+12      0.704      0.4

In [4]:
# 겨울 - 봄 구간과 여름 - 가을 구간을 더미 변수로 재설정 후 regression 진행
air['snw'] = air['season'].apply(lambda x : 1 if x in ['spring', 'winter'] else 0)

# model 구축
model2 = ols('PM10 ~ snw', data = air).fit()

# 결과 요약
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                   PM10   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.057
Method:                 Least Squares   F-statistic:                 2.718e+04
Date:                Sun, 10 Dec 2023   Prob (F-statistic):               0.00
Time:                        16:07:39   Log-Likelihood:            -2.2157e+06
No. Observations:              446550   AIC:                         4.431e+06
Df Residuals:                  446548   BIC:                         4.431e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     28.2290      0.073    385.710      0.0

In [5]:
# PM10과 PM2.5의 Regression 분석
model3 = ols('PM10 ~ PM25', data = air).fit()
print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:                   PM10   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.553
Method:                 Least Squares   F-statistic:                 5.507e+05
Date:                Sun, 10 Dec 2023   Prob (F-statistic):               0.00
Time:                        16:07:39   Log-Likelihood:            -2.0395e+06
No. Observations:              444383   AIC:                         4.079e+06
Df Residuals:                  444381   BIC:                         4.079e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.1760      0.058     55.032      0.0