In [492]:
# 데이터 분석 및 정리용 패키지
import pandas as pd
import numpy as np
import random as rnd

# 데이터 시각화 패키지
import seaborn as sns
import matplotlib.pyplot as plt

# 머신러닝용 패키지
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# 데이터 불러오기

In [493]:
train_df = pd.read_csv('data\\train.csv')
test_df = pd.read_csv('data\\test.csv')
sub_df = pd.read_csv('data\\submission.csv')
combine = [train_df, test_df]

# 데이터 특징

In [494]:
print(train_df.columns.values)

['id' 'hour' 'hour_bef_temperature' 'hour_bef_precipitation'
 'hour_bef_windspeed' 'hour_bef_humidity' 'hour_bef_visibility'
 'hour_bef_ozone' 'hour_bef_pm10' 'hour_bef_pm2.5' 'count']


### 각 데이터 열

- id: 고유 id
- hour: 시간
- temperature: 기온
- precipitation: 비가 오지 않았으면 0, 비가 오면 1
- windspeed: 풍속(평균)
- humidity: 습도
- visibility: 시정(視程), 시계(視界)(특정 기상 상태에 따른 가시성을 의미)
- ozone: 오존
- pm10: 미세먼지(머리카락 굵기의 1/5에서 1/7 크기의 미세먼지)
- pm2.5: 미세먼지(머리카락 굵기의 1/20에서 1/30 크기의 미세먼지)
- count: 시간에 따른 따릉이 대여 수


# 데이터 형식

## 범주형 데이터(Categorical Data)

- Precipitation

## 수치형 데이터 (Numerical Data)
### 연속형

- hour, temperature, windspeed, humidity, visibility, ozone, pm10, pm2.5, count



In [495]:
# 데이터 요약
train_df.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,8,23,8.1,0.0,2.7,54.0,946.0,0.04,75.0,64.0,57.0
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0


# 데이터 분석


In [496]:
print('Train 데이터의 정보')
train_df.info()

Train 데이터의 정보
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1459 non-null   int64  
 1   hour                    1459 non-null   int64  
 2   hour_bef_temperature    1457 non-null   float64
 3   hour_bef_precipitation  1457 non-null   float64
 4   hour_bef_windspeed      1450 non-null   float64
 5   hour_bef_humidity       1457 non-null   float64
 6   hour_bef_visibility     1457 non-null   float64
 7   hour_bef_ozone          1383 non-null   float64
 8   hour_bef_pm10           1369 non-null   float64
 9   hour_bef_pm2.5          1342 non-null   float64
 10  count                   1459 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 125.5 KB


In [497]:
print('Test 데이터의 정보')
test_df.info()

Test 데이터의 정보
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      715 non-null    int64  
 1   hour                    715 non-null    int64  
 2   hour_bef_temperature    714 non-null    float64
 3   hour_bef_precipitation  714 non-null    float64
 4   hour_bef_windspeed      714 non-null    float64
 5   hour_bef_humidity       714 non-null    float64
 6   hour_bef_visibility     714 non-null    float64
 7   hour_bef_ozone          680 non-null    float64
 8   hour_bef_pm10           678 non-null    float64
 9   hour_bef_pm2.5          679 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 56.0 KB


In [498]:
train_df.describe()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
count,1459.0,1459.0,1457.0,1457.0,1450.0,1457.0,1457.0,1383.0,1369.0,1342.0,1459.0
mean,1105.914325,11.493489,16.717433,0.031572,2.479034,52.231297,1405.216884,0.039149,57.168736,30.327124,108.5634
std,631.338681,6.92279,5.23915,0.174917,1.378265,20.370387,583.131708,0.019509,31.771019,14.713252,82.631733
min,3.0,0.0,3.1,0.0,0.0,7.0,78.0,0.003,9.0,8.0,1.0
25%,555.5,5.5,12.8,0.0,1.4,36.0,879.0,0.0255,36.0,20.0,37.0
50%,1115.0,11.0,16.6,0.0,2.3,51.0,1577.0,0.039,51.0,26.0,96.0
75%,1651.0,17.5,20.1,0.0,3.4,69.0,1994.0,0.052,69.0,37.0,150.0
max,2179.0,23.0,30.0,1.0,8.0,99.0,2000.0,0.125,269.0,90.0,431.0


## Train 데이터의 각 열별 대여량 관련도

In [499]:
# 데이터간의 연관도 분석하는 테이블 도출 함수

def corr(value):
    return train_df[[value, 'count']].groupby([value], as_index=False).mean().sort_values(by='count', ascending=False)

In [500]:
col_list = ['hour', 'hour_bef_temperature', 'hour_bef_precipitation',
 'hour_bef_windspeed', 'hour_bef_humidity', 'hour_bef_visibility',
 'hour_bef_ozone', 'hour_bef_pm10', 'hour_bef_pm2.5']

In [501]:
for cols in col_list:
    display(corr(cols))

Unnamed: 0,hour,count
18,18,262.163934
19,19,201.606557
17,17,187.133333
16,16,169.1
21,21,168.816667
20,20,164.868852
15,15,152.967213
22,22,148.245902
8,8,136.688525
14,14,134.590164


Unnamed: 0,hour_bef_temperature,count
241,29.5,431.000000
239,29.3,380.000000
216,26.5,324.000000
238,29.2,324.000000
210,25.9,318.666667
...,...,...
32,8.0,9.500000
22,6.8,8.000000
3,4.0,8.000000
2,3.3,7.000000


Unnamed: 0,hour_bef_precipitation,count
0,0.0,111.130404
1,1.0,33.673913


Unnamed: 0,hour_bef_windspeed,count
66,7.0,280.000000
44,4.4,232.153846
56,5.6,217.400000
57,5.7,210.500000
41,4.1,201.750000
...,...,...
2,0.2,37.785714
7,0.7,37.100000
71,7.7,21.000000
0,0.0,9.000000


Unnamed: 0,hour_bef_humidity,count
0,7.0,431.000000
1,8.0,317.000000
2,9.0,271.000000
4,11.0,244.250000
12,19.0,202.000000
...,...,...
90,99.0,27.571429
3,10.0,10.000000
86,93.0,4.500000
89,98.0,3.000000


Unnamed: 0,hour_bef_visibility,count
536,1507.0,398.0
746,1962.0,337.0
224,769.0,324.0
176,671.0,316.0
495,1381.0,305.0
...,...,...
411,1196.0,2.0
1,91.0,1.0
27,250.0,1.0
8,152.0,1.0


Unnamed: 0,hour_bef_ozone,count
93,0.112,324.000000
81,0.084,319.000000
85,0.091,302.500000
94,0.125,300.000000
88,0.095,286.000000
...,...,...
6,0.009,56.888889
2,0.005,51.470588
1,0.004,49.950000
23,0.026,46.850000


Unnamed: 0,hour_bef_pm10,count
100,113.0,303.000000
133,192.0,236.000000
132,180.0,187.000000
8,20.0,172.857143
129,169.0,172.000000
...,...,...
116,141.0,8.000000
0,9.0,8.000000
105,118.0,7.000000
138,205.0,6.000000


Unnamed: 0,hour_bef_pm2.5,count
3,11.0,176.785714
45,53.0,165.142857
40,48.0,156.166667
6,14.0,154.896552
2,10.0,154.222222
...,...,...
63,71.0,32.666667
67,75.0,28.000000
57,65.0,26.500000
72,82.0,15.000000


# 데이터 전처리

In [502]:
for cols in col_list:
    for dataset in combine:
        dataset[cols] = dataset[cols].fillna(dataset[cols].mean())

In [503]:
train_df.isna().sum()

id                        0
hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
count                     0
dtype: int64

## 연속형 데이터 범주화

In [504]:
for dataset in combine:
    dataset.rename(columns={'hour_bef_pm2.5':'hour_bef_pm2'}, inplace=True)

In [505]:
col_list = ['hour', 'hour_bef_temperature', 'hour_bef_precipitation',
 'hour_bef_windspeed', 'hour_bef_humidity', 'hour_bef_visibility',
 'hour_bef_ozone', 'hour_bef_pm10', 'hour_bef_pm2']

In [506]:
for dataset in combine:
    dataset['hour_cut'] = pd.cut(dataset.hour, bins=[0, 5, 11, 17, 23], labels=['A', 'B', 'C', 'D'])
    dataset['hour_bef_temperature_cut'] = pd.cut(dataset.hour_bef_temperature, bins=[3.1,12.8,16.6,20,30], labels=['A', 'B', 'C', 'D'])
    dataset['hour_bef_windspeed_cut'] = pd.cut(dataset.hour_bef_windspeed, bins=[0,1.4,2.4,3.4,8], labels=['A', 'B', 'C', 'D'])
    dataset['hour_bef_humidity_cut'] = pd.cut(dataset.hour_bef_humidity, bins=[7,36,51,68,99], labels=['A', 'B', 'C', 'D'])
    dataset['hour_bef_visibility_cut'] = pd.cut(dataset.hour_bef_visibility, bins=[78,880,1576,1994,2000], labels=['A', 'B', 'C', 'D'])
    dataset['hour_bef_ozone_cut'] = pd.cut(dataset.hour_bef_ozone, bins=[0.003,0.026,0.039,0.051,0.125], labels=['A', 'B', 'C', 'D'])
    dataset['hour_bef_pm10_cut'] = pd.cut(dataset.hour_bef_pm10, bins=[9,37,53,68,269], labels=['A', 'B', 'C', 'D'])
    dataset['hour_bef_pm2_cut'] = pd.cut(dataset.hour_bef_pm2, bins=[8,21,28,36,90], labels=['A', 'B', 'C', 'D'])

In [507]:
train_df.isna().sum()

id                           0
hour                         0
hour_bef_temperature         0
hour_bef_precipitation       0
hour_bef_windspeed           0
hour_bef_humidity            0
hour_bef_visibility          0
hour_bef_ozone               0
hour_bef_pm10                0
hour_bef_pm2                 0
count                        0
hour_cut                    60
hour_bef_temperature_cut     1
hour_bef_windspeed_cut       3
hour_bef_humidity_cut        1
hour_bef_visibility_cut      1
hour_bef_ozone_cut           4
hour_bef_pm10_cut            1
hour_bef_pm2_cut             2
dtype: int64

## 최빈값으로 결측치 채우기

In [508]:
def freq(col):
    freq_port = train_df[col].dropna().mode()[0]
    return freq_port

def fill_freq(freq, col):
    for dataset in combine:
        dataset[col] = dataset[col].fillna(freq)

In [509]:
col_list_cut = ['hour_cut', 'hour_bef_temperature_cut',
 'hour_bef_windspeed_cut', 'hour_bef_humidity_cut', 'hour_bef_visibility_cut',
 'hour_bef_ozone_cut', 'hour_bef_pm10_cut', 'hour_bef_pm2_cut']

In [510]:
for cols in col_list_cut:
    for dataset in combine:
        fill_freq(freq(cols), cols)

In [511]:
train_df.isna().sum()

id                          0
hour                        0
hour_bef_temperature        0
hour_bef_precipitation      0
hour_bef_windspeed          0
hour_bef_humidity           0
hour_bef_visibility         0
hour_bef_ozone              0
hour_bef_pm10               0
hour_bef_pm2                0
count                       0
hour_cut                    0
hour_bef_temperature_cut    0
hour_bef_windspeed_cut      0
hour_bef_humidity_cut       0
hour_bef_visibility_cut     0
hour_bef_ozone_cut          0
hour_bef_pm10_cut           0
hour_bef_pm2_cut            0
dtype: int64

In [512]:
test_df.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2,hour_cut,hour_bef_temperature_cut,hour_bef_windspeed_cut,hour_bef_humidity_cut,hour_bef_visibility_cut,hour_bef_ozone_cut,hour_bef_pm10_cut,hour_bef_pm2_cut
0,0,7,20.7,0.0,1.3,62.0,954.0,0.041,44.0,27.0,B,D,A,C,B,C,B,B
1,1,17,30.0,0.0,5.4,33.0,1590.0,0.061,49.0,36.0,C,D,D,A,C,D,B,C
2,2,13,19.0,1.0,2.1,95.0,193.0,0.02,36.0,28.0,C,C,B,D,A,A,A,B
3,4,6,22.5,0.0,2.5,60.0,1185.0,0.027,52.0,38.0,B,D,C,C,B,B,B,D
4,5,22,14.6,1.0,3.4,93.0,218.0,0.041,18.0,15.0,D,B,C,D,A,C,A,A


In [513]:
for dataset in combine:
    dataset['hour_cut'] = dataset['hour_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)
    dataset['hour_bef_temperature_cut'] = dataset['hour_bef_temperature_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)
    dataset['hour_bef_windspeed_cut'] = dataset['hour_bef_windspeed_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)
    dataset['hour_bef_humidity_cut'] = dataset['hour_bef_humidity_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)
    dataset['hour_bef_visibility_cut'] = dataset['hour_bef_visibility_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)
    dataset['hour_bef_ozone_cut'] = dataset['hour_bef_ozone_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)
    dataset['hour_bef_pm10_cut'] = dataset['hour_bef_pm10_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)
    dataset['hour_bef_pm2_cut'] = dataset['hour_bef_pm2_cut'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3} ).astype(int)

In [514]:
train_df.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2,count,hour_cut,hour_bef_temperature_cut,hour_bef_windspeed_cut,hour_bef_humidity_cut,hour_bef_visibility_cut,hour_bef_ozone_cut,hour_bef_pm10_cut,hour_bef_pm2_cut
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0,3,1,1,3,0,1,3,2
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0,2,3,0,1,1,2,3,3
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0,1,1,0,3,1,1,0,0
3,8,23,8.1,0.0,2.7,54.0,946.0,0.04,75.0,64.0,57.0,3,0,2,2,1,2,3,3
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0,3,3,3,0,3,3,0,0


In [515]:
train_df = train_df.drop(['id'], axis = 1)

train_df = train_df.drop(['hour', 'hour_bef_temperature', 'hour_bef_precipitation',
   'hour_bef_windspeed', 'hour_bef_humidity', 'hour_bef_visibility',
   'hour_bef_ozone', 'hour_bef_pm10', 'hour_bef_pm2'], axis=1)
test_df = test_df.drop(['hour', 'hour_bef_temperature', 'hour_bef_precipitation',
   'hour_bef_windspeed', 'hour_bef_humidity', 'hour_bef_visibility',
   'hour_bef_ozone', 'hour_bef_pm10', 'hour_bef_pm2'], axis=1)

In [516]:
train_df.head()

Unnamed: 0,count,hour_cut,hour_bef_temperature_cut,hour_bef_windspeed_cut,hour_bef_humidity_cut,hour_bef_visibility_cut,hour_bef_ozone_cut,hour_bef_pm10_cut,hour_bef_pm2_cut
0,49.0,3,1,1,3,0,1,3,2
1,159.0,2,3,0,1,1,2,3,3
2,26.0,1,1,0,3,1,1,0,0
3,57.0,3,0,2,2,1,2,3,3
4,431.0,3,3,3,0,3,3,0,0


# 모델 생성 및 예측

* Logistic Regression
* KNN or k-Nearest Neighbors
* Support Vector Machines
* Naive Bayes classifier
* Decision Tree
* Random Forrest
* Perceptron
* Artificial neural network
* RVM or Relevance Vector Machine

In [517]:
X_train = train_df.drop("count", axis=1)
Y_train = train_df["count"]
X_test  = test_df.drop("id", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((1459, 8), (1459,), (715, 8))

In [518]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
print(acc_log)

8.36


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [519]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
print(acc_svc)

10.28


In [520]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
print(acc_knn)

32.9


In [521]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
print(acc_gaussian)

9.73


In [522]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
print(acc_perceptron)

1.71


In [523]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
print(acc_linear_svc)

7.2




In [524]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
print(acc_sgd)

2.06


In [525]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
print(acc_decision_tree)

77.24


In [526]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(acc_random_forest)

77.24


In [527]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,77.24
8,Decision Tree,77.24
1,KNN,32.9
0,Support Vector Machines,10.28
4,Naive Bayes,9.73
2,Logistic Regression,8.36
7,Linear SVC,7.2
6,Stochastic Gradient Decent,2.06
5,Perceptron,1.71


In [528]:
"""
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
print(acc_knn)
"""

'\nknn = KNeighborsClassifier(n_neighbors = 3)\nknn.fit(X_train, Y_train)\nY_pred = knn.predict(X_test)\nacc_knn = round(knn.score(X_train, Y_train) * 100, 2)\nprint(acc_knn)\n'

In [529]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
print(acc_decision_tree)

77.24


In [530]:
submission = pd.DataFrame({
        "id": test_df["id"],
        "count": Y_pred
    })

In [531]:
submission.to_csv('output\\submission.csv', index=False)