## Max depth를 지정하지 않고 분석

In [1]:
# 연산 처리를 위한 패키지
import numpy as np
import pandas as pd

# 데이터 분석을 위한 패키지
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# 시각화를 위한 패키지
from matplotlib import pyplot as plt
import seaborn as sns
import graphviz

# 그래프를 실제로 그리기 위한 설정
%matplotlib inline

# 경고 메세지 무시
#import warnings
#warnings.filterwarnings('ignore')

In [2]:
# Attach Google Drive to the Colab runtime so files stored there can be read.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [4]:
# Read the crawled CSV from the mounted Drive folder into a DataFrame.
crawled_csv_path = "/content/drive/MyDrive/빅데이터응용/팀 프로젝트/크롤링/final_crawled_api_data.csv"
df_cloudSea = pd.read_csv(crawled_csv_path)

In [5]:
# Remove the columns listed below and show what remains.
column_to_drop = ['title', 'link', 'upload_to', 'mountain', '위치']
df_cloudSea = df_cloudSea.drop(column_to_drop, axis='columns')
df_cloudSea

Unnamed: 0,운해 여부,전날 기온,전날 강수량,전날 습도,전날 기압,일출 기온,일출 풍속,일출 습도,일출 기압,일출 전운량,일교차
0,1.0,5.8,0.0,84.0,1001.4,3.2,0.0,93.0,1003.0,0.0,-2.6
1,1.0,8.9,0.0,91.0,1012.2,9.3,0.8,92.0,1012.4,7.0,0.4
2,1.0,12.1,0.0,87.0,1001.3,10.1,2.4,91.0,1002.7,7.0,-2.0
3,1.0,10.0,0.0,83.0,998.3,7.9,2.5,87.0,998.9,0.0,-2.1
4,1.0,1.7,0.0,100.0,987.3,-0.3,0.0,100.0,989.1,10.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...
802,0.0,-10.8,0.0,89.0,941.9,-13.1,0.3,87.0,941.8,0.0,-2.3
803,0.0,-4.5,0.0,61.0,1008.7,-7.0,0.6,72.0,1008.7,0.0,-2.5
804,0.0,8.5,0.0,76.0,1015.7,7.7,0.3,85.0,1015.7,0.0,-0.8
805,0.0,-5.0,0.0,73.0,1009.7,-5.8,0.0,83.0,1009.7,0.0,-0.8


In [6]:
# Collect every row that has at least one missing value and print it for inspection.
has_missing = df_cloudSea.isna().any(axis='columns')
nan_rows = df_cloudSea.loc[has_missing]
print(nan_rows)

     운해 여부  전날 기온  전날 강수량  전날 습도   전날 기압  일출 기온  일출 풍속  일출 습도   일출 기압  일출 전운량  \
246    1.0  -10.9     0.0   90.0  1003.7  -12.7    NaN   88.0  1004.5     0.0   
474    0.0   -6.5     0.0    NaN  1010.5   -9.7    0.7    NaN  1011.7     0.0   
806    NaN    NaN     NaN    NaN     NaN    NaN    NaN    NaN     NaN     NaN   

     일교차  
246 -1.8  
474 -3.2  
806  NaN  


In [7]:
# Remove every row that contains a missing value (rows 246, 474 and 806 in
# the data inspected above). Using dropna() instead of dropping hard-coded
# index labels keeps this cell re-runnable -- drop() raises KeyError once a
# label is already gone -- and keeps it correct if the source CSV changes.
df_cloudSea = df_cloudSea.dropna()

In [8]:
# Show the frame again to check the result of removing the rows with missing values.
df_cloudSea

Unnamed: 0,운해 여부,전날 기온,전날 강수량,전날 습도,전날 기압,일출 기온,일출 풍속,일출 습도,일출 기압,일출 전운량,일교차
0,1.0,5.8,0.0,84.0,1001.4,3.2,0.0,93.0,1003.0,0.0,-2.6
1,1.0,8.9,0.0,91.0,1012.2,9.3,0.8,92.0,1012.4,7.0,0.4
2,1.0,12.1,0.0,87.0,1001.3,10.1,2.4,91.0,1002.7,7.0,-2.0
3,1.0,10.0,0.0,83.0,998.3,7.9,2.5,87.0,998.9,0.0,-2.1
4,1.0,1.7,0.0,100.0,987.3,-0.3,0.0,100.0,989.1,10.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...
801,1.0,24.0,0.0,90.0,990.1,24.1,0.0,90.0,989.8,0.0,0.1
802,0.0,-10.8,0.0,89.0,941.9,-13.1,0.3,87.0,941.8,0.0,-2.3
803,0.0,-4.5,0.0,61.0,1008.7,-7.0,0.6,72.0,1008.7,0.0,-2.5
804,0.0,8.5,0.0,76.0,1015.7,7.7,0.3,85.0,1015.7,0.0,-0.8


In [9]:
# Column headers as they appear in the data (Korean) and the English header
# that replaces each one. The two lists are matched by position.
korean_column_names = [
    '운해 여부', '전날 기온', '전날 강수량', '전날 습도', '전날 기압',
    '일출 기온', '일출 풍속', '일출 습도', '일출 기압', '일출 전운량', '일교차',
]
english_column_names = [
    'isCloudSea', 'Previous Temperature', 'Previous Precipitation', 'Previous Humidity', 'Previous Pressure',
    'Sunrise Temperature', 'Sunrise Wind Speed', 'Sunrise Humidity', 'Sunrise Pressure', 'Sunrise Cloud Coverage',
    'Diurnal Temperature Range',
]

# zip() silently stops at the shorter list, which would leave some columns
# unrenamed without any error, so fail loudly if the lists get out of sync.
if len(korean_column_names) != len(english_column_names):
    raise ValueError('korean_column_names and english_column_names must have the same length')

# Korean header -> English header, in the order listed above.
new_column_names = dict(zip(korean_column_names, english_column_names))

print(new_column_names)

{'운해 여부': 'isCloudSea', '전날 기온': 'Previous Temperature', '전날 강수량': 'Previous Precipitation', '전날 습도': 'Previous Humidity', '전날 기압': 'Previous Pressure', '일출 기온': 'Sunrise Temperature', '일출 풍속': 'Sunrise Wind Speed', '일출 습도': 'Sunrise Humidity', '일출 기압': 'Sunrise Pressure', '일출 전운량': 'Sunrise Cloud Coverage', '일교차': 'Diurnal Temperature Range'}


In [10]:
# Apply the Korean -> English header mapping, then show the renamed frame.
df_cloudSea = df_cloudSea.rename(new_column_names, axis='columns')
df_cloudSea

Unnamed: 0,isCloudSea,Previous Temperature,Previous Precipitation,Previous Humidity,Previous Pressure,Sunrise Temperature,Sunrise Wind Speed,Sunrise Humidity,Sunrise Pressure,Sunrise Cloud Coverage,Diurnal Temperature Range
0,1.0,5.8,0.0,84.0,1001.4,3.2,0.0,93.0,1003.0,0.0,-2.6
1,1.0,8.9,0.0,91.0,1012.2,9.3,0.8,92.0,1012.4,7.0,0.4
2,1.0,12.1,0.0,87.0,1001.3,10.1,2.4,91.0,1002.7,7.0,-2.0
3,1.0,10.0,0.0,83.0,998.3,7.9,2.5,87.0,998.9,0.0,-2.1
4,1.0,1.7,0.0,100.0,987.3,-0.3,0.0,100.0,989.1,10.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...
801,1.0,24.0,0.0,90.0,990.1,24.1,0.0,90.0,989.8,0.0,0.1
802,0.0,-10.8,0.0,89.0,941.9,-13.1,0.3,87.0,941.8,0.0,-2.3
803,0.0,-4.5,0.0,61.0,1008.7,-7.0,0.6,72.0,1008.7,0.0,-2.5
804,0.0,8.5,0.0,76.0,1015.7,7.7,0.3,85.0,1015.7,0.0,-0.8


In [14]:
# Write the current frame to 'df_cloudSea.csv' (relative path, so it lands in
# the working directory). `index` is left at its default, so the row labels
# are written as the first column of the file.
df_cloudSea.to_csv('df_cloudSea.csv')

In [11]:
from sklearn.model_selection import train_test_split

# Every column except the label is used as a feature. Index.difference()
# returns the names sorted by default, so this also fixes the feature order.
target_column = 'isCloudSea'
feature_columns = df_cloudSea.columns.difference([target_column])

X = df_cloudSea[feature_columns]
y = df_cloudSea[target_column]

# Split the data into a training set and a held-out set (30% held out).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=777,
)

# max_depth is deliberately left unset; random_state is fixed so that
# repeated runs give the same result.
dt = tree.DecisionTreeClassifier(random_state=777)
dt.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split and report the fraction of correct predictions.
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.6900826446280992


In [13]:
# `call` is no longer used here but stays imported in case a later cell relies on it.
from subprocess import call, run

from IPython.display import Image

feature_names = list(X_train.columns)

# export_graphviz expects class_names in ascending order of the class labels,
# which is the order of dt.classes_. Building the captions from the fitted
# tree replaces the previous 'Survived 0'/'Survived 1' labels (apparently
# left over from a different example) with ones that name this notebook's
# target column, isCloudSea.
class_names = [f'isCloudSea {int(label)}' for label in dt.classes_]

tree.export_graphviz(dt, out_file='tree_no_pruning.dot', filled=True, feature_names=feature_names,
                     impurity=True, rounded=True, class_names=class_names)

# Convert the .dot file to PNG. check=True raises CalledProcessError when
# Graphviz exits with an error, so the cell stops instead of going on to
# display a stale or missing image.
run(['dot', '-Tpng', 'tree_no_pruning.dot', '-o', 'tree_no_pruning.png', '-Gdpi=200'], check=True)

# Display the rendered tree in the notebook.
Image(filename='tree_no_pruning.png')

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Most important features: bar chart of the fitted tree's importance score per feature.
importances = dt.feature_importances_
sns.barplot(x=importances, y=feature_names)