In [1]:
%matplotlib inline
import csv
import urllib.request
import codecs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Load the raw flight records (the file is read as cp949) and preview the first rows.
origin = pd.read_csv('AFSNT.csv', encoding="cp949")
origin.head()

Unnamed: 0,SDT_YY,SDT_MM,SDT_DD,SDT_DY,ARP,ODP,FLO,FLT,REG,AOD,IRR,STT,ATT,DLY,DRR,CNL,CNR
0,2017,1,1,일,ARP1,ARP3,A,A1901,SEw3Nzc2,D,N,6:10,6:18,N,,N,
1,2017,1,1,일,ARP1,ARP3,A,A1905,SEw4MjM2,D,N,6:15,6:25,N,,N,
2,2017,1,1,일,ARP1,ARP3,L,L1751,SEw4MjM3,D,N,6:20,6:30,N,,N,
3,2017,1,1,일,ARP1,ARP3,F,F1201,SEw4MjA3,D,N,6:25,6:34,N,,N,
4,2017,1,1,일,ARP3,ARP1,A,A1900,SEw3NzAz,D,N,6:30,6:37,N,,N,


In [3]:
# Apart from the time and day-of-week preprocessing, do the remaining preprocessing right before running the model!

# Keep only the hour of each time value ==> this reduces the influence of the minutes.
origin['ATT'] = pd.to_datetime(origin['ATT'],format= '%H:%M').dt.hour
origin['STT'] = pd.to_datetime(origin['STT'],format= '%H:%M').dt.hour

In [4]:
# Give the date columns the names used as merge keys by the weather loaders below.
origin.rename(columns={'SDT_YY':'Year', 'SDT_MM':'Month', 'SDT_DD':'DAY'}, inplace=True)


# One-hot encode the day of week (categorical) and replace the original column with the dummies.
one_hot_dy = pd.get_dummies(origin['SDT_DY'])
origin = origin.drop(['SDT_DY'],axis = 1)
origin = origin.join(one_hot_dy)
# Map the Korean weekday labels to English column names. The "일" key used to be listed twice;
# each label now appears once. "SAT" keeps its existing upper-case spelling so the resulting
# column names are unchanged.
origin.rename(columns={"일":"Sun","월":"Mon","화":"Tue","수":"Wed","목":"Thu","금":"Fri","토":"SAT"},
              inplace=True)

In [5]:
# Inspect column dtypes and non-null counts after the renames and one-hot encoding.
origin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987709 entries, 0 to 987708
Data columns (total 23 columns):
Year     987709 non-null int64
Month    987709 non-null int64
DAY      987709 non-null int64
ARP      987709 non-null object
ODP      987709 non-null object
FLO      987709 non-null object
FLT      987709 non-null object
REG      987251 non-null object
AOD      987709 non-null object
IRR      987709 non-null object
STT      987709 non-null int64
ATT      979461 non-null float64
DLY      987709 non-null object
DRR      119917 non-null object
CNL      987709 non-null object
CNR      3018 non-null object
Fri      987709 non-null uint8
Thu      987709 non-null uint8
Wed      987709 non-null uint8
Mon      987709 non-null uint8
Sun      987709 non-null uint8
SAT      987709 non-null uint8
Tue      987709 non-null uint8
dtypes: float64(1), int64(4), object(11), uint8(7)
memory usage: 127.2+ MB


In [6]:
# Turn a one-character value such as 1 into "01" before it is concatenated into a date string
def changeDate(data):
    """Return ``data`` as a string, prefixed with "0" when it is exactly one character long."""
    text = str(data)
    return "0" + text if len(text) == 1 else text

In [7]:
# Download weather data for the non-military airports (aviation weather data)

def downloadAirport(yy,mm,area):
    """Download one month of airport weather records and return them as a DataFrame.

    Parameters
    ----------
    yy : int or str
        Year, appended to the URL as-is.
    mm : int or str
        Month; a one-character value is zero-padded with ``changeDate``.
    area : str
        Value sent as the ``icao`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The downloaded columns minus ``TM`` and the wind-direction / gust / RVR / cloud-layer
        columns, plus integer ``Year``, ``Month``, ``DAY`` and ``STT`` (hour) taken from ``TM``.

    Raises
    ------
    ValueError
        If the response contains no rows at all (not even a header).
    """
    mm=changeDate(mm)
    yy=str(yy)
    url='http://amoapi.kma.go.kr/amoApi/air_stcs?icao='+area+'&yyyymm='+yy+mm

    # Read the whole response inside `with` so the connection is closed even if parsing fails
    # (the response object used to be left open).
    with urllib.request.urlopen(url) as response:
        rows=list(csv.reader(codecs.iterdecode(response, 'utf-8')))

    if not rows:
        raise ValueError("No data returned for icao="+area+" yyyymm="+yy+mm)

    # First row is the header, the rest are records
    labels=rows[0]
    weather=pd.DataFrame.from_records(rows[1:],columns=labels)

    # TM is sliced positionally as YYYYMMDDHH
    tm=weather["TM"].astype("str")
    weather["Year"]=tm.str.slice(0,4)
    weather["Month"]=tm.str.slice(4,6)
    weather["DAY"]=tm.str.slice(6,8)
    # Hour 24 is folded to 0 so it can match the 0-23 hours of the flight data.
    # NOTE(review): the date is not advanced when 24 becomes 0 - confirm this is intended.
    weather['STT']=tm.str.slice(8,10).astype('int').replace(24,0)

    weather=weather.drop(columns=["TM",
               "WD","WS_GST","RVR1","RVR2","RVR3","RVR4","CLA_1LYR"
               ,"BASE_1LYR","CLF_1LYR","CLA_2LYR","BASE_2LYR","CLF_2LYR",
               "CLA_3LYR","BASE_3LYR","CLF_3LYR","CLA_4LYR","BASE_4LYR","CLF_4LYR"])

    # Unrecorded values become 0.
    # NOTE(review): csv.reader yields strings, so blank fields arrive as "" rather than NaN and
    # are not touched by fillna - confirm whether "" should also be replaced.
    weather=weather.fillna(0)
    weather["Year"]=weather["Year"].astype("int")
    weather["Month"]=weather["Month"].astype("int")
    weather["DAY"]=weather["DAY"].astype("int")

    return weather

In [8]:
# Load the weather-agency CSV used for the military airports
def downloadWeather(year):
    """Read ``data/<year>.csv`` (cp949) and return it with English column names and date parts.

    The "일시" column is split on whitespace into a date and a time; the date is split on "-"
    into ``Year`` / ``Month`` / ``DAY`` and the part of the time before ":" becomes ``STT``
    (all four as int). Unused measurement and QC-flag columns are dropped, the remaining
    Korean column names are renamed, and missing values are filled with 0.
    """
    csv_path = "data/" + str(year) + ".csv"
    weather = pd.read_csv(csv_path, encoding="cp949")

    # Separate the timestamp into its date and time halves, then the date into y/m/d
    stamp = weather["일시"].astype('str').str.split(expand=True)
    ymd = stamp[0].str.split("-", expand=True)
    weather = weather.assign(Year=ymd[0], Month=ymd[1], DAY=ymd[2], STT=stamp[1])

    # Remove the original timestamp together with the columns that are not used
    unused = ['일시',
              '지면온도(°C)',"지면온도 QC플래그","5cm 지중온도(°C)","10cm 지중온도(°C)","20cm 지중온도(°C)","기온 QC플래그",
              "강수량 QC플래그","풍속 QC플래그","풍향(16방위)","풍향 QC플래그","습도 QC플래그","현지기압 QC플래그",
              "해면기압 QC플래그","중하층운량(10분위)","운형(운형약어)","최저운고(100m )","지면상태(지면상태코드)",
              "적설(cm)","3시간신적설(cm)", "30cm 지중온도(°C)","일조(hr)","일조 QC플래그","일사(MJ/m2)"]
    english_names = {"지점" : "area", "기온(°C)":"temp" ,"강수량(mm)":"rain", "풍속(m/s)":"windSpeed",
                     "습도(%)":"hum","증기압(hPa)":"Vapor","이슬점온도(°C)":"dew","현지기압(hPa)":"hpa",
                     "해면기압(hPa)":"seeHpa","시정(10m)":"visible","전운량(10분위)":"cloudTotal",
                     "현상번호(국내식)":"weatherCode"}
    weather = weather.drop(columns=unused).rename(columns=english_names)

    # "HH:MM" -> integer hour
    weather["STT"] = weather["STT"].astype("str").str.split(":",expand=True)[0].astype("int")

    for part in ("Year", "Month", "DAY"):
        weather[part] = weather[part].astype("int")

    return weather.fillna(0)

In [9]:
# Non-military airports (joined with the downloaded airport weather data)
def mergeAirportData():
    """Join the flights of the six non-military airports with their monthly airport weather.

    For each airport present in ``origin`` and each month from 2017-01 through 2019-06, that
    month's flights are inner-merged with ``downloadAirport(year, month, icao)`` on
    Year / Month / DAY / STT. The pieces are concatenated, the weather columns are renamed to
    the names used by ``downloadWeather`` and the units are rescaled.

    Returns
    -------
    pandas.DataFrame
        The merged rows; an empty DataFrame when ``origin`` has no row for these airports.
    """
    icao_by_arp={"ARP1":"RKSS","ARP3":"RKPC","ARP5":"RKPU",
                 "ARP7":"RKJB","ARP9":"RKJY","ARP10":"RKNY"}
    temp=origin[origin.ARP.isin(list(icao_by_arp))]

    # Collect the monthly pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    frames=[]
    for arp in np.unique(temp["ARP"]):
        area=icao_by_arp[arp]
        df_areaD=temp[temp["ARP"]==arp]

        for year in (2017,2018,2019):
            # 2019 has no data from July onwards
            last_month=6 if year==2019 else 12
            for month in range(1,last_month+1):
                # Merge only the requested month's flights. df_date used to be computed but
                # unused, and the airport's full history was merged against every download.
                df_date=df_areaD[(df_areaD["Year"]==year) & (df_areaD["Month"]==month)]
                weather=downloadAirport(year,month,area)
                frames.append(pd.merge(df_date,weather,on=["Year","Month","DAY","STT"]))

    if not frames:
        return pd.DataFrame()
    df_all=pd.concat(frames)

    # Rename so the columns line up with the weather-agency data
    df_all=df_all.rename(columns={'WSPD':'windSpeed','VIS':"visible","TMP":"temp",
                      "TD":"dew",'PS':'hpa','PA':'seeHpa','RN':'rain','HM':'hum',
                        'CA_TOT':'cloudTotal','WC':"weatherCode"})

    # Match the units of the weather-agency data: these four values are divided by 10
    for column in ('temp','hpa','seeHpa','dew'):
        df_all[column]=df_all[column].astype("float")/10
    # 1852/3600 is the knots -> m/s factor; presumably WSPD is reported in knots - TODO confirm
    df_all['windSpeed']=df_all['windSpeed'].astype("float")*1852/3600

    return df_all

In [10]:
def mergeWeatherData():
    """Join the flights of the remaining airports with the yearly weather-station CSVs.

    For each year 2017-2019 the CSV is loaded once with ``downloadWeather``; for each airport
    present in ``origin`` that year's flights are inner-merged with the rows of the matching
    station (``area``) on DAY / STT / Year / Month.

    Returns
    -------
    pandas.DataFrame
        The concatenated merges; an empty DataFrame when ``origin`` has no row for these
        airports (this used to fail because ``df_all`` was never assigned).
    """
    # Airport code -> station number found in the `area` column of the weather CSV
    station_by_arp={"ARP2":159,"ARP4":143,"ARP6":131, "ARP8":156,
                    "ARP11":138,"ARP12":192, "ARP13":140,"ARP14":114,"ARP15":112}
    temp=origin[origin.ARP.isin(list(station_by_arp))]
    elements=np.unique(temp["ARP"])

    # Collect the pieces and concatenate once (DataFrame.append was removed in pandas 2.0)
    frames=[]
    for year in (2017,2018,2019):
        weather=downloadWeather(year)
        df_yearD=temp[temp["Year"]==year]
        for arp in elements:
            df_areaD=df_yearD[df_yearD["ARP"]==arp]
            weatherT=weather[weather["area"]==station_by_arp[arp]]
            frames.append(pd.merge(df_areaD,weatherT,on=["DAY","STT","Year","Month"]))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [11]:
# Build both weather-joined subsets.
df_airport=mergeAirportData()
df_nonAirport=mergeWeatherData()

# Stack them into one frame. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_weather=pd.concat([df_airport,df_nonAirport])

In [12]:
# Remove the 'area' and 'Vapor' columns from the combined frame (in place).
df_weather.drop(['area','Vapor'], axis = 1, inplace=True)

In [13]:
# Persist the merged flight + weather data without the index (to_csv default encoding).
df_weather.to_csv("weatherFinal.csv",index=False)

In [None]:
# weatherFinal.csv is written earlier in this notebook by DataFrame.to_csv with its default
# (UTF-8) encoding, so read it back with the matching default instead of cp949.
df = pd.read_csv('weatherFinal.csv')
df.head()

In [None]:
# Preprocessing code added here (with a single final file in mind)

## Additional preprocessing
# Keep only flights that are neither irregular (IRR == "Y") nor cancelled (CNL == "Y"), then
# drop in one pass: the cancellation columns (CNL, CNR), the columns that will probably not be
# used (REG, IRR) and the delay reason DRR (likely to be useful later).
df = df[(df.IRR != "Y") & (df.CNL != "Y")].drop(columns=['CNL', 'CNR', 'REG', 'IRR', 'DRR'])

############# Uncomment after the weather data has been added.
# Drop the weather features that are not used
# df.drop(columns=['rain', 'weatherCode'], axis=1, inplace=True)



In [None]:
# Discard the records whose ATT (actual time) is missing, then count the nulls left per column
df = df.dropna(subset=['ATT'])
df.isnull().sum()

In [None]:
# Rows whose ARP equals ODP are wrong data => keep only the rows where they differ
df = df[df['ARP'] != df['ODP']]


In [None]:
# Derived route feature: "<ARP>_<ODP>"
df['ARPODP'] = df['ARP'] + '_' + df['ODP']
df.head()

In [None]:
from datetime import datetime, date

# Diff = ATT - STT in seconds. `.dt.seconds` is always in [0, 86400), so an ATT earlier than
# STT wraps around to a value just below 86400 instead of becoming negative.
# NOTE(review): ATT/STT are parsed here with '%H:%M', but the hour-extraction cell near the top
# of this notebook already replaced both columns with hour numbers before weatherFinal.csv was
# written - confirm which form `df` really holds at this point.
df['Diff'] = (pd.to_datetime(df['ATT'],format= '%H:%M') - pd.to_datetime(df['STT'],format= '%H:%M')).dt.seconds.astype('int64')

# Sort by the STT/ATT gap, largest first
df = df.sort_values(by=['Diff'], ascending=False)

######################################################################## Departures
# Assume a delay of at most 5 hours --> a departure delayed by more than that is treated as wrong data
max_delay_hour = 5
max_delay = max_delay_hour * 3600 # seconds

# For departures, leaving even slightly early is considered wrong data
# (an early departure wraps to a Diff close to 86400, so the filter below removes it too).
# Departure rows with Diff greater than max_delay (18000 s) are discarded; arrivals pass through here
df = df[((df['Diff'] <= max_delay) & (df['AOD']=='D')) | (df['AOD']=='A')]

df.head(100)

######################################################################## Arrivals
# Arriving this much late is acceptable:
# an arrival may be up to 5 hours later than scheduled; anything beyond that is implausible
max_delay_hour_arr = 5
max_delay_arr = max_delay_hour_arr * 3600 # seconds

# Arriving a few minutes early is acceptable:
# up to 30 minutes earlier than scheduled is possible; earlier than that is implausible
min_delay = 30*60
min_delay = 86400 - min_delay  # early arrivals wrap to just below 86400 (24 h); keep only those at or above this value
# Keep all departures, arrivals delayed by at most max_delay_arr, and arrivals at most 30 minutes early
df = df[(df['AOD']=='D') |((df['Diff'] <= max_delay_arr) & (df['AOD']=='A')) | ((df['AOD']=='A')& (df['Diff'] >= min_delay )) ]
# Turn the wrapped values back into negative seconds (early arrival)
df.loc[df['Diff'] >=min_delay, 'Diff'] = df.loc[df['Diff'] >=min_delay, 'Diff']  - 86400
df.head(100)

In [None]:

# List the columns stored as int64 (the numerical features)
numerical_feature = [name for name, kind in df.dtypes.items() if kind == 'int64']
print(numerical_feature)

def dist_box(df, feature_list):
    """For every column in ``feature_list`` show a distribution plot, then a red box plot.

    The distribution plot uses only the non-null values of the column and is titled with the
    column name; the box plot is drawn from the whole column.
    """
    for feature in feature_list:
        present = df[feature].notnull()

        plt.figure(figsize=(15, 5))
        sns.distplot(df.loc[present, feature])
        plt.title(feature)
        plt.show()

        df[feature].plot(kind='box', color='red')
        plt.show()

"""
print('*'*50)
print('All')
dist_box(df, numerical_feature)
print('*'*50)
print('Arrive')
dist_box(df[df['AOD'] == 'A'], numerical_feature)
print('*'*50)
print('Departure')
dist_box(df[df['AOD'] == 'D'], numerical_feature)
"""

In [None]:
# Categorical data --> one hot encoding

def one_hot_dummies(df, *args):
    """Replace each column named in ``args`` with its one-hot (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified, a new frame is returned.
    *args : column labels
        Columns to encode, processed left to right.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the encoded columns and with the dummy columns appended. When the dummy
        names of a column collide with columns already present (e.g. ODP after ARP, which share
        the ARP1..ARP15 values), every dummy of that column gets a trailing "_" - the same
        names the previous hard-coded ARP rename produced.
    """
    for col in args:
        one_hot_col = pd.get_dummies(df[col])
        df = df.drop([col], axis = 1)

        # DataFrame.join refuses overlapping column names. Detect the clash explicitly instead
        # of catching every exception, and suffix whatever the labels are (not only ARP codes)
        # until the names are unique.
        while one_hot_col.columns.isin(df.columns).any():
            one_hot_col = one_hot_col.rename(columns=lambda name: str(name) + '_')

        df = df.join(one_hot_col)
    return df


# One-hot encode the airport, counterpart airport, airline and route columns.
df = one_hot_dummies(df, 'ARP', 'ODP', 'FLO', 'ARPODP')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale Month and STT with a MinMaxScaler fitted on the full frame.
scaler = MinMaxScaler()
df[['Month', 'STT']] = scaler.fit_transform(df[['Month', 'STT']])

In [None]:
# Target labelling

from sklearn import preprocessing
# Encode DLY as integer class labels.
# LabelEncoder works on a 1-D target, so pass the column as a Series and assign the result to
# that single column (a one-column DataFrame was previously used on both sides).
le = preprocessing.LabelEncoder()
df['DLY'] = le.fit_transform(df['DLY'])

In [None]:
# Split into arrival (A) and departure (D) data, dropping the AOD column from each split
df_A = df[df['AOD']=='A'].drop(columns=['AOD'])
df_D = df[df['AOD']=='D'].drop(columns=['AOD'])

df_A.head(5)

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
def makeTestSet(df):
    """Split ``df`` into train/test features and the ``DLY`` target (70/30, random_state=42).

    Prints the shape and the target value counts of both parts and returns
    ``[X_train, X_test, y_train, y_test]``.
    """
    target = df['DLY']
    features = df.drop(['DLY'], axis = 1)

    X_tr, X_t, y_tr, y_t = train_test_split(features, target, test_size= 0.3, random_state = 42)

    # Same report for both parts: header, shape, then class counts
    report = (("X_train set--------------------", X_tr, y_tr),
              ("X_test set info-----------------", X_t, y_t))
    for header, X_part, y_part in report:
        print(header)
        print("Shape:",X_part.shape)
        print("Target:")
        print(y_part.value_counts())
        print()

    return [X_tr, X_t, y_tr, y_t]

# Build separate train/test splits for the arrival and the departure data.
X_train_A, X_test_A, y_train_A, y_test_A = makeTestSet(df_A)
X_train_D, X_test_D, y_train_D, y_test_D = makeTestSet(df_D)

In [None]:
# --------------변수 중요도 확인하고 상위 OO개 남기기 -------------------

def feature_importance(X_train, y_train, X_test, y_test, threshold=0.1):
    """Rank features by |logistic-regression coefficient| and drop the weak ones in place.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; columns whose absolute coefficient is below ``threshold`` are removed
        from BOTH frames in place (callers rely on this side effect).
    y_train, y_test : array-like
        Targets.
    threshold : float, default 0.1
        Minimum absolute coefficient a feature needs to be kept.

    Returns
    -------
    None
    """
    #### To handle the skewed data, uncomment the next line!!
    #X_train, y_train = imbalance(X_train, y_train)

    log_rg = LogisticRegression().fit(X_train, y_train)
    # These scores used to be computed and thrown away; print them so the work is visible
    print("CV scores:", cross_val_score(log_rg, X_train, y_train, cv=5))
    print("Test score:", log_rg.score(X_test, y_test))
    print(classification_report(y_test, log_rg.predict(X_test)))

    # Feature importance = absolute coefficient, largest first
    fi = pd.DataFrame({'feature': X_train.columns.values,
                       'coef': abs(log_rg.coef_.ravel())})
    fi = fi.sort_values("coef", ascending=False).reset_index(drop=True)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        print(fi)

    # Drop every weak feature in a single call instead of one drop per feature
    weak = fi.loc[fi['coef'] < threshold, 'feature'].tolist()
    X_train.drop(columns=weak, inplace=True)
    X_test.drop(columns=weak, inplace=True)
    return

# Prune low-importance features from the arrival and departure splits (the frames are modified
# in place), then print the remaining column counts of each train/test frame.
feature_importance(X_train_A, y_train_A, X_test_A, y_test_A)
feature_importance(X_train_D, y_train_D, X_test_D, y_test_D)
print(len(X_train_A.columns))
print(len(X_test_A.columns))
print(len(X_train_D.columns))
print(len(X_test_D.columns))

#--------------------------------------------------------------------

In [None]:
def imbalance (X_train, y_train):
    """Oversample the training data with SMOTE and report the result.

    Prints the shapes of the resampled features and labels and the number of samples labelled
    1 and 0, then returns ``[X_resampled, y_resampled]``.
    """
    # Configure the sampler
    # NOTE(review): `ratio`, `kind` and `fit_sample` look like names from older imbalanced-learn
    # releases - confirm against the installed version before upgrading.
    sm = SMOTE(ratio='auto', kind='regular')

    # Resample (replicate) the training data
    X_resampled, y_resampled = sm.fit_sample(X_train,list(y_train))

    print('After OverSampling, the shape of train_X: {}'.format(X_resampled.shape))
    # Report the label array's own shape (the feature shape used to be printed here again)
    print('After OverSampling, the shape of train_y: {} \n'.format(y_resampled.shape))

    print("After OverSampling, counts of label '1': {}".format(sum(y_resampled==1)))
    print("After OverSampling, counts of label '0': {}".format(sum(y_resampled==0)))

    return [X_resampled, y_resampled]

In [None]:
# Draw the ROC curve
def plot_roc_curve(fpr, tpr):
    """Plot ``tpr`` against ``fpr`` in orange with a dashed dark-blue diagonal, then show it."""
    ax = plt.gca()
    ax.plot(fpr, tpr, color='orange', label='ROC')
    ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend()
    plt.show()

In [None]:
def confusion_matrix_heatmap(y_true, y_pred):
    """Draw the confusion matrix of ``y_true`` vs ``y_pred`` as an annotated blue heatmap.

    Rows are the actual classes and columns the predicted classes.
    """
    # Use the classes seen in either array so the axis labels always match the matrix shape,
    # including when a predicted class never occurs in y_true.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    data = confusion_matrix(y_true, y_pred, labels=labels)
    df_cm = pd.DataFrame(data, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (10,7))
    sns.set(font_scale=1.4)#for label size
    sns.heatmap(df_cm, cmap="Blues", annot=True,annot_kws={"size": 16})# font size

In [None]:
def runModel (X_train, y_train, X_test, y_test, base = True):
    """Oversample the training data, fit the selected model(s) and report hold-out results.

    The training data is first passed through ``imbalance``. With ``base`` true the model list
    holds one RandomForestClassifier (max_depth=6, 1000 trees, entropy); otherwise it holds one
    RandomForestClassifier (max_depth=4, 100 trees) - the other candidates are commented out.
    For each model this prints the test score, the AUC and the classification report and draws
    the ROC curve and the confusion-matrix heatmap. Returns None.
    """
    
    #X_train, y_train = underSampling(X_train, y_train)
    
     ## Comment this out to skip the skewed-data handling!!
    X_train, y_train = imbalance(X_train, y_train)
    
    models = []
    
    if base ==True:
        models.append(('RF', RandomForestClassifier(max_depth=6, n_estimators=1000, random_state=0, criterion='entropy')))
    else:
#         models.append(('LR', LogisticRegression()))
#         models.append(('LDA', LinearDiscriminantAnalysis()))
#         models.append(('KNN', KNeighborsClassifier()))
#         models.append(('CART', DecisionTreeClassifier()))
#         models.append(('NB', GaussianNB()))
        models.append(('RF', RandomForestClassifier(max_depth=4, n_estimators=100, random_state=0)))
#         models.append(('SVM', SVC(gamma='auto')))
#         models.append(('XGB', XGBClassifier()))

    
    # Evaluation (results, names, scoring and seed are only used by the commented-out K-Fold code)
    results = []
    names = []
    scoring = 'recall'

    seed = 7
    for name, model in models:
        # K-Fold
#         kfold = KFold(n_splits=10, random_state=seed)
#         cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
#         results.append(cv_results)
#         names.append(name)
#         msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
#         print(msg)

        # Hold out 
        model.fit(X_train, y_train)
        y_predict = pd.Series(model.predict(X_test))

        # Resets index to compare original test data with predicted data
        y_test = y_test.reset_index(drop=True)
        y_predict = y_predict.reset_index(drop=True)

#         plt.figure(figsize=(15, 5))
#         plt.scatter(range(y_test.shape[0]), y_test, c='gray')
#         plt.scatter(range(y_predict.shape[0]), y_predict , c='r')
#         diff = abs(y_test - y_predict)
#         plt.bar(range(diff.shape[0]), diff, color='gray')
#         plt.title('Result - Original comparsion')
#         plt.legend(['Original', 'Predict'])
#         plt.show()

        print(model.score(X_test, y_test))
        print('-' * 50)
        
        #--------ROC Curve-----------------
        # Column 1 of predict_proba is used as the score for the ROC/AUC computation
        probs = model.predict_proba(X_test)
        probs = probs[:, 1]
        auc = roc_auc_score(y_test, probs)
        print('AUC: %.2f' % auc)
        
        fpr, tpr, thresholds = roc_curve(y_test, probs)
        plot_roc_curve(fpr, tpr)
        #-----------------------------------
        
        #-------- Confusion matrix heatmap -----------------
        confusion_matrix_heatmap(y_test, y_predict)
        print(classification_report(y_test, y_predict))
        #-----------------------------------
        
             
#      # boxplot algorithm comparison
#     fig = plt.figure(figsize=(11,6))
#     fig.suptitle('Algorithm Comparison')
#     ax = fig.add_subplot(111)
#     plt.boxplot(results)
#     ax.set_xticklabels(names)
#     plt.show()
    
    return

In [None]:
# True: run only the base algorithm (base algorithm: Random Forest)
# False: intended to run all algorithms (in runModel the other models are currently commented
# out, so False runs a single shallower Random Forest).
runModel (X_train_D,y_train_D, X_test_D, y_test_D, True)

In [None]:
# True: run only the base algorithm (base algorithm: Random Forest)
# False: intended to run all algorithms (in runModel the other models are currently commented
# out, so False runs a single shallower Random Forest).
runModel (X_train_A,y_train_A, X_test_A, y_test_A, True)

In [None]:
# The data we have to predict
dly = pd.read_csv('AFSNT_DLY.CSV', encoding="cp949")
dly.head()


In [None]:
# Remove the columns that are not needed (FLT plus the DLY / DLY_RATE columns, which are
# assigned again from the model predictions further down)
dly.drop(columns=['FLT', 'DLY', 'DLY_RATE'], axis=1, inplace=True)

In [None]:
# Preview the frame after the column removal.
dly.head()

In [None]:
# Derived route feature: "<ARP>_<ODP>"
dly['ARPODP'] = dly['ARP'] + '_' + dly['ODP']
# Drop the minutes: keep only the hour of the scheduled time
dly['STT'] = pd.to_datetime(dly['STT'],format= '%H:%M').dt.hour

In [None]:
import urllib
import json
import pandas as pd
import dateutil.parser
#날씨데이터 json 읽어오는 함수

#나중에 9월 1일을 9월 16일로 바꾼 후 실행..
#(읽어오는거 확인용으로 9월 1일 부터...)
def readJSON(area):
    """Fetch the hourly forecast JSON for ``area`` from the Aeris forecasts endpoint.

    Parameters
    ----------
    area : str
        Place name inserted into the URL as ``<area>,korea``.

    Returns
    -------
    dict or str
        The decoded JSON document when its ``success`` field is truthy; otherwise the API's
        error description is printed and ``""`` is returned.
    """
    import os  # local import so this block does not depend on the notebook's import cells

    # NOTE(security): these fallback literals are credentials committed in the notebook. Prefer
    # setting AERIS_CLIENT_ID / AERIS_CLIENT_SECRET in the environment; the literals are kept
    # only so existing runs keep working and should be rotated and removed.
    client_id = os.environ.get('AERIS_CLIENT_ID', 'gHOhinKWCL1fwDUpI1Ec7')
    client_secret = os.environ.get('AERIS_CLIENT_SECRET', 'TRamBTyXpORXcmDKVqc2S4i4mnCjVxxHMt6cllui')
    url = ('https://api.aerisapi.com/forecasts/'+area+',korea?from=09/16/2019&format=json&filter=1hr&limit=999'
           '&client_id='+client_id+'&client_secret='+client_secret)

    # `with` closes the connection on every path; the old close() call sat after the return
    # statements and was never reached.
    with urllib.request.urlopen(url) as request:
        response = request.read()
    data= json.loads(response)

    if data['success']:
        return data

    print("An error occurred: %s" % (data['error']['description']))
    return ""

In [None]:
# Save the forecast weather to CSV
def storeWeather():
    """Download the forecast of every airport area and write it to ``newWeather.csv``.

    One row is written per forecast period with the columns SDT_YY, SDT_MM, SDT_DD, STT (hour),
    ARP, temp, hum, dew, windSpeed and hpa. Areas for which ``readJSON`` reports a failure are
    skipped (``readJSON`` has already printed the error).
    """
    columns=['SDT_YY', 'SDT_MM', 'SDT_DD','STT','ARP','temp', 'hum', 'dew', 'windSpeed','hpa']
    # The position in this list defines the airport code: area[i] -> "ARP"+str(i+1)
    area=["seoul","busan","jeju","daegu","ulsan","cheongju","muan","gwangju","yeosu"
          ,"yangyang","pohang","sacheon","gunsan","wonju","incheon"]

    # Collect plain rows and build the frame once; appending a Series per period re-copied the
    # whole frame each time and DataFrame.append no longer exists in pandas 2.0.
    records=[]
    for i, name in enumerate(area):
        d=readJSON(name)
        if not d:
            # readJSON returns "" on failure; indexing it used to raise a TypeError here
            continue

        for period in d['response'][0]['periods']:
            date=dateutil.parser.parse(period['dateTimeISO'])
            # NOTE(review): windSpeed is filled from 'windGustKTS' (gusts) - confirm that the
            # gust value, not the sustained wind speed, is what the model expects.
            records.append([date.year,date.month,date.day,date.hour,"ARP"+str(i+1),period['tempC'],
                            period['humidity'],period['dewpointC'],period['windGustKTS'],period['pressureMB']])

    futureweather=pd.DataFrame(records,columns=columns)

    # Match the units used by the fog model: knots -> m/s
    futureweather['windSpeed']=futureweather['windSpeed'].astype("float")*1852/3600

    futureweather.to_csv("newWeather.csv",index=False)

In [None]:
# Run this only when the csv file should be (re)written..
storeWeather() 

# Merge the weather data ---------------------

we=pd.read_csv('newWeather.csv', encoding="cp949")

# Inner join of forecast weather and flights on date, airport and hour
final= pd.merge(we, dly, on=['SDT_YY', 'SDT_MM', 'SDT_DD', 'ARP','STT'])

In [None]:
def predictFiveDays(df2):
    """Overwrite the weather of days 25-30 with a windowed per-hour average, in place.

    For every target day ``25+i`` (i = 0..5) and every hour 0..23, the ``temp``, ``hum``,
    ``dew`` and ``windSpeed`` values of the target rows are replaced by the mean of the rows
    whose ``SDT_DD`` lies in the five-day window ``target-3 .. target+1`` (the same day offsets
    as before) and whose ``STT`` equals that hour. Hours with no source or no target rows are
    left untouched. The frame is printed at the end; nothing is returned.

    NOTE(review): the window contains the target day itself and the following day, rows are not
    separated by airport, and ``hpa`` was summed but never written in the original code -
    confirm the intended window, grouping and columns.
    """
    columns = ['temp', 'hum', 'dew', 'windSpeed']

    for i in range(6):
        target_day = 25 + i
        window_days = [target_day - k + 1 for k in range(5)]
        in_window = df2['SDT_DD'].isin(window_days)
        on_target = df2['SDT_DD'] == target_day

        for j in range(24):
            at_hour = df2['STT'] == j
            target_rows = on_target & at_hour
            source = df2.loc[in_window & at_hour, columns]
            if source.empty or not target_rows.any():
                continue
            # Write through .loc: the previous df2[mask][col] = ... assigned to a temporary copy
            # and never changed df2, and the summed Series were misaligned by index.
            for column in columns:
                df2.loc[target_rows, column] = source[column].mean()

    print(df2)

In [None]:
# Remove the year (SDT_YY) and day (SDT_DD) columns
final.drop(columns=['SDT_YY', 'SDT_DD'], axis=1, inplace=True)

In [None]:
# "Left out for now" (original note; what was left out is not stated)
# One-hot encode the month, weekday, airports, airline, hour and route columns.
final = one_hot_dummies(final, 'SDT_MM', 'SDT_DY', 'ARP', 'ODP', 'FLO',  'STT','ARPODP')

In [None]:
# Show the column labels after one-hot encoding.
final.columns

In [None]:
from sklearn.preprocessing import RobustScaler
import pickle
from sklearn.externals import joblib

def fogModel(df):
    """Return, for every row of ``df``, the column-1 probability predicted by the saved fog model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding exactly the features the saved model expects; it must include the
        ``hum``, ``dew``, ``temp`` and ``windSpeed`` columns. It is not modified.

    Returns
    -------
    list
        One probability per row, taken from column 1 of ``predict_proba``.
    """
    # Missing weather values are replaced by 0. fillna returns a new frame, so the caller's
    # frame (a column subset of another DataFrame) is no longer mutated.
    features = df.fillna(0)

    # Apply the scaling technique used when the model was built.
    # NOTE(review): the scaler is re-fitted on the prediction data rather than loaded from
    # training - confirm this matches how the saved model was trained.
    scaled_columns = ['hum', 'dew','temp','windSpeed']
    features[scaled_columns] = RobustScaler().fit_transform(features[scaled_columns])

    # Load the stored model. joblib.load unpickles the file, so only load a trusted fogmodel.pkl.
    clf_from_joblib = joblib.load('fogmodel.pkl')

    fog_prob = clf_from_joblib.predict_proba(features)

    # Column 1 holds the probability that is stored as the fog value
    return list(fog_prob[:, 1])

In [None]:
# Pass only the fog-related columns of the frame to the function
fog = final[["temp","hum","dew","windSpeed"]]

fog_column = fogModel(fog)

In [None]:
# Add the fog column
final['fog'] = fog_column

# Remove the columns that were only needed as fog-model inputs
final.drop(columns=['hum', 'dew','temp','windSpeed'], axis=1, inplace=True)
final.head()

In [None]:
# Split into arrival (A) and departure (D) rows, dropping the AOD column from each split
final_A = final[final['AOD']=='A'].drop(columns=['AOD'])
final_D = final[final['AOD']=='D'].drop(columns=['AOD'])

In [None]:
# ------------------------------ #

In [None]:
########## There is no saved model yet

# df_A model -----------------------------------------------

# Load the arrival model
predict_dealy_A_joblib = joblib.load('predict_delay_A.pkl') 

# Predicted DLY labels and class probabilities for the arrival rows
dly_A = predict_dealy_A_joblib.predict(final_A)
dly_A_prob = predict_dealy_A_joblib.predict_proba(final_A)

# The delay rate is column 1 of each probability row
dly_rate_A = [row[1] for row in dly_A_prob]

# Add DLY and DLY_RATE to the DataFrame for the arrival rows
final.loc[final['AOD']=='A','DLY'] = dly_A
final.loc[final['AOD']=='A','DLY_RATE'] = dly_rate_A

# ---------------------------------------------------------

# ---------------------------------------------------------

In [None]:
# df_D model -----------------------------------------------

# Load the departure model
predict_dealy_D_joblib = joblib.load('predict_delay_D.pkl') 

# Predicted DLY labels for the departure rows
dly_D = predict_dealy_D_joblib.predict(final_D)

# Class probabilities from the DEPARTURE model. This line used to call the arrival model and
# store the result in dly_A_prob, so the loop below failed on an undefined dly_D_prob.
dly_D_prob = predict_dealy_D_joblib.predict_proba(final_D)

dly_rate_D = []

# Store the delay rate (column 1 of each probability row)
for i in dly_D_prob:
    dly_rate_D.append(i[1])
    
# Add DLY and DLY_RATE to the DataFrame for the departure rows
final.loc[final['AOD']=='D','DLY'] = dly_D
final.loc[final['AOD']=='D','DLY_RATE'] = dly_rate_D

# ---------------------------------------------------------

In [None]:
# Convert the labels from 1/0 to Y/N. Both masks are taken first; they select disjoint rows,
# so the outcome is the same as testing and replacing one label after the other.
delayed_rows, on_time_rows = final['DLY']==1, final['DLY']==0
final.loc[delayed_rows,'DLY'] = 'Y'
final.loc[on_time_rows,'DLY'] = 'N'