In [2]:
import pandas as pd

# Load the training split of the airline dataset from the local data directory.
train = pd.read_csv('./data/airline_dataset/train.csv')

# Report the number of columns (shape[1]) and rows (shape[0]) of the loaded frame.
print(f'train set은 {train.shape[1]} 개의 feature를 가진 {train.shape[0]} 개의 데이터 샘플로 이루어져 있습니다.')

# Display the first rows as the cell output.
train.head()

train set은 24 개의 feature를 가진 3000 개의 데이터 샘플로 이루어져 있습니다.


Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,target
0,1,Female,disloyal Customer,22,Business travel,Eco,1599,3,0,3,...,4,5,4,4,4,5,4,0,0.0,0
1,2,Female,Loyal Customer,37,Business travel,Business,2810,2,4,4,...,5,5,4,2,1,5,2,18,18.0,0
2,3,Male,Loyal Customer,46,Business travel,Business,2622,1,1,1,...,4,4,4,4,5,4,3,0,0.0,1
3,4,Female,disloyal Customer,24,Business travel,Eco,2348,3,3,3,...,3,2,4,5,3,4,3,10,2.0,0
4,5,Female,Loyal Customer,58,Business travel,Business,105,3,3,3,...,4,4,4,4,4,4,5,0,0.0,1


In [3]:
def check_missing_col(dataframe):
    """Report the columns of ``dataframe`` that contain missing values.

    For every column with at least one NaN/None entry, the column name and
    its number of missing entries are printed. If no column has missing
    values, a single "nothing missing" message is printed instead.

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        A list of ``[column_name, dtype]`` pairs, one per column that has
        missing values, in column order. Empty when nothing is missing.
    """
    missing_col = []
    # isna().sum() counts the missing entries of every column in one
    # vectorized pass instead of summing booleans one by one in Python.
    missing_counts = dataframe.isna().sum()
    for col, count, dtype in zip(dataframe.columns, missing_counts, dataframe.dtypes):
        missing_values = int(count)
        if missing_values >= 1:
            print(f'결측치가 있는 컬럼은: {col} 입니다')
            print(f'해당 컬럼에 총 {missing_values} 개의 결측치가 존재합니다.')
            missing_col.append([col, dtype])
    if not missing_col:
        print('결측치가 존재하지 않습니다')
    return missing_col

# Scan the training frame for missing values; the result holds [column, dtype] pairs.
missing_col = check_missing_col(train)

결측치가 존재하지 않습니다


In [4]:
# Features: every column except the row identifier and the label.
train_x = train.drop(columns=["id", "target"])
# Label: the "target" column, kept as a separate Series.
train_y = train["target"]

# Build the dictionaries used for label encoding
def make_label_map(dataframe):
    """Create an integer code table for every object-dtype column.

    Each table starts with the reserved entry ``'unknown' -> 0``; the
    distinct values of the column are then numbered 1, 2, 3, ... in the
    order ``Series.unique()`` returns them.

    Args:
        dataframe: pandas DataFrame whose object-dtype columns are encoded.

    Returns:
        ``{column_name: {value: code}}`` for the object-dtype columns only.
    """
    label_maps = {}
    object_cols = (name for name in dataframe.columns if dataframe[name].dtype == 'object')
    for name in object_cols:
        codes = {'unknown': 0}
        # Distinct values receive consecutive codes starting at 1.
        codes.update(
            (value, code)
            for code, value in enumerate(dataframe[name].unique(), start=1)
        )
        label_maps[name] = codes
    return label_maps

# Replace the values of every categorical column with their integer codes
def label_encoder(dataframe, label_map):
    """Label-encode the object-dtype columns of ``dataframe``.

    Every object-dtype column is translated through its table in
    ``label_map``. Values that have no entry in the table (categories not
    present when the table was built, or missing values) receive the
    table's ``'unknown'`` code. Encoded columns are always ``int64``, also
    when some values had to be filled with the ``'unknown'`` code.

    The input frame is left untouched; the encoding is applied to a copy.

    Args:
        dataframe: pandas DataFrame to encode.
        label_map: ``{column_name: {value: code}}``; each inner dict must
            contain the key ``'unknown'``.

    Returns:
        A new DataFrame with the object-dtype columns replaced by codes.

    Raises:
        KeyError: if an object-dtype column has no table in ``label_map``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    encoded = dataframe.copy()
    for col in encoded.columns:
        if encoded[col].dtype == 'object':
            codes = label_map[col]
            encoded[col] = (
                encoded[col]
                .map(codes)
                # map() yields NaN for values without a code; use the reserved one.
                .fillna(codes['unknown'])
                # NaN forces a float column; cast back so the dtype is uniform.
                .astype('int64')
            )
    return encoded

# Label-encode the training data
label_map = make_label_map(train_x) # build the label maps from the training features
train_x = label_encoder(train_x, label_map) # encode the training features with those maps

train_x.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,1,22,1,1,1599,3,0,3,3,...,4,4,5,4,4,4,5,4,0,0.0
1,1,2,37,1,2,2810,2,4,4,4,...,3,5,5,4,2,1,5,2,18,18.0
2,2,2,46,1,2,2622,1,1,1,1,...,5,4,4,4,4,5,4,3,0,0.0
3,1,1,24,1,1,2348,3,3,3,3,...,3,3,2,4,5,3,4,3,10,2.0
4,1,2,58,1,2,105,3,3,3,3,...,5,4,4,4,4,4,4,5,0,0.0


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Columns that are rescaled; all other columns are left as they are.
num_features = ['Age','Flight Distance','Departure Delay in Minutes','Arrival Delay in Minutes']

# The scaler is fitted on the training data here; the same fitted object is
# reused with transform() on the test data later in the notebook.
scaler = MinMaxScaler()
train_x[num_features] = scaler.fit_transform(train_x[num_features]) 
train_x.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,1,0.205479,1,1,0.226501,3,0,3,3,...,4,4,5,4,4,4,5,4,0.0,0.0
1,1,2,0.410959,1,2,0.403807,2,4,4,4,...,3,5,5,4,2,1,5,2,0.015957,0.016143
2,2,2,0.534247,1,2,0.376281,1,1,1,1,...,5,4,4,4,4,5,4,3,0.0,0.0
3,1,1,0.232877,1,1,0.336164,3,3,3,3,...,3,3,2,4,5,3,4,3,0.008865,0.001794
4,1,2,0.69863,1,2,0.00776,3,3,3,3,...,5,4,4,4,4,4,4,5,0.0,0.0


In [6]:
import numpy as np

# Seed NumPy's global RNG so the random weight initialisation in
# LogisticRegression.fit (np.random.rand) is repeatable.
np.random.seed(1)

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model has no separate intercept term: the predicted probability is
    ``sigmoid(X @ weights)``.

    Attributes set by ``fit``:
        weights: 1-D array with one coefficient per column of ``X``.
        loss: list holding the mean cross-entropy after each epoch.
    """

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Evaluated as ``exp(-log(1 + exp(-z)))`` via ``np.logaddexp`` so that
        large negative ``z`` does not overflow inside ``exp``.
        """
        z = np.asarray(z, dtype=float)
        return np.exp(-np.logaddexp(0.0, -z))

    def loss_function(self, X, y, weights):
        """Return the mean binary cross-entropy of ``weights`` on ``(X, y)``.

        The loss is computed directly from the logits ``z = X @ weights``
        using ``-log(sigmoid(z)) == logaddexp(0, -z)`` and
        ``-log(1 - sigmoid(z)) == logaddexp(0, z)``. This avoids ``log(0)``
        (and the resulting ``inf``/``nan``) when the sigmoid saturates.

        Args:
            X: array-like of shape (n_samples, n_features).
            y: array-like of 0/1 labels, length n_samples.
            weights: 1-D array of length n_features.
        """
        z = np.dot(np.asarray(X, dtype=float), weights)
        y = np.asarray(y, dtype=float)
        per_sample = y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z)
        return per_sample.mean()

    def fit(self, X, y, epochs=500, lr=0.01):
        """Learn the weights by gradient descent on the cross-entropy loss.

        Weights start from ``np.random.rand`` (NumPy's global RNG), so seed
        it beforehand for repeatable results.

        Args:
            X: array-like of shape (n_samples, n_features); numeric.
            y: array-like of 0/1 labels, length n_samples.
            epochs: number of full-batch update steps.
            lr: learning rate (step size).

        Stores the result in ``self.weights`` and the per-epoch loss
        history in ``self.loss``.
        """
        # Convert once so DataFrame/Series inputs are handled positionally
        # and every epoch works on plain arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        total = len(X)

        loss = []
        weights = np.random.rand(X.shape[1])

        for _ in range(epochs):
            y_hat = self.sigmoid(np.dot(X, weights))
            # Gradient of the mean cross-entropy: X^T (y_hat - y) / n.
            weights = weights - (lr * np.dot(X.T, y_hat - y) / total)
            loss.append(self.loss_function(X, y, weights))

        self.weights = weights
        self.loss = loss

    def predict(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        Requires ``fit`` to have been called (reads ``self.weights``).
        """
        z = np.dot(np.asarray(X, dtype=float), self.weights)
        return self.sigmoid(z)

    def to_bin(self, result, threshold=0.5):
        """Convert probabilities to a list of 0/1 class labels.

        A value strictly greater than ``threshold`` becomes 1, anything
        else becomes 0.

        Args:
            result: iterable of probabilities.
            threshold: decision threshold; defaults to 0.5.
        """
        return [1 if prob > threshold else 0 for prob in result]

In [7]:
# Train the model on the preprocessed training features with the default
# fit() settings (epochs=500, lr=0.01).
lr = LogisticRegression()
lr.fit(train_x,train_y)

In [10]:
# Load the test split and drop the identifier column, as was done for training.
# NOTE(review): the remaining columns are assumed to be in the same order as
# train_x, since the model multiplies them positionally - confirm.
test = pd.read_csv('./data/airline_dataset/test.csv')
test = test.drop(["id"],axis=1) 

test = label_encoder(test, label_map) # label-encode the test data with the maps built from train

test[num_features] = scaler.transform(test[num_features]) # scale the test data with the scaler fitted on train

In [11]:
# Predicted probabilities for the test rows, then 0/1 labels (1 when the
# probability is greater than 0.5).
pred = lr.predict(test)
pred_bin = lr.to_bin(pred)

In [14]:
# Load the submission template shipped with the dataset.
sample_submission = pd.read_csv('./data/airline_dataset/sample_submission.csv')

# Use item assignment: attribute assignment (`frame.target = ...`) only sets a
# Python attribute, not a column, when the frame has no `target` column yet.
sample_submission['target'] = pred_bin
# Write the predictions without the index column.
sample_submission.to_csv("submission_baseline.csv",index=False)