In [12]:
import pandas as pd
import numpy as np

In [13]:
# Load the raw table, using the first CSV column as the index, then
# discard every row that contains at least one missing value.
df = pd.read_csv('all_data.csv', index_col = 0)
df = df.dropna()

# Converting all features to binary

In [14]:
# Preview the first three rows before any column has been encoded.
df.head(3)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied


In [15]:
def encode_categ(df, col, value1, rename_to = None):
    """Binarise a categorical column: 1 where it equals ``value1``, else 0.

    The encoding is applied to a copy, so the frame passed in is never
    modified. A new DataFrame is returned whether or not a rename is
    requested.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : hashable
        Label of the column to encode.
    value1 : object
        Value that maps to 1; every other value maps to 0.
    rename_to : hashable, optional
        New label for the encoded column. When ``None`` the column keeps
        its original label.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``col`` replaced by its 0/1 encoding (and
        renamed if ``rename_to`` was given). Column order is preserved.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    # Work on a copy: assigning into the caller's frame would silently
    # overwrite their data (and can trigger SettingWithCopyWarning on slices).
    encoded = df.copy()
    encoded[col] = np.where(encoded[col] == value1, 1, 0)
    if rename_to is not None:
        encoded = encoded.rename(columns = {col: rename_to})
    return encoded

def encode_num(df, col, th, rename_to = None):
    """Binarise a numeric column: 1 where the value is strictly above ``th``.

    The encoding is applied to a copy, so the frame passed in is never
    modified. A new DataFrame is returned whether or not a rename is
    requested.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : hashable
        Label of the column to encode.
    th : number
        Threshold; values ``> th`` map to 1, all other values map to 0
        (a value equal to ``th`` maps to 0).
    rename_to : hashable, optional
        New label for the encoded column. When ``None`` the column keeps
        its original label.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``col`` replaced by its 0/1 encoding (and
        renamed if ``rename_to`` was given). Column order is preserved.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    # Work on a copy: assigning into the caller's frame would silently
    # overwrite their data (and can trigger SettingWithCopyWarning on slices).
    encoded = df.copy()
    encoded[col] = np.where(encoded[col] > th, 1, 0)
    if rename_to is not None:
        encoded = encoded.rename(columns = {col: rename_to})
    return encoded

In [16]:
# Survey rating columns; each one is binarised below with a threshold of 3.
sat_cols = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

# Categorical columns become 1 for the named value and 0 otherwise; numeric
# columns become 1 when strictly above the threshold. Each step also renames
# the column to describe what a 1 means.
df = (
    df
    .pipe(encode_categ, 'Gender', 'Male', 'Male')
    .pipe(encode_categ, 'Customer Type', 'Loyal Customer', 'Loyal')
    .pipe(encode_categ, 'Type of Travel', 'Business travel', 'Business travel')
    .pipe(encode_categ, 'Class', 'Business', 'Business class')
    .pipe(encode_categ, 'satisfaction', 'satisfied', 'target')
    .pipe(encode_num, 'Age', 17, '18+')
    .pipe(encode_num, 'Flight Distance', 1000, '1000+')
    .pipe(encode_num, 'Departure Delay in Minutes', 0, 'Departure delay')
    .pipe(encode_num, 'Arrival Delay in Minutes', 0, 'Arrival delay')
)

# Rating columns keep their names: 1 for a rating above 3, else 0.
for col in sat_cols:
    df = df.pipe(encode_num, col, 3)

In [17]:
# Preview the first three rows again, now that the columns have been encoded.
df.head(3)

Unnamed: 0,id,Male,Loyal,18+,Business travel,Business class,1000+,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure delay,Arrival delay,target
0,70172,1,1,0,0,0,0,0,1,0,...,1,1,0,1,1,1,1,1,1,0
1,5047,1,0,1,1,1,0,0,0,0,...,0,0,1,0,0,1,0,1,1,0
2,110028,0,1,1,1,1,1,0,0,0,...,1,1,0,1,1,1,1,0,0,1


# Train and test subsamples

In [18]:
# Random seed passed as `random_state` to every sampling call below.
seed = 42

In [19]:
# Balanced training set: draw 1000 rows from each target class. The two
# draws are made independently, each with the same random_state.
train0, train1 = (
    df.loc[df['target'].eq(label)].sample(n = 1000, random_state = seed)
    for label in (0, 1)
)

# Stack the two classes, shuffle the rows, and renumber the index from zero.
train = (
    pd.concat([train0, train1])
    .sample(frac = 1, random_state = seed)
    .reset_index(drop = True)
)

In [20]:
# Test set: 500 rows sampled from the records whose id does not appear in train.
test = (
    df.loc[~df['id'].isin(train['id'])]
    .sample(n = 500, random_state = seed)
)

In [21]:
# The id column was only needed to keep train and test disjoint; remove it
# from both frames before they are written out.
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

In [22]:
# Write both subsamples to disk without the index column.
train.to_csv(path_or_buf = 'train.csv', index = False)
test.to_csv(path_or_buf = 'test.csv', index = False)