In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate , StratifiedKFold
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#Data set 준비 ,train set, test set, submission <- 제출해야일 파일
mypath = '/content/drive/MyDrive/titanic/'
train = pd.read_csv(mypath + 'train.csv')
test = pd.read_csv(mypath + 'test.csv')
submission = pd.read_csv(mypath + 'gender_submission.csv')


train['TrainSplit']='Train'
test['TrainSplit']='Test'
data = pd.concat([train, test], axis=0)

data_all = data.reindex( columns = ['Pclass', 'Age', 'SibSp','Parch','Fare',
                                    'Survived'])

selected_features = ['Pclass','Age','SibSp','Parch','Fare']
train = data_all.loc[data['TrainSplit'] == 'Train',selected_features]
target = data_all.loc[data['TrainSplit'] == 'Train','Survived']
test = data_all.loc[data['TrainSplit'] == 'Test',selected_features]

train_x, val_x, target_y, val_target_y = \
tts(train, target, test_size = 0.2 , shuffle=True)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
#sex int화 
data.loc[data['Sex'] == 'female','Sex'] = 0
data.loc[data['Sex'] == 'male','Sex'] = 1
data['Sex'] = data['Sex'].astype(int)

#fare 결측값 
fare_mean = data.loc[data['Pclass']==3,'Fare'].mean()
data['Fare']=data['Fare'].fillna(fare_mean)
data.loc[152]

#name-title 분류 
title_name = data['Name'].str.split(",", expand=True)[1]
title = title_name.str.split(".",expand=True)[0]
#name to title 
title = title.replace([' Ms'],'Miss')
title = title.replace([' Mlle',' the Countess',' Lady',' Don',
                       ' Dona',' Jonkheer',' Mme',' Sir'],'Noble')
title = title.replace([' Col',' Major',' Capt'],'Officer')
title = title.replace([' Dr',' Rev'],'Priest')
title = title.replace([' Miss'],'Miss')
title = title.replace([' Mr'],'Mr')
title = title.replace([' Mrs'],'Mrs')
title = title.replace([' Master'],'Master')

data['Title'] = np.array(title)
data['Title'].value_counts()

data = data.drop('Name',axis=1)


#embarked 결측값
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

#cabin 삭제
data = data.drop('Cabin',axis=1)

#Age 결측값
for title in data['Title'].unique():
  midage = data.loc[data['Title']==title, 'Age'].median()
  data.loc[data['Title']==title, 'Age'] = \
  data.loc[data['Title']==title, 'Age'].fillna(midage)
#Agebin 생성 : age범위값을 str
bins = [0,4,8,12,16,32,36,48,56,64,100]
labels = ['Infant', 'Child1','Child2','Youth1','Youth2','Adult1','Adult2',
          'Middle Aged','Senior','Elderly']
data['Agebin'] = pd.cut(data['Age'], bins = bins, labels = labels)

#ticket 전처리
data["Ticket"] = data["Ticket"].str.replace('.','').str.replace('/','')
data["Ticket"] = data["Ticket"].str.strip().str.split(' ').str[0]
data["Ticket"].value_counts()
#숫자는 -> NUM으로 
data.loc[data["Ticket"].str.isdigit(),'Ticket'] = 'NUM'
data["Ticket"].value_counts()[:10]


NUM        957
PC          92
CA          68
A5          28
SOTONOQ     24
WC          15
SCPARIS     14
STONO       14
A4          10
FCC          9
Name: Ticket, dtype: int64

In [3]:
#라벨링 
from sklearn.preprocessing import LabelEncoder as lee
for col in ['Title', 'Agebin']:
  le = lee()
  data[col] = le.fit_transform(data[col])

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,TrainSplit,Title,Agebin
0,1,0.0,3,1,22.0,1,0,A5,7.25,S,Train,2,9
1,2,1.0,1,0,38.0,1,0,PC,71.2833,C,Train,3,1
2,3,1.0,3,0,26.0,0,0,STONO2,7.925,S,Train,1,9
3,4,1.0,1,0,35.0,1,0,NUM,53.1,S,Train,3,0
4,5,0.0,3,1,35.0,0,0,NUM,8.05,S,Train,2,0


In [4]:
onehot = []
for col in ['Embarked','Ticket']:
  data[col]= data[col].astype('category')
  data = pd.get_dummies(data, columns = [col], prefix = col[:3],drop_first= True)
  onehot.append(col[:3])
  
scaler = MinMaxScaler()
scaled_cols = [col for col in data.loc[:,'Pclass':].columns if col!='TrainSplit']

data_scaled = data.loc[:,scaled_cols]
data_scaled = scaler.fit_transform(data_scaled)

data.loc[:,scaled_cols] = data.loc[:,:]
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,TrainSplit,Title,Agebin,Emb_Q,Emb_S,Tic_A4,Tic_A5,Tic_AQ3,Tic_AQ4,Tic_AS,Tic_C,Tic_CA,Tic_CASOTON,Tic_FC,Tic_FCC,Tic_Fa,Tic_LINE,Tic_LP,Tic_NUM,Tic_PC,Tic_PP,Tic_PPP,Tic_SC,Tic_SCA3,Tic_SCA4,Tic_SCAH,Tic_SCOW,Tic_SCPARIS,Tic_SCParis,Tic_SOC,Tic_SOP,Tic_SOPP,Tic_SOTONO2,Tic_SOTONOQ,Tic_SP,Tic_STONO,Tic_STONO2,Tic_STONOQ,Tic_SWPP,Tic_WC,Tic_WEP
0,1,0.0,3,1,22.0,1,0,7.25,Train,2,9,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1.0,1,0,38.0,1,0,71.2833,Train,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1.0,3,0,26.0,0,0,7.925,Train,1,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,1.0,1,0,35.0,1,0,53.1,Train,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0.0,3,1,35.0,0,0,8.05,Train,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
selected_feature =[]
for col in data:
  if (data[col] != data(['PassengerId','Survived','Age','TrainSplit'])):
    selected_feature.append(col)
selected_feature.shape

TypeError: ignored

In [None]:
data[col] != data(['PassengerId','Survived','Age','TrainSplit'])

TypeError: ignored