In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
pd.set_option('display.max_columns', None)

In [2]:
def load(path="/Users/huseyinefkanalp/Desktop/DataScience/Miuul/FeatureEngineering/datasets/titanic.csv"):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the original author's
        absolute local path to the Titanic dataset; pass your own location
        when running the notebook on another machine.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(path)

In [3]:
# Load the dataset into `df`, then display its (rows, columns) shape.
df = load()
df.shape

(891, 12)

In [19]:
# Helper that works out which columns are categorical and which are numeric.
def grab_col_names(df, cat_th=10, car_th=20):
    """
    Return the names of the categorical, numeric, and categorical-but-cardinal
    columns of a DataFrame, and print a short summary of the split.

    Note: numeric-looking columns with few distinct values are included in the
    categorical list.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame whose column names are to be classified.
    cat_th : int, optional
        A non-label column with fewer than ``cat_th`` distinct values is
        treated as categorical.
    car_th : int, optional
        A label column with more than ``car_th`` distinct values is treated
        as cardinal (too many classes to be a useful category).

    Returns
    -------
    cat_cols : list
        Categorical columns: label columns that are not cardinal, followed by
        the low-cardinality non-label columns.
    num_cols : list
        Non-label columns that were not reclassified as categorical.
    cat_but_car : list
        Label columns with more than ``car_th`` distinct values.
    """

    def _holds_labels(dtype):
        # object, pandas string and category dtypes all store labels rather
        # than measurements; checking only for object would send string- and
        # category-typed columns to the numeric list.
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    # Compute each column's kind and distinct-value count once and reuse them.
    is_label = {col: _holds_labels(df[col].dtype) for col in df.columns}
    n_unique = {col: df[col].nunique() for col in df.columns}

    # cat_cols, cat_but_car
    label_cols = [col for col in df.columns if is_label[col]]
    num_but_cat = [col for col in df.columns if not is_label[col] and n_unique[col] < cat_th]
    cat_but_car = [col for col in label_cols if n_unique[col] > car_th]
    cat_cols = [col for col in label_cols + num_but_cat if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in df.columns if not is_label[col] and col not in num_but_cat]

    print(f"Observations: {df.shape[0]}")
    print(f"Variables: {df.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")
    return cat_cols, num_cols, cat_but_car

In [4]:
# Convert every column name to upper case.
df.columns = df.columns.map(str.upper)

## Feature Engineering (Değişken Mühendisliği)

In [6]:
# Cabin flag: 1 when a cabin value is present, 0 when it is missing.
has_cabin = df['CABIN'].notna()
df['NEW_CABIN_BOOLE'] = has_cabin.astype('int')

In [None]:
# Name length: number of characters in each NAME value.
name_lengths = df['NAME'].str.len()
df['NEW_NAME_COUNT'] = name_lengths

In [7]:
# Name word count: number of pieces obtained by splitting NAME on single spaces.
name_as_series = df['NAME']
df['NEW_NAME_WORD_COUNT'] = name_as_series.map(
    lambda full_name: len(str(full_name).split(' '))
)

In [10]:
# Doctor title: number of "Dr." title tokens in each name (0 when there are none).
# The whole token is compared because a bare 'Dr' prefix test would also count
# surnames such as "Drew". str() keeps a non-string value from raising, in line
# with the word-count feature.
df['NEW_NAME_DR'] = df['NAME'].apply(
    lambda name: sum(word == 'Dr.' for word in str(name).split())
)

In [12]:
# Title (Mr, Mrs, Miss, ...): the run of letters that follows a space and ends
# with a period. The pattern is a raw string so that `\.` reaches the regex
# engine as written instead of being an invalid Python string escape.
df['NEW_TITLE'] = df['NAME'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [13]:
# Family size: SIBSP plus PARCH plus one (the passenger themselves).
df['NEW_FAMILY_SIZE'] = df['SIBSP'].add(df['PARCH']).add(1)

In [14]:
# Age x Pclass interaction: element-wise product of AGE and PCLASS.
df['NEW_AGE_PCLASS'] = df['AGE'].mul(df['PCLASS'])

In [15]:
# Is alone: 'NO' when SIBSP + PARCH is positive, 'Yes' when it is exactly zero.
relatives_aboard = df['SIBSP'] + df['PARCH']
df.loc[relatives_aboard.gt(0), 'NEW_IS_ALONE'] = 'NO'
df.loc[relatives_aboard.eq(0), 'NEW_IS_ALONE'] = 'Yes'

In [16]:
# Age level: under 18 -> 'young', 18 up to (not including) 56 -> 'mature',
# 56 and over -> 'senior'. Rows matching none of the conditions are not assigned.
age = df['AGE']
df.loc[age.lt(18), 'NEW_AGE_CAT'] = 'young'
df.loc[age.ge(18) & age.lt(56), 'NEW_AGE_CAT'] = 'mature'
df.loc[age.ge(56), 'NEW_AGE_CAT'] = 'senior'

In [17]:
# Sex x age: label each row by its SEX value combined with an age band
# (<= 21, 21 < age <= 50, > 50). Rows matching no combination are not assigned.
age = df['AGE']
is_young = age <= 21
is_mature = (age > 21) & (age <= 50)
is_senior = age > 50

for sex, in_band, label in [
    ('male', is_young, 'youngmale'),
    ('male', is_mature, 'maturemale'),
    ('male', is_senior, 'seniormale'),
    ('female', is_young, 'youngfemale'),
    ('female', is_mature, 'maturefemale'),
    ('female', is_senior, 'seniorfemale'),
]:
    df.loc[(df['SEX'] == sex) & in_band, 'NEW_SEX_CAT'] = label

In [18]:
# Preview the first rows, including the newly added NEW_* columns.
df.head()

Unnamed: 0,PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED,NEW_CABIN_BOOLE,NEW_NAME_WORD_COUNT,NEW_NAME_DR,NEW_TITLE,NEW_FAMILY_SIZE,NEW_AGE_PCLASS,NEW_IS_ALONE,NEW_AGE_CAT,NEW_SEX_CAT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,4,0,Mr,2,66.0,NO,mature,maturemale
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,7,0,Mrs,2,38.0,NO,mature,maturefemale
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,3,0,Miss,1,78.0,Yes,mature,maturefemale
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,7,0,Mrs,2,35.0,NO,mature,maturefemale
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,4,0,Mr,1,105.0,Yes,mature,maturemale


In [20]:
# Split the column names into categorical, numeric and categorical-but-cardinal lists
# (thresholds left at their defaults); the call also prints a summary of the split.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 891
Variables: 21
cat_cols: 14
num_cols: 4
cat_but_car: 3
num_but_cat: 8


In [21]:
# Display the columns that were classified as numeric.
num_cols

['PASSENGERID', 'AGE', 'FARE', 'NEW_AGE_PCLASS']

In [22]:
# PASSENGERID is an identifier, not a measurement, so remove it from num_cols.
# Equality is used so that only that exact column is dropped, rather than every
# column whose name happens to contain the text.
num_cols = [col for col in num_cols if col != 'PASSENGERID']