In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msnofrom
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,StandardScaler,RobustScaler

In [2]:
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
pd.set_option("display.float_format",lambda x: '%.3f' % x)
pd.set_option("display.width", 500)

In [21]:
def load():
    data=pd.read_csv("titanic.csv")
    return data
def load_application_train():
    data=pd.read_csv("application_train.csv")
    return data
def grab_col_names(dataframe,cat_th=10,car_th=20):
    cat_cols=[col for col in dataframe.columns if dataframe[col].dtypes=="O"] #tipi obcejt olanları al
    num_but_cat=[col for col in dataframe.columns if dataframe[col].nunique()<cat_th and
             dataframe[col].dtypes!="O"] #numerik görünen kategörikleri aldık
    cat_but_car=[col for col in dataframe.columns if dataframe[col].nunique()>car_th and
                dataframe[col].dtypes=="O"] #kategorik görünen kardinalleri aldık.
    
    cat_cols=cat_cols+num_but_cat #kategörikler güncellendi
    cat_cols=[col for col in cat_cols if col not in cat_but_car] #cat_cols içinde kardinalleri ayırdık
    
    num_cols=[col for col in dataframe.columns if dataframe[col].dtypes!="O"] #tipi objectden farkı olanları getir
    num_cols=[col for col in num_cols if col not in num_but_cat]
    
    print(f"Observations(Gözlem):{dataframe.shape[0]}")
    print(f"Variables(Öznitelik): {dataframe.shape[1]}")
    print(f"cat_cols:{len(cat_cols)}")
    print(f"num_cols:{len(num_cols)} ")
    print(f"cat_but_car:{len(cat_but_car)} ")
    print(f"num_but_cat:{len(num_but_cat)} ")
    
    return cat_cols,num_cols,cat_but_car
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def chech_outlier(dataframe,col_name):
    low_limit,up_limit=outlier_thresholds(dataframe,col_name)
    if dataframe[(dataframe[col_name]>up_limit) | (dataframe[col_name]<low_limit)].any(axis=None):
        return True
    else:
        return False
    
def grab_outliers(dataframe,col_name,index=False):
    low,up=outlier_thresholds(dataframe,col_name)

    if dataframe[((dataframe[col_name]<low) | (dataframe[col_name]>up))].shape[0]>10:
        print(dataframe[((dataframe[col_name]<low)) | (dataframe[col_name]>up)].head())
    else:
        print(dataframe[((dataframe[col_name]<low)) | (dataframe[col_name]>up)])

    if index:
        outlier_index=dataframe[((dataframe[col_name]<low)| (dataframe[col_name]>up))].index
        return outlier_index

In [5]:
df=load()
df.shape

(891, 12)

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
##DEgisken isimlerini büyültelim
df.columns=[col.upper() for col in df.columns]

## 1-) Feature Engineering

In [11]:
# Cabin bool
df["NEW_CABIN_BOOL"] = df["CABIN"].notnull().astype('int')
# Name count
df["NEW_NAME_COUNT"] = df["NAME"].str.len()
# name word count
df["NEW_NAME_WORD_COUNT"] = df["NAME"].apply(lambda x: len(str(x).split(" ")))
# name dr
df["NEW_NAME_DR"] = df["NAME"].apply(lambda x: len([x for x in x.split() if x.startswith("Dr")]))
# name title
df['NEW_TITLE'] = df.NAME.str.extract(' ([A-Za-z]+)\.', expand=False)
# family size
df["NEW_FAMILY_SIZE"] = df["SIBSP"] + df["PARCH"] + 1
# age_pclass
df["NEW_AGE_PCLASS"] = df["AGE"] * df["PCLASS"]
# is alone
df.loc[((df['SIBSP'] + df['PARCH']) > 0), "NEW_IS_ALONE"] = "NO"
df.loc[((df['SIBSP'] + df['PARCH']) == 0), "NEW_IS_ALONE"] = "YES"
# age level
df.loc[(df['AGE'] < 18), 'NEW_AGE_CAT'] = 'young'
df.loc[(df['AGE'] >= 18) & (df['AGE'] < 56), 'NEW_AGE_CAT'] = 'mature'
df.loc[(df['AGE'] >= 56), 'NEW_AGE_CAT'] = 'senior'
# sex x age
df.loc[(df['SEX'] == 'MALE') & (df['AGE'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'
df.loc[(df['SEX'] == 'MALE') & (df['AGE'] > 21) & (df['AGE'] < 50), 'NEW_SEX_CAT'] = 'maturemale'
df.loc[(df['SEX'] == 'MALE') & (df['AGE'] >= 50), 'NEW_SEX_CAT'] = 'seniormale'
df.loc[(df['SEX'] == 'FEMALE') & (df['AGE'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'
df.loc[(df['SEX'] == 'FEMALE') & (df['AGE'] > 21) & (df['AGE'] < 50), 'NEW_SEX_CAT'] = 'maturefemale'
df.loc[(df['SEX'] == 'FEMALE') & (df['AGE'] >= 50), 'NEW_SEX_CAT'] = 'seniorfemale'

In [12]:
df.head()

Unnamed: 0,PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED,NEW_CABIN_BOOL,NEW_NAME_COUNT,NEW_NAME_WORD_COUNT,NEW_NAME_DR,NEW_TITLE,NEW_FAMILY_SIZE,NEW_AGE_PCLASS,NEW_IS_ALONE,NEW_AGE_CAT,NEW_SEX_CAT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,23,4,0,Mr,2,66.0,NO,mature,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,1,51,7,0,Mrs,2,38.0,NO,mature,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,22,3,0,Miss,1,78.0,YES,mature,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,44,7,0,Mrs,2,35.0,NO,mature,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,24,4,0,Mr,1,105.0,YES,mature,


In [13]:
df.shape

(891, 22)

In [14]:
cat_cols,num_cols,cat_but_car=grab_col_names(df)

Observations(Gözlem):891
Variables(Öznitelik): 22
cat_cols:14
num_cols:5 
cat_but_car:3 
num_but_cat:8 


In [15]:
cat_cols

['SEX',
 'EMBARKED',
 'NEW_TITLE',
 'NEW_IS_ALONE',
 'NEW_AGE_CAT',
 'NEW_SEX_CAT',
 'SURVIVED',
 'PCLASS',
 'SIBSP',
 'PARCH',
 'NEW_CABIN_BOOL',
 'NEW_NAME_WORD_COUNT',
 'NEW_NAME_DR',
 'NEW_FAMILY_SIZE']

In [16]:
num_cols

['PASSENGERID', 'AGE', 'FARE', 'NEW_NAME_COUNT', 'NEW_AGE_PCLASS']

In [17]:
num_cols=[col for col in num_cols if "PASSENGERID" not in col]

In [18]:
num_cols

['AGE', 'FARE', 'NEW_NAME_COUNT', 'NEW_AGE_PCLASS']

## 2. Outliers (Aykırı Değerler)


In [19]:
for col in num_cols:
    print(col,check_outlier(df))

NameError: name 'check_outlier' is not defined