In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [2]:
def load(path="titanic.csv"):
    """Read a CSV file into a new DataFrame.

    Parameters
    ----------
    path : str or path-like, default "titanic.csv"
        Location of the CSV file. The default keeps existing ``load()``
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Every call re-reads from disk, so the result is
        unaffected by in-place edits made to a previously loaded frame.
    """
    return pd.read_csv(path)

In [3]:
# Read the raw CSV into the working frame.
df=load()

In [4]:
# Preview the first rows to see the available columns.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Return the lower and upper outlier fences of one column.

    The fences sit 1.5 inter-quantile ranges below the ``q1`` quantile and
    above the ``q3`` quantile of ``dataframe[col_name]``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the column to analyse.
    q1, q3 : float, default 0.25 and 0.75
        Quantile levels used as the lower and upper reference points.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_quantile = column.quantile(q1)
    upper_quantile = column.quantile(q3)
    # Same margin on both sides: 1.5 times the spread between the quantiles.
    margin = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - margin, upper_quantile + margin

def chech_outlier(dataframe,col_name):
    """Report whether ``col_name`` contains any value outside its outlier fences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the column to check.

    Returns
    -------
    bool
        True if at least one value is above the upper fence or below the
        lower fence returned by ``outlier_thresholds``; False otherwise.
    """
    low_limit,up_limit=outlier_thresholds(dataframe,col_name)
    column=dataframe[col_name]
    # Test the boolean mask itself. Reducing the *selected rows* with .any()
    # asks whether their cells are truthy, which is False when the outlier
    # rows happen to hold only zeros/NaN even though outliers exist.
    return bool(((column>up_limit) | (column<low_limit)).any())

def grab_col_names(dataframe,cat_th=10,car_th=20):
    """Group the frame's columns into categorical, numeric and cardinal sets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical.
    car_th : int, default 20
        An object column with more than ``car_th`` distinct values is
        treated as cardinal and excluded from the categorical list.

    Returns
    -------
    tuple of list
        ``(cat_cols, num_cols, cat_but_car)``. The numeric-looking
        categorical columns are folded into ``cat_cols`` and are not
        returned separately. A short summary is printed as a side effect.
    """
    columns = list(dataframe.columns)
    is_object = {name: dataframe[name].dtypes == "O" for name in columns}
    distinct = {name: dataframe[name].nunique() for name in columns}

    # Numeric dtype but few distinct values: behaves like a category.
    num_but_cat = [name for name in columns
                   if distinct[name] < cat_th and not is_object[name]]
    # Object dtype but too many distinct values: cardinal, not categorical.
    cat_but_car = [name for name in columns
                   if distinct[name] > car_th and is_object[name]]

    # Object columns first, then the numeric-looking categoricals, with the
    # cardinal columns filtered out.
    candidates = [name for name in columns if is_object[name]] + num_but_cat
    cat_cols = [name for name in candidates if name not in cat_but_car]

    # Whatever is non-object and not already counted as categorical.
    num_cols = [name for name in columns
                if not is_object[name] and name not in num_but_cat]

    print(f"Observations(Gözlem):{dataframe.shape[0]}")
    print(f"Variables(Öznitelik): {dataframe.shape[1]}")
    print(f"cat_cols:{len(cat_cols)}")
    print(f"num_cols:{len(num_cols)} ")
    print(f"cat_but_car:{len(cat_but_car)} ")
    print(f"num_but_cat:{len(num_but_cat)} ")

    return cat_cols,num_cols,cat_but_car

def grab_outliers(dataframe,col_name,index=False):
    """Print the rows whose ``col_name`` value lies outside the outlier fences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    col_name : str
        Column whose values are compared with the fences from
        ``outlier_thresholds``.
    index : bool, default False
        When truthy, the index of the outlier rows is returned.

    Returns
    -------
    pandas.Index or None
        Index of the outlier rows if ``index`` is truthy, otherwise None.
    """
    low, up = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    outlier_rows = dataframe[(values < low) | (values > up)]

    # With more than ten hits only a head() preview is printed.
    if outlier_rows.shape[0] > 10:
        print(outlier_rows.head())
    else:
        print(outlier_rows)

    if index:
        return outlier_rows.index

In [6]:
# Outlier fences for Fare; the filtering and capping cells reuse these names.
low,up=outlier_thresholds(df,"Fare")

In [7]:
# Dimensions of the full frame, for comparison with the filtered one.
df.shape

(891, 12)

In [8]:
df[~((df["Fare"]<low)| (df["Fare"]>up))].shape   # shape with the Fare outliers filtered out; df itself is not modified

(775, 12)

## Silme yöntemi

In [9]:
def remove_outlier(dataframe,col_name):
    """Return the rows whose ``col_name`` value is not an outlier.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Column compared with the fences from ``outlier_thresholds``.

    Returns
    -------
    pandas.DataFrame
        The rows that are neither below the lower fence nor above the upper
        fence.
    """
    lower, upper = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    is_outlier = (values < lower) | (values > upper)
    # Negate the outlier mask (rather than testing "inside the fences") so
    # rows where both comparisons are False are kept.
    return dataframe[~is_outlier]

In [10]:
# Classify the columns (a summary is printed by the function).
cat_cols,num_cols,cat_but_car=grab_col_names(df)

Observations(Gözlem):891
Variables(Öznitelik): 12
cat_cols:6
num_cols:3 
cat_but_car:3 
num_but_cat:4 


In [11]:
# Inspect which columns were classified as numeric.
num_cols

['PassengerId', 'Age', 'Fare']

In [12]:
# Drop the identifier column by exact name. The former `col not in "PassengerId"`
# was a substring test on the string, so a column named e.g. "Id" would have
# been dropped as well.
num_cols=[col for col in num_cols if col != "PassengerId"]

In [13]:
# Numeric columns left after excluding PassengerId.
num_cols

['Age', 'Fare']

In [14]:
# Filter cumulatively: each pass starts from the frame left by the previous
# column. Restarting from df on every iteration would keep only the last
# column's filtering and silently discard the earlier ones.
new_df=df
for col in num_cols:
    new_df=remove_outlier(new_df,col)

df.shape[0]-new_df.shape[0]  # rows removed across all numeric columns (116 was the Fare-only count)

## Baskılama yöntemi
Silme yöntemi veri kaybetmemize neden olur. Baskılamada aykırı değerler eşik değerlerle değiştirilir.

In [18]:
# Non-outlier Fare values via two-step indexing: filter the rows, then pick the column.
df[~((df["Fare"]<low)| (df["Fare"]>up))]["Fare"]

0       7.2500
2       7.9250
3      53.1000
4       8.0500
5       8.4583
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 775, dtype: float64

In [19]:
# Same selection with a single .loc call (row mask and column label together).
df.loc[~((df["Fare"]<low)| (df["Fare"]>up)),"Fare"]

0       7.2500
2       7.9250
3      53.1000
4       8.0500
5       8.4583
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 775, dtype: float64

In [23]:
df.loc[(df["Fare"]>up),"Fare"] = up  # cap fares above the upper fence at the fence value
df.loc[(df["Fare"]<low),"Fare"] = low  # raise fares below the lower fence to the fence value

In [24]:
# Verify the capping: no Fare value should remain above the upper fence.
df.loc[(df["Fare"]>up),"Fare"] 

Series([], Name: Fare, dtype: float64)

In [25]:
def replace_with_threshlods(dataframe,veriable):
    """Cap the outliers of one column at its outlier fences, in place.

    Values above the upper fence are set to the upper fence and values below
    the lower fence are set to the lower fence; everything in between is left
    untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify. The change is made in place; nothing is returned.
    veriable : str
        Name of the column to cap. Fences come from ``outlier_thresholds``.
    """
    low_limit,up_limit=outlier_thresholds(dataframe,veriable)
    dataframe.loc[(dataframe[veriable]>up_limit),veriable]=up_limit
    # Must be "<": with ">" every value above the lower fence was overwritten
    # by it, collapsing nearly the whole column to a single number.
    dataframe.loc[(dataframe[veriable]<low_limit),veriable]=low_limit

In [26]:
# Reload the raw data: the manual capping cells modified df in place.
df=load()

In [27]:
# Dimensions of the freshly loaded frame.
df.shape

(891, 12)

In [29]:
cat_cols,num_cols,cat_but_car=grab_col_names(df)
# Exact-name comparison; `col not in "PassengerId"` was a substring test on
# the string and would also drop columns such as "Id".
num_cols=[col for col in num_cols if col != "PassengerId"]

Observations(Gözlem):891
Variables(Öznitelik): 12
cat_cols:6
num_cols:3 
cat_but_car:3 
num_but_cat:4 


In [31]:
# Check each numeric column for outliers before capping.
for col in num_cols:
    print(col,chech_outlier(df,col))

Age True
Fare True


In [32]:
# Apply the capping helper to every numeric column; df is modified in place.
for col in num_cols:
    replace_with_threshlods(df,col)

In [33]:
# Re-run the outlier check on the capped columns.
for col in num_cols:
    print(col,chech_outlier(df,col))

Age False
Fare False


1-) Aykırı değer saptama **  
2-) Aykırı değer var mı kontrolü  
3-) Aykırı değerleri al

1-) Aykırı değerleri sil  
2-) Aykırı değerleri baskıla **