In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,StandardScaler,RobustScaler

In [2]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and cardinal groups.

    A column is "text-like" when its dtype is object, a string dtype or
    ``category``; every other dtype is treated as "numeric-like".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A numeric-like column with fewer than ``cat_th`` distinct values is
        counted as categorical.
    car_th : int, default 20
        A text-like column with more than ``car_th`` distinct values is
        counted as cardinal (categorical, but with too many levels).

    Returns
    -------
    cat_cols : list
        Text-like columns plus numeric-like columns below ``cat_th``,
        with the cardinal columns removed.
    num_cols : list
        Numeric-like columns that were not re-classified as categorical.
    cat_but_car : list
        Text-like columns with more than ``car_th`` distinct values.

    Notes
    -----
    A short summary (row/column counts and group sizes) is printed.
    The numeric-but-categorical columns are already part of ``cat_cols``;
    only their count is reported.
    """
    def _is_text_like(col):
        # `dtype == "O"` alone misses `category` and dedicated string dtypes,
        # which would then be handled as numeric columns.
        dtype = dataframe[col].dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    columns = list(dataframe.columns)
    # Count distinct values once per column instead of once per comprehension.
    n_unique = {col: dataframe[col].nunique() for col in columns}

    text_like = [col for col in columns if _is_text_like(col)]
    text_like_set = set(text_like)
    numeric_like = [col for col in columns if col not in text_like_set]

    # Numeric-looking columns with few distinct values behave like categories.
    num_but_cat = [col for col in numeric_like if n_unique[col] < cat_th]
    # Text-like columns with many distinct values are cardinal.
    cat_but_car = [col for col in text_like if n_unique[col] > car_th]

    cat_cols = [col for col in text_like + num_but_cat if col not in cat_but_car]
    num_cols = [col for col in numeric_like if col not in num_but_cat]

    print(f"Observations(Gözlem):{dataframe.shape[0]}")
    print(f"Variables(Öznitelik): {dataframe.shape[1]}")
    print(f"cat_cols:{len(cat_cols)}")
    print(f"num_cols:{len(num_cols)} ")
    print(f"cat_but_car:{len(cat_but_car)} ")
    print(f"num_but_cat:{len(num_but_cat)} ")

    return cat_cols, num_cols, cat_but_car


In [3]:
def load(path="titanic.csv"):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default "titanic.csv"
        Location of the CSV file. The default keeps the original behaviour
        of reading ``titanic.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(path)

In [4]:
# Read the Titanic CSV into the working frame.
df = load()

In [5]:
# Classify the columns; the call also prints a short summary of the groups.
cat_cols, num_cols, cat_but_car = grab_col_names(dataframe=df)

Observations(Gözlem):891
Variables(Öznitelik): 12
cat_cols:6
num_cols:3 
cat_but_car:3 
num_but_cat:4 


In [6]:
# Show which columns were classified as numerical.
num_cols

['PassengerId', 'Age', 'Fare']

In [8]:
# Exclude PassengerId from the numerical features.
# Compare with `!=`: `col not in "PassengerId"` is a substring test and would
# also drop any column whose name happens to be a substring (e.g. "Id").
num_cols = [col for col in num_cols if col != "PassengerId"]

In [17]:
# One-hot encode the selected columns; drop_first removes one level per
# encoded variable so the dummy columns are not redundant.
# NOTE(review): the displayed result contains `Survived`, so the target is part
# of the frame used for imputation further down - confirm this is intended.
dff = pd.get_dummies(data=df.loc[:, cat_cols + num_cols], drop_first=True)

In [18]:
dff.head() # the categorical columns have been converted to numeric (dummy) form

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,1,0,22.0,7.25,True,False,True
1,1,1,1,0,38.0,71.2833,False,False,False
2,1,3,0,0,26.0,7.925,False,False,True
3,1,1,1,0,35.0,53.1,False,False,True
4,0,3,0,0,35.0,8.05,True,False,True


In [19]:
# Rescale every column onto the 0-1 range.
scaler = MinMaxScaler()
# Keep the row index: a bare ndarray would be re-wrapped with a fresh
# RangeIndex, which can silently misalign later assignments back into `df`.
dff = pd.DataFrame(scaler.fit_transform(dff), columns=dff.columns, index=dff.index)
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.125,0.0,0.271174,0.014151,1.0,0.0,1.0
1,1.0,0.0,0.125,0.0,0.472229,0.139136,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.321438,0.015469,0.0,0.0,1.0
3,1.0,0.0,0.125,0.0,0.434531,0.103644,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.434531,0.015713,1.0,0.0,1.0


In [20]:
from sklearn.impute import KNNImputer

In [21]:
# Number of missing values per column.
dff.isna().sum()

Survived        0
Pclass          0
SibSp           0
Parch           0
Age           177
Fare            0
Sex_male        0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [22]:
# For each missing value, look at the 5 nearest rows and fill in the mean of
# their values for that column.
imputer = KNNImputer(n_neighbors=5)
# Keep the row index so the imputed values stay aligned with `df`.
dff = pd.DataFrame(imputer.fit_transform(dff), columns=dff.columns, index=dff.index)
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.125,0.0,0.271174,0.014151,1.0,0.0,1.0
1,1.0,0.0,0.125,0.0,0.472229,0.139136,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.321438,0.015469,0.0,0.0,1.0
3,1.0,0.0,0.125,0.0,0.434531,0.103644,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.434531,0.015713,1.0,0.0,1.0


In [24]:
# Undo the min-max scaling so every column (including the imputed values) is
# back in its original units; the row index is kept for alignment with `df`.
dff = pd.DataFrame(scaler.inverse_transform(dff), columns=dff.columns, index=dff.index)

In [25]:
# Preview the frame after the scaling has been inverted.
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,3.0,1.0,0.0,22.0,7.25,1.0,0.0,1.0
1,1.0,1.0,1.0,0.0,38.0,71.2833,0.0,0.0,0.0
2,1.0,3.0,0.0,0.0,26.0,7.925,0.0,0.0,1.0
3,1.0,1.0,1.0,0.0,35.0,53.1,0.0,0.0,1.0
4,0.0,3.0,0.0,0.0,35.0,8.05,1.0,0.0,1.0


In [26]:
# Attach the KNN-imputed ages to the original frame (aligned on the row index).
df["age_imputed_knn"] = dff["Age"]

In [27]:
# Rows whose original Age is missing, shown next to the imputed value.
df.loc[df["Age"].isna(), ["Age", "age_imputed_knn"]]

Unnamed: 0,Age,age_imputed_knn
5,,47.8
17,,37.6
19,,12.2
26,,32.8
28,,17.6
...,...,...
859,,25.8
863,,8.8
868,,25.0
878,,24.4


In [28]:
# Full records of the passengers whose original Age is missing.
df[df["Age"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_imputed_knn
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,47.8
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,37.6
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,12.2
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,32.8
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,17.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,25.8
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,8.8
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,25.0
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,24.4


Ne yaptık?
1-) Eksik değer tablosu fonksiyonu
2-) Nümerikleri median ile doldurma
3-) Kategorikleri mode ile doldurma
4-) Kategorikleri değişken kırılımına göre doldurma
5-) Tahmine dayalı doldurma
