In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
import numpy as np

In [2]:
df = pd.read_csv("../data/Titanic-Dataset.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


برای پیدا کردن مقادیر گمشده در سن، اول باید ستون های جنسیت به ستون عددی تبدیل شوند

In [4]:
# One Hot Encoding For Sex

one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
new_df = pd.DataFrame(one_hot_encoder.fit_transform(df[['Sex']]), columns=one_hot_encoder.get_feature_names_out(['Sex']))

new_df

Unnamed: 0,Sex_female,Sex_male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
...,...,...
886,0.0,1.0
887,1.0,0.0
888,1.0,0.0
889,0.0,1.0


حذف ستون های اضافه + اضافه کردن ستون های جدید

In [5]:
df.drop(columns=['PassengerId', 'Name', 'Cabin', "Ticket", "Sex"], inplace=True)

df = pd.concat([df, new_df], axis=1)

df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
0,0,3,22.0,1,0,7.25,S,0.0,1.0
1,1,1,38.0,1,0,71.2833,C,1.0,0.0
2,1,3,26.0,0,0,7.925,S,1.0,0.0
3,1,1,35.0,1,0,53.1,S,1.0,0.0
4,0,3,35.0,0,0,8.05,S,0.0,1.0


جایگزین کردن مقادیر گمشده در ستون Embark با گزینه ای که بیشترین تکرار رو داشته 




In [6]:
df[df['Embarked'].isna() == True]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
61,1,1,38.0,0,0,80.0,,1.0,0.0
829,1,1,62.0,0,0,80.0,,1.0,0.0


In [7]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [8]:
df['Embarked'].fillna('S', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S', inplace=True)


In [9]:
df.Embarked.isna().sum()

np.int64(0)

لیبل کردن مقادیر ستون Embarked

In [10]:
label_encoder = LabelEncoder()

df['Embarked'] = label_encoder.fit_transform(df['Embarked'])

In [11]:
df.Embarked.unique()

array([2, 0, 1])

جایگزین مقادیر گمشده در سن با استفاده از KNNimputer

In [12]:
imputer = KNNImputer(n_neighbors=3)
df[['Age', 'Pclass', 'Survived', 'Sex_female', 'Sex_male']] = imputer.fit_transform(df[['Age', 'Pclass', 'Survived', 'Sex_female', 'Sex_male']])

In [13]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
0,0.0,3.0,22.0,1,0,7.25,2,0.0,1.0
1,1.0,1.0,38.0,1,0,71.2833,0,1.0,0.0
2,1.0,3.0,26.0,0,0,7.925,2,1.0,0.0
3,1.0,1.0,35.0,1,0,53.1,2,1.0,0.0
4,0.0,3.0,35.0,0,0,8.05,2,0.0,1.0


In [14]:
df.isna().sum()

Survived      0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
Sex_female    0
Sex_male      0
dtype: int64

In [15]:
df[['Age', 'Sex_female', 'Sex_male']] = df[['Age', 'Sex_female', 'Sex_male']].astype(int)

In [16]:
df.rename(columns={"Sex_female": "Female", "Sex_male": "Male"}, inplace=True)

In [17]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Female,Male
0,0.0,3.0,22,1,0,7.25,2,0,1
1,1.0,1.0,38,1,0,71.2833,0,1,0
2,1.0,3.0,26,0,0,7.925,2,1,0
3,1.0,1.0,35,1,0,53.1,2,1,0
4,0.0,3.0,35,0,0,8.05,2,0,1


In [18]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

scaler = StandardScaler()
scaled = scaler.fit_transform(X)

In [19]:
external = pd.DataFrame(scaled, columns=df.columns.tolist()[1:])

In [20]:
external['target'] = y

In [21]:
external

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Female,Male,target
0,0.827377,-0.512884,0.432793,-0.473674,-0.502445,0.585954,-0.737695,0.737695,0.0
1,-1.566107,0.653066,0.432793,-0.473674,0.786845,-1.942303,1.355574,-1.355574,1.0
2,0.827377,-0.221396,-0.474545,-0.473674,-0.488854,0.585954,1.355574,-1.355574,1.0
3,-1.566107,0.434451,0.432793,-0.473674,0.420730,0.585954,1.355574,-1.355574,1.0
4,0.827377,0.434451,-0.474545,-0.473674,-0.486337,0.585954,-0.737695,0.737695,0.0
...,...,...,...,...,...,...,...,...,...
886,-0.369365,-0.148525,-0.474545,-0.473674,-0.386671,0.585954,-0.737695,0.737695,0.0
887,-1.566107,-0.731500,-0.474545,-0.473674,-0.044381,0.585954,1.355574,-1.355574,1.0
888,0.827377,-0.512884,0.432793,2.008933,-0.176263,0.585954,1.355574,-1.355574,0.0
889,-1.566107,-0.221396,-0.474545,-0.473674,-0.044381,-1.942303,-0.737695,0.737695,1.0


In [22]:
external.to_csv("../data/external.csv", index=False)