In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('titanic.csv')

In [3]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [5]:
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df)) * 100
print(missing_percent)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.000000
dtype: float64


In [6]:
for column in df.select_dtypes(include=['float64', 'int64']):
    if missing_percent[column] < 10:
        df[column] = df[column].fillna(df[column].mean())
        print(column," : ",missing_percent[column])

PassengerId  :  0.0
Survived  :  0.0
Pclass  :  0.0
SibSp  :  0.0
Parch  :  0.0
Fare  :  0.23923444976076555


In [7]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
print("Mode imputation for a categorical data :")
for column in df.select_dtypes(include=['object']):
    if missing_percent[column] > 0:
        mode_value = df[column].mode()[0]
        df[column] = df[column].fillna(mode_value)
        print(column)

Mode imputation for a categorical data :
Cabin


In [9]:
from sklearn.impute import KNNImputer

In [10]:
imputer = KNNImputer(n_neighbors=5)
df_numeric = df.select_dtypes(include=['float64', 'int64'])
imputed_data = imputer.fit_transform(df_numeric)
df[df_numeric.columns] = imputed_data
print(df_numeric.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [11]:
columns_to_drop = [column for column in df.columns if missing_percent[column] > 10]
df_dropped_columns = df.drop(columns=columns_to_drop)
print("\nData size after dropping columns with > 10% missing values:", df_dropped_columns.shape)


Data size after dropping columns with > 10% missing values: (418, 10)


In [12]:
from scipy.stats import zscore

In [13]:
z_scores = np.abs(zscore(df_numeric))
df_no_outliers = df[(z_scores >3).all(axis=1)]
print("\nData size after dropping rows with Z-score > 3:", df_no_outliers.shape)


Data size after dropping rows with Z-score > 3: (0, 12)


In [14]:
threshold = len(df.columns) // 2
duplicates = df.apply(lambda x: x.duplicated(keep=False)).sum(axis=1) > threshold
df_no_duplicates = df[~duplicates]
print("\nData size after dropping duplicate rows based on 50% similarity:", df_no_duplicates.shape)


Data size after dropping duplicate rows based on 50% similarity: (3, 12)


In [15]:
from sklearn.preprocessing import MinMaxScaler, Binarizer, OneHotEncoder


In [16]:
scaler = MinMaxScaler()
df_numeric_scaled = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)
print("\nData after Min-Max Normalization:\n", df_numeric_scaled.head())


Data after Min-Max Normalization:
    PassengerId  Survived  Pclass       Age  SibSp     Parch      Fare
0     0.000000       0.0     1.0  0.452723  0.000  0.000000  0.015282
1     0.002398       1.0     1.0  0.617566  0.125  0.000000  0.013663
2     0.004796       0.0     0.5  0.815377  0.000  0.000000  0.018909
3     0.007194       0.0     1.0  0.353818  0.000  0.000000  0.016908
4     0.009592       1.0     1.0  0.287881  0.125  0.111111  0.023984


In [24]:
'''from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['Name'] = encoder.fit_transform(df['Name'])
df['Sex'] = encoder.fit_transform(df['Sex'])
df['Ticket'] = encoder.fit_transform(df['Ticket'])
df['Cabin'] = encoder.fit_transform(df['Cabin'])
df['Embarked'] = encoder.fit_transform(df['Embarked'])

df_dropped = df.dropna()'''

In [25]:
df_scaled = pd.DataFrame(scaler.fit_transform(df_dropped), columns=df_dropped.columns)

In [26]:
binarizer = Binarizer(threshold=0.5)
binarized_data = pd.DataFrame(binarizer.fit_transform(df_scaled), columns=df_scaled.columns)
print("\nBinarized data:\n", binarized_data)


Binarized data:
      PassengerId  Survived  Pclass  Name  Sex  Age  SibSp  Parch  Ticket  \
0            0.0       0.0     1.0   0.0  1.0  0.0    0.0    0.0     0.0   
1            0.0       1.0     1.0   1.0  0.0  1.0    0.0    0.0     1.0   
2            0.0       0.0     0.0   1.0  1.0  1.0    0.0    0.0     0.0   
3            0.0       0.0     1.0   1.0  1.0  0.0    0.0    0.0     0.0   
4            0.0       1.0     1.0   0.0  0.0  0.0    0.0    0.0     0.0   
..           ...       ...     ...   ...  ...  ...    ...    ...     ...   
413          1.0       0.0     1.0   1.0  1.0  0.0    0.0    0.0     1.0   
414          1.0       1.0     0.0   1.0  0.0  1.0    0.0    0.0     1.0   
415          1.0       0.0     1.0   1.0  1.0  1.0    0.0    0.0     1.0   
416          1.0       0.0     1.0   1.0  1.0  0.0    0.0    0.0     1.0   
417          1.0       0.0     1.0   1.0  1.0  0.0    0.0    0.0     0.0   

     Fare  Cabin  Embarked  
0     0.0    0.0       0.0  
1     0.0  

In [23]:
encoder = OneHotEncoder(sparse=False)
df_categorical = df.select_dtypes(include=['object'])
encoded_data = encoder.fit_transform(df_categorical)
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(df_categorical.columns))

print(encoded_df)

     Name_Abbott, Master. Eugene Joseph  Name_Abelseth, Miss. Karen Marie  \
0                                   0.0                               0.0   
1                                   0.0                               0.0   
2                                   0.0                               0.0   
3                                   0.0                               0.0   
4                                   0.0                               0.0   
..                                  ...                               ...   
413                                 0.0                               0.0   
414                                 0.0                               0.0   
415                                 0.0                               0.0   
416                                 0.0                               0.0   
417                                 0.0                               0.0   

     Name_Abelseth, Mr. Olaus Jorgensen  \
0                               

