In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("Lab2_titanic.csv")
# Show shape and missing values and percentages
print('Shape:', df.shape)
print('Missing values per column:')
# print(df.isnull().sum())
for i in range(df.shape[1]):
    print(f"col{i} = {df.isnull().sum()[i]}\t Percentage = {df.isnull().sum()[i]/df.shape[0]*100}\n")



# Option 1: Drop rows with missing values
df_dropna = df.dropna() # 0 to drop row and 1 to drop the whole column
print('Shape after dropna:', df_dropna.shape)

# Option 2: Fill missing values (example: fill with mean for numeric columns)
df_fillna = df.fillna(df.mean(numeric_only=True))
print('Any missing after fillna:', df_fillna.isnull().sum().sum())

# Option 3: Fill then drop 
df_fillna_dropna = df_fillna.dropna()
print('Shape after filling and droping:', df_fillna_dropna.shape)

Shape: (891, 12)
Missing values per column:
col0 = 0	 Percentage = 0.0

col1 = 0	 Percentage = 0.0

col2 = 0	 Percentage = 0.0

col3 = 0	 Percentage = 0.0

col4 = 0	 Percentage = 0.0

col5 = 177	 Percentage = 19.865319865319865

col6 = 0	 Percentage = 0.0

col7 = 0	 Percentage = 0.0

col8 = 0	 Percentage = 0.0

col9 = 0	 Percentage = 0.0

col10 = 687	 Percentage = 77.10437710437711

col11 = 2	 Percentage = 0.22446689113355783

Shape after dropna: (183, 12)
Any missing after fillna: 689
Shape after filling and droping: (202, 12)


  print(f"col{i} = {df.isnull().sum()[i]}\t Percentage = {df.isnull().sum()[i]/df.shape[0]*100}\n")


In [2]:
# Label Encoding and One-Hot Encoding of categorical variables
df = df_fillna_dropna.copy()
# print('Total number of missing values:')
# print(df.isnull().sum().sum())

# label encoding for 'Sex' column
if 'Sex' in df.columns:
    unique_labels = df['Sex'].dropna().unique()
    print("unique labels = ", unique_labels)

    label_map = {label: idx for idx, label in enumerate(unique_labels)}
    print('Label map:', label_map)

    df['Sex_label'] = df['Sex'].map(label_map)
    print('Label encoded Sex:\n', df[['Sex', 'Sex_label']].head())

    # one-hot encoding
    for label in unique_labels:
        df[f'Sex_{label}'] = (df['Sex'] == label).astype(int)
    print('One-hot encoded columns\n:', df[['Sex','Sex_male', 'Sex_female']].head())

unique labels =  ['female' 'male']
Label map: {'female': 0, 'male': 1}
Label encoded Sex:
        Sex  Sex_label
1   female          0
3   female          0
6     male          1
10  female          0
11  female          0
One-hot encoded columns
:        Sex  Sex_male  Sex_female
1   female         0           1
3   female         0           1
6     male         1           0
10  female         0           1
11  female         0           1


In [3]:
# Feature Scaling: Min-Max Normalization and Z-score Standardization

# Select numeric columns for scaling
numeric_cols = ['Fare']

# Min-Max Normalization
df_minmax = df.copy()
for col in numeric_cols:
    min_val = df[col].min()
    max_val = df[col].max()
    df_minmax[col] = (df[col] - min_val) / (max_val - min_val)
print('Min-Max Normalized sample:')
print(df_minmax[numeric_cols].head())

# Z-score Standardization
df_zscore = df.copy()
for col in numeric_cols:
    mean = df[col].mean()
    std = df[col].std()
    df_zscore[col] = (df[col] - mean) / std
print('Z-score Standardized sample:')
print(df_zscore[numeric_cols].head())

# Similarity and Dissimilarity Measures

# Pearson Correlation (between Age and Fare)
def pearson_cor(x, y):
    x_mean = sum(x)/len(x)
    y_mean = sum(y)/len(y)
    num = sum((a-x_mean)*(b-y_mean) for a, b in zip(x, y))
    denom = (sum((a-x_mean)**2 for a in x) * sum((b-y_mean)**2 for b in y))**0.5
    return num/denom if denom != 0 else 0


def pearson_cor_v2(df, col1, col2):
    df = df[[col1, col2]].dropna()

    n = df.shape[0]
    mean1 = df[col1].mean()
    sd1 = df[col1].std()
    mean2 = df[col2].mean()
    sd2 = df[col2].std()

    











x = df['Age'].tolist()
y = df['Fare'].tolist()
corr = pearson_cor(x, y)
print('Pearson correlation (Age, Fare):', corr)

# Cosine Similarity (between first two rows of numeric data)
def manual_cosine(u, v):
    dot = sum(a*b for a, b in zip(u, v))
    norm_u = sum(a**2 for a in u)**0.5
    norm_v = sum(b**2 for b in v)**0.5
    return dot/(norm_u*norm_v) if norm_u and norm_v else 0

if len(df) > 1:
    v1 = df.iloc[0][numeric_cols].fillna(0).tolist()
    v2 = df.iloc[1][numeric_cols].fillna(0).tolist()
    cos_sim = manual_cosine(v1, v2)
    print('Cosine similarity (row 0, row 1):', cos_sim)


# Euclidean Distance (between first two rows of numeric data)
def manual_euclidean(u, v):
    return sum((a-b)**2 for a, b in zip(u, v))**0.5

if len(df) > 1:
    euc_dist = manual_euclidean(v1, v2)
    print('Euclidean distance (row 0, row 1):', euc_dist)

Min-Max Normalized sample:
        Fare
1   0.139136
3   0.103644
6   0.101229
10  0.032596
11  0.051822
Z-score Standardized sample:
        Fare
1  -0.064473
3  -0.307696
6  -0.324249
10 -0.794587
11 -0.662832
Pearson correlation (Age, Fare): -0.07667999914474721
Cosine similarity (row 0, row 1): 1.0
Euclidean distance (row 0, row 1): 18.183299999999996


  v1 = df.iloc[0][numeric_cols].fillna(0).tolist()
  v2 = df.iloc[1][numeric_cols].fillna(0).tolist()
