In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from scipy import stats


In [4]:
# Load the Titanic dataset
file_path = 'titanic.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Original DataFrame:")
print(df.head())


Original DataFrame:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500

In [14]:
# 1. Data Cleaning: Removing duplicates and handling outliers
df_cleaned = df.drop_duplicates().copy()  # Removing duplicate rows and making a copy
df_cleaned

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [18]:
# Removing outliers using z-score method for numeric columns only
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns

# Compute z-scores and handle NaN values
z_scores = np.abs(stats.zscore(df_cleaned[numeric_cols], nan_policy='omit'))
df_cleaned = df_cleaned[(z_scores < 3).all(axis=1)].copy()  # Ensure we're working with a copy

# Correcting data types if needed (example: 'Fare' should be float)
# Using .loc[] to avoid SettingWithCopyWarning
df_cleaned.loc[:, 'Fare'] = df_cleaned['Fare'].astype(float)

df_cleaned

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [23]:
# 2. Transformation: Log and square root transformation
# Avoid transforming NaN values to prevent transformation errors
df_cleaned['Fare_log'] = np.log1p(df_cleaned['Fare'].fillna(0))  # Log transform (log(1 + x) to handle zero values)
df_cleaned['Fare_sqrt'] = np.sqrt(df_cleaned['Fare'].fillna(0))  # Square root transform

df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_log,Fare_sqrt,Fare_normalized,Age_normalized,Fare_standardized,Age_standardized
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2.110213,2.692582,0.07754,0.305752,-0.686696,-0.686564
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4.280593,8.442944,0.762388,0.532445,2.277902,0.50679
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2.188856,2.815138,0.084759,0.362426,-0.655445,-0.388225
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3.990834,7.286975,0.567914,0.48994,1.436056,0.283036
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2.202765,2.837252,0.086096,0.48994,-0.649658,0.283036


In [24]:
# 3. Normalization: Scaling data between 0 and 1
# Handle missing values before applying MinMaxScaler
scaler = MinMaxScaler()
# Only normalize if there are non-missing values
if df_cleaned[['Fare', 'Age']].dropna().shape[0] > 0:
    df_cleaned[['Fare_normalized', 'Age_normalized']] = scaler.fit_transform(df_cleaned[['Fare', 'Age']].fillna(0))

df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_log,Fare_sqrt,Fare_normalized,Age_normalized,Fare_standardized,Age_standardized
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2.110213,2.692582,0.07754,0.305752,-0.686696,-0.686564
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4.280593,8.442944,0.762388,0.532445,2.277902,0.50679
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2.188856,2.815138,0.084759,0.362426,-0.655445,-0.388225
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3.990834,7.286975,0.567914,0.48994,1.436056,0.283036
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2.202765,2.837252,0.086096,0.48994,-0.649658,0.283036


In [25]:
# 4. Standardization (Scaling): Standardizing features to have zero mean and unit variance
standardizer = StandardScaler()
# Only standardize if there are non-missing values
if df_cleaned[['Fare', 'Age']].dropna().shape[0] > 0:
    df_cleaned[['Fare_standardized', 'Age_standardized']] = standardizer.fit_transform(df_cleaned[['Fare', 'Age']].fillna(0))


In [27]:
# 5. Encoding: Converting categorical data into numerical values
# One-Hot Encoding for 'Embarked' column
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Only encode if there are non-missing values
if df_cleaned[['Embarked']].dropna().shape[0] > 0:
    embarked_encoded = onehot_encoder.fit_transform(df_cleaned[['Embarked']].fillna('missing'))
    embarked_encoded_df = pd.DataFrame(embarked_encoded, columns=onehot_encoder.get_feature_names_out(['Embarked']))
    df_encoded = pd.concat([df_cleaned, embarked_encoded_df], axis=1)
else:
    df_encoded = df_cleaned.copy()

# Label Encoding for 'Sex' column
label_encoder = LabelEncoder()
# Only encode if there are non-missing values
if df_encoded[['Sex']].dropna().shape[0] > 0:
    df_encoded['Sex_encoded'] = label_encoder.fit_transform(df_encoded['Sex'].fillna('missing'))

df_encoded.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Fare_sqrt,Fare_normalized,Age_normalized,Fare_standardized,Age_standardized,Embarked_C,Embarked_Q,Embarked_S,Embarked_missing,Sex_encoded
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,...,2.692582,0.07754,0.305752,-0.686696,-0.686564,0.0,0.0,1.0,0.0,1
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,...,8.442944,0.762388,0.532445,2.277902,0.50679,1.0,0.0,0.0,0.0,0
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,...,2.815138,0.084759,0.362426,-0.655445,-0.388225,0.0,0.0,1.0,0.0,0
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,...,7.286975,0.567914,0.48994,1.436056,0.283036,0.0,0.0,1.0,0.0,0
4,5.0,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,...,2.837252,0.086096,0.48994,-0.649658,0.283036,0.0,0.0,1.0,0.0,1


In [29]:
# 6. Imputation: Filling missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = df_encoded.copy()
# Impute only if columns exist
if 'Age' in df_imputed.columns and 'Fare' in df_imputed.columns:
    df_imputed[['Age', 'Fare']] = imputer.fit_transform(df_imputed[['Age', 'Fare']])

df_imputed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Fare_sqrt,Fare_normalized,Age_normalized,Fare_standardized,Age_standardized,Embarked_C,Embarked_Q,Embarked_S,Embarked_missing,Sex_encoded
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,...,2.692582,0.07754,0.305752,-0.686696,-0.686564,0.0,0.0,1.0,0.0,1
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,...,8.442944,0.762388,0.532445,2.277902,0.50679,1.0,0.0,0.0,0.0,0
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,...,2.815138,0.084759,0.362426,-0.655445,-0.388225,0.0,0.0,1.0,0.0,0
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,...,7.286975,0.567914,0.48994,1.436056,0.283036,0.0,0.0,1.0,0.0,0
4,5.0,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,...,2.837252,0.086096,0.48994,-0.649658,0.283036,0.0,0.0,1.0,0.0,1


In [31]:
# 7. Handling Missing Data: Dropping rows or columns with missing data
df_dropped = df_encoded.dropna()  # Drop rows with any missing values
df_dropped.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Fare_sqrt,Fare_normalized,Age_normalized,Fare_standardized,Age_standardized,Embarked_C,Embarked_Q,Embarked_S,Embarked_missing,Sex_encoded
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,...,8.442944,0.762388,0.532445,2.277902,0.50679,1.0,0.0,0.0,0.0,0
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,...,7.286975,0.567914,0.48994,1.436056,0.283036,0.0,0.0,1.0,0.0,0
6,7.0,0.0,1.0,"McCarthy, Mr. Timothy J",male,54.0,0.0,0.0,17463,51.8625,...,7.201562,0.554679,0.759139,1.378763,1.700143,1.0,0.0,0.0,0.0,1
10,11.0,1.0,3.0,"Sandstrom, Miss. Marguerite Rut",female,4.0,1.0,1.0,PP 9549,16.7,...,4.086563,0.17861,0.050723,-0.249182,-2.029086,0.0,0.0,1.0,0.0,0
11,12.0,1.0,1.0,"Bonnell, Miss. Elizabeth",female,58.0,0.0,0.0,113783,26.55,...,5.152669,0.283957,0.815812,0.206851,1.998481,0.0,0.0,1.0,0.0,0


In [33]:
# 8. Dimensionality Reduction: Applying PCA
# PCA requires numeric columns and no missing values
df_pca = df_imputed.select_dtypes(include=[np.number]).dropna()
if df_pca.shape[1] > 1:
    pca = PCA(n_components=2)
    df_pca_reduced = pd.DataFrame(pca.fit_transform(df_pca), columns=['PCA1', 'PCA2'])
else:
    df_pca_reduced = pd.DataFrame()  # Create an empty DataFrame if PCA cannot be applied

df_pca_reduced.head()

Unnamed: 0,PCA1,PCA2
0,-280.788697,-16.598003
1,-279.647117,49.727938
2,-278.7694,-15.148067
3,-277.679821,31.257411
4,-276.728248,-13.285725


In [34]:
# 9. Data Integration: Combining datasets (here we'll merge the DataFrame with itself for illustration)
df_combined = pd.concat([df_encoded, df_encoded], axis=0)
df_combined.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Fare_sqrt,Fare_normalized,Age_normalized,Fare_standardized,Age_standardized,Embarked_C,Embarked_Q,Embarked_S,Embarked_missing,Sex_encoded
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,...,2.692582,0.07754,0.305752,-0.686696,-0.686564,0.0,0.0,1.0,0.0,1
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,...,8.442944,0.762388,0.532445,2.277902,0.50679,1.0,0.0,0.0,0.0,0
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,...,2.815138,0.084759,0.362426,-0.655445,-0.388225,0.0,0.0,1.0,0.0,0
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,...,7.286975,0.567914,0.48994,1.436056,0.283036,0.0,0.0,1.0,0.0,0
4,5.0,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,...,2.837252,0.086096,0.48994,-0.649658,0.283036,0.0,0.0,1.0,0.0,1


In [35]:
# 10. Sampling: Random sampling of the dataset
# Ensure we do not sample more rows than available
sample_size = min(100, df_encoded.shape[0])
df_sampled = df_encoded.sample(n=sample_size)  # Randomly sample 100 rows or less if fewer rows exist

df_sampled.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Fare_sqrt,Fare_normalized,Age_normalized,Fare_standardized,Age_standardized,Embarked_C,Embarked_Q,Embarked_S,Embarked_missing,Sex_encoded
532,533.0,0.0,3.0,"Elias, Mr. Joseph Jr",male,17.0,1.0,1.0,2690.0,7.2292,...,2.688717,0.077318,0.234911,-0.687659,-1.059486,0.0,0.0,1.0,0.0,1
387,388.0,1.0,2.0,"Buss, Miss. Kate",female,36.0,0.0,0.0,27849.0,13.0,...,3.605551,0.139037,0.504109,-0.420484,0.35762,0.0,1.0,0.0,0.0,0
542,,,,,,,,,,,...,,,,,,0.0,0.0,1.0,0.0,2
362,363.0,0.0,3.0,"Barbara, Mrs. (Catherine David)",female,45.0,0.0,1.0,2691.0,14.4542,...,3.801868,0.15459,0.631624,-0.353158,1.028882,0.0,0.0,1.0,0.0,0
366,367.0,1.0,1.0,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60.0,1.0,0.0,110813.0,75.25,...,8.674676,0.804813,0.844148,2.461552,2.14765,1.0,0.0,0.0,0.0,0


In [37]:
# Outputs presented together to see the results of the transformations
print("\nCleaned DataFrame:")
print(df_cleaned.head())
print("\nEncoded DataFrame:")
print(df_encoded.head())
print("\nImputed DataFrame:")
print(df_imputed.head())
print("\nDropped Missing Data DataFrame:")
print(df_dropped.head())
print("\nPCA Reduced DataFrame:")
print(df_pca_reduced.head())
print("\nCombined DataFrame:")
print(df_combined.head())
print("\nSampled DataFrame:")
print(df_sampled.head())



Cleaned DataFrame:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  Fare_log  Fare_sqrt  \
0      0         A/5 21171   7.2500   NaN        S  2.110213   2.692582   
1      0          PC 17599  71.2833   C85        C  4.280593   8.442944   
2      0  STON/O2. 3101282   7.9250   NaN        S  2.188856   2.815138   
