In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

In [6]:
# Load Titanic dataset from GitHub
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Display first 5 rows
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Define numerical & categorical columns
num_features = ['Age', 'Fare']
cat_features = ['Sex', 'Embarked']

In [8]:
# Pipeline for numerical features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Pipeline for categorical features
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [9]:
# Apply transformation
processed_data = preprocessor.fit_transform(df)

# Convert to DataFrame
processed_df = pd.DataFrame(processed_data)

# Show first 5 rows of processed data
processed_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.592481,-0.502445,0.0,1.0,0.0,0.0,1.0
1,0.638789,0.786845,1.0,0.0,1.0,0.0,0.0
2,-0.284663,-0.488854,1.0,0.0,0.0,0.0,1.0
3,0.407926,0.42073,1.0,0.0,0.0,0.0,1.0
4,0.407926,-0.486337,0.0,1.0,0.0,0.0,1.0


In [10]:
# Get current folder where .ipynb is saved
current_folder = os.getcwd()
output_file = os.path.join(current_folder, "processed_data.csv")

# Save processed data
processed_df.to_csv(output_file, index=False)

print(f"✅ Processed data saved at: {output_file}")

✅ Processed data saved at: c:\Users\DELL\Desktop\elitetech internship\processed_data.csv
