<a href="https://colab.research.google.com/github/Gani2324/DATA-PIPELINE-DEVELOPMENT/blob/main/Data_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Step 1: Load Data
def load_data(titanic_csv):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    titanic_csv : str, path-like or file-like
        Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # Use the caller's argument: a hard-coded file name here would silently
    # ignore whatever path was passed in.
    return pd.read_csv(titanic_csv)

In [None]:
# Step 2: Preprocessing
# Each group of columns gets its own small pipeline; the two pipelines are
# then merged into a single ColumnTransformer.

# Numeric columns: fill missing values with the column mean, then scale.
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode (unknown categories are set to be ignored by the encoder).
categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer. Only the columns listed in the
# two feature lists above are assigned a transformer here.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [None]:
# Step 3: Define Pipeline
def create_pipeline(model):
    """Chain the module-level preprocessor with the given estimator.

    Parameters
    ----------
    model : estimator
        Object installed as the final ``'model'`` step of the pipeline.

    Returns
    -------
    Pipeline
        A two-step pipeline: ``'preprocessor'`` followed by ``'model'``.

    Notes
    -----
    The module-level ``preprocessor`` object is passed in as-is (it is not
    copied here), so every pipeline built by this function references the
    same preprocessor instance.
    """
    stages = [
        ('preprocessor', preprocessor),
        ('model', model),
    ]
    return Pipeline(steps=stages)

In [None]:
# Step 4: Split Data
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Separate features from the target and split into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target_column : str
        Name of the column to predict; it is removed from the features.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``. The default matches the value that
        was previously hard-coded.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``. The default matches the value that
        was previously hard-coded.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df`` (raised by
        ``DataFrame.drop``).
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

In [None]:
# Step 5: Fit and Evaluate Pipeline
def fit_and_evaluate(pipeline, X_train, X_test, y_train, y_test):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Parameters
    ----------
    pipeline : object with ``fit(X, y)`` and ``score(X, y)`` methods
        The estimator or pipeline to train; it is fitted in place.
    X_train, y_train
        Training features and target, passed to ``pipeline.fit``.
    X_test, y_test
        Held-out features and target, passed to ``pipeline.score``.

    Returns
    -------
    The value returned by ``pipeline.score(X_test, y_test)``. It is also
    printed as ``"Model Score: <score>"``.
    """
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print(f"Model Score: {score}")
    # Hand the score back so callers can compare models or assert on it,
    # instead of it being available only in the printed output.
    return score

In [None]:
# Example Usage
if __name__ == "__main__":
    # The classifier used in this example is imported locally.
    from sklearn.ensemble import RandomForestClassifier

    # Load the dataset and discard the columns not needed for this example.
    data = load_data("titanic.csv").drop(columns=['Name', 'Ticket', 'Cabin'])

    # Hold out a test set, with 'Survived' as the prediction target.
    X_train, X_test, y_train, y_test = split_data(data, 'Survived')

    # Random Forest with a fixed seed, wrapped together with the preprocessing.
    model = RandomForestClassifier(random_state=42)
    pipeline = create_pipeline(model)

    # Train on the training split and report the score on the test split.
    fit_and_evaluate(pipeline, X_train, X_test, y_train, y_test)


Model Score: 0.7877094972067039
