# Setup

In this notebook section, we will import the libraries needed to run this code.

In [1]:
!pip install pandas scikit-learn



In [2]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Constants

In this section all constants are defined

In [None]:
numeric_features = ['pclass', 'age', 'sibsp', 'parch', 'fare']
categorical_features = ['sex', 'embarked', 'title']

# Custom Transformers

Define custom transformers in this section

In [8]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        #print(self.attribute_names)
        return self
    def transform(self, X):
        return X[self.attribute_names]

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        #print(self.most_frequent_)
        return self
    def transform(self, X, y=None):
        X = X.copy()
        for col in X.columns:
            X[col] = X[col].fillna(self.most_frequent_[col])
        return X

class StandardScalerTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.features = X.columns
        return self
    def transform(self, X, y=None):
        standard_scaler = StandardScaler()
        X = X.copy()
        X_transformed = pd.DataFrame(standard_scaler.fit_transform(X[self.features]), columns = self.features)
        return X_transformed

#df_feat_selector = DataFrameSelector(numeric_features)
#df_feat_selector.fit(titanic_data)
#data_pipe1 = df_feat_selector.transform(titanic_data)
#data_pipe1.info()

#mostfrequent_imputer = MostFrequentImputer()
#mostfrequent_imputer.fit(data_pipe1)
#data_pipe2 = mostfrequent_imputer.transform(data_pipe1)
#data_pipe2.info()

#standardscaler_transformer = StandardScalerTransformer()
#standardscaler_transformer.fit(data_pipe2)
#data_pipe3 = standardscaler_transformer.transform(data_pipe2)
#data_pipe3


class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None):
        self.variables = [variables] if not isinstance(variables, list) else variables

    def fit(self, X, y=None):
        self.dummies = pd.get_dummies(X[self.variables], drop_first=True).columns
        return self

    def transform(self, X):
        X = X.copy()
        X = pd.concat([X, pd.get_dummies(X[self.variables], drop_first=True)], axis=1)
        X.drop(self.variables, axis=1)

        # Adding missing dummies, if any
        missing_dummies = [var for var in self.dummies if var not in X.columns]
        if len(missing_dummies) != 0:
            for col in missing_dummies:
                X[col] = 0

        return X

#print(categorical_features)
#one_encoder = OneHotEncoder(variables=categorical_features)
#one_encoder.fit(titanic_data)
#aux = one_encoder.transform(titanic_data)
#aux

# Pipeline

In [23]:
# Load the dataset

titanic_data = pd.read_csv('raw-data.csv')
# print(titanic_data.head())
# print()
# titanic_data.info()

In [13]:
numeric_pipeline = Pipeline([
    ('selector', DataFrameSelector(numeric_features)),
    ('imputer', MostFrequentImputer()),
    ('scaler', StandardScalerTransformer())
])

categorical_pipeline = Pipeline([
    ('selector', DataFrameSelector(categorical_features)),
    ('imputer', MostFrequentImputer()),
    ('encoder', OneHotEncoder(categorical_features))
])

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, numeric_features),
        ('categorical', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough'  # Allows to preserve remaining columns in the output
)

In [2]:
pipeline = Pipeline([
    # TODO: Add here the preprocessor and a SVC trainer
    ('preprocessor', `ADD HERE YOUR CODE`),
    ('classifier', `ADD HERE YOUR CODE`)
])

# Separate features (X) and target (y)
X = titanic_data.drop(['survived', 'cabin'], axis=1)  # Exclude 'cabin' column
y = titanic_data['survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Transform the training and testing data
X_train_transformed = pipeline.named_steps['preprocessor'].transform(X_train)
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test)

# Transform the testing data
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test)

# Use the trained classifier to make predictions
y_pred = pipeline.named_steps['classifier'].predict(X_test_transformed)

# Comparing y_pred with y_test to evaluate the predictions


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

SyntaxError: invalid syntax (2477466498.py, line 39)