# Setup

In this notebook section, we will import the libraries needed to run this code.

In [89]:
!pip install pandas scikit-learn



In [90]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Constants

In this section all constants are defined

In [91]:
numeric_features = ['pclass', 'age', 'sibsp', 'parch', 'fare']
categorical_features = ['sex', 'embarked', 'title']

# Custom Transformers

Define custom transformers in this section

In [127]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        #print(self.attribute_names)
        return self
    def transform(self, X):
        return X[self.attribute_names]

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        #print(self.most_frequent_)
        return self
    def transform(self, X, y=None):
        X = X.copy()
        for col in X.columns:
            X[col] = X[col].fillna(self.most_frequent_[col])
        return X

class StandardScalerTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.features = X.columns
        return self
    def transform(self, X, y=None):
        standard_scaler = StandardScaler()
        X = X.copy()
        X_transformed = pd.DataFrame(standard_scaler.fit_transform(X[self.features]), columns = self.features)
        return X_transformed

#df_feat_selector = DataFrameSelector(numeric_features)
#df_feat_selector.fit(titanic_data)
#data_pipe1 = df_feat_selector.transform(titanic_data)
#data_pipe1.info()

#mostfrequent_imputer = MostFrequentImputer()
#mostfrequent_imputer.fit(data_pipe1)
#data_pipe2 = mostfrequent_imputer.transform(data_pipe1)
#data_pipe2.info()

#standardscaler_transformer = StandardScalerTransformer()
#standardscaler_transformer.fit(data_pipe2)
#data_pipe3 = standardscaler_transformer.transform(data_pipe2)
#data_pipe3


class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None):
        self.variables = [variables] if not isinstance(variables, list) else variables
        self.X = pd.DataFrame()

    def fit(self, X, y=None):
        self.X = X
        self.dummies = pd.get_dummies(X[self.variables], drop_first=True).columns
        return self

    def transform(self, X):
        self.X = X.copy()
        self.X = pd.concat([self.X, pd.get_dummies(self.X[self.variables], drop_first=True)], axis=1)
        # X.drop([self.variables], axis=1)
        #print(X)

        # Adding missing dummies, if any
        # missing_dummies = [var for var in self.dummies if var not in X.columns]
        # if len(missing_dummies) != 0:
        #    for col in missing_dummies:
        #        X[col] = 0
        return self.X
    
    def droping(self):
        self.X.drop([self.variables], axis = 1)
        return self.X


#print(categorical_features)
#one_encoder = OneHotEncoder(variables=categorical_features)
#one_encoder.fit(titanic_data)
#aux = one_encoder.transform(titanic_data)
#aux

# Pipeline

In [128]:
# Load the dataset

titanic_data = pd.read_csv('raw-data.csv')
# print(titanic_data.head())
# print()
# titanic_data.info()

In [129]:
numeric_pipeline = Pipeline([
    ('selector', DataFrameSelector(numeric_features)),
    ('imputer', MostFrequentImputer()),
    ('scaler', StandardScalerTransformer())
])

categorical_pipeline = Pipeline([
    ('selector', DataFrameSelector(categorical_features)),
    ('imputer', MostFrequentImputer()),
    ('encoder', OneHotEncoder(categorical_features))
])

In [132]:
# preprocessor = Columntransformer(
preprocessor = Pipeline([
    #transformers=[
        ('numeric', numeric_pipeline, numeric_features),
        ('categorical', categorical_pipeline, categorical_features)
        #('categorical_2', categorical_pipeline.named_steps['encoder'].droping())
    #],
    #remainder='passthrough'  # Allows to preserve remaining columns in the output
])

In [133]:
class SVC_Classifier():
    def __init__(self):
        self.svc_model = SVC(gamma = 'auto')

    def fit(self, X, y):
        return self.svc_model.fit(X = X, y = y)

    def predict(self, X):
        return self.svc_model.predict(X = X)
    
    

In [134]:
pipeline = Pipeline([
    # TODO: Add here the preprocessor and a SVC trainer
    ('preprocessor', preprocessor),
    ('classifier', SVC_Classifier())
])

In [135]:
# Separate features (X) and target (y)
X = titanic_data.drop(['survived', 'cabin'], axis=1)  # Exclude 'cabin' column
y = titanic_data['survived']
print("Features - Target defined.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train - Test splitted.")

# Fit the pipeline on the training data
pipeline.named_steps['preprocessor'].fit(X_train, y_train)
print("Pipeline fitted.")

Features - Target defined.
Train - Test splitted.


ValueError: too many values to unpack (expected 2)

In [99]:
# Transform the training and testing data
X_train_transformed = pipeline.named_steps['preprocessor'].transform(X_train, flag_drop = True)
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test, flag_drop = True)
print("Train - Test datasets transformed.")


TypeError: ColumnTransformer.transform() got an unexpected keyword argument 'flag_drop'

In [87]:
pd.DataFrame(X_train_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.840359,-0.88269,-0.495964,-0.442432,-0.495437,male,S,Mr,1,0,1,0,1,0,0
1,-0.355097,0.58346,-0.495964,-0.442432,-0.445125,male,S,Mr,1,0,1,0,1,0,0
2,-1.550554,-0.805524,-0.495964,1.795376,0.89083,female,S,Miss,0,0,1,1,0,0,0
3,-1.550554,1.432284,0.456833,-0.442432,3.747726,male,C,Other,1,0,0,0,0,0,1
4,-1.550554,-0.342529,-0.495964,-0.442432,0.171172,male,S,Mr,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042,0.840359,-0.342529,-0.495964,-0.442432,-0.500588,female,Q,Miss,0,1,0,1,0,0,0
1043,0.840359,-0.805524,-0.495964,-0.442432,-0.497771,female,S,Miss,0,0,1,1,0,0,0
1044,0.840359,0.004717,-0.495964,-0.442432,-0.336935,male,S,Mr,1,0,1,0,1,0,0
1045,0.840359,-0.188198,-0.495964,-0.442432,-0.494873,female,S,Miss,0,0,1,1,0,0,0


In [45]:
# Use the trained classifier to make predictions
pipeline.named_steps['classifier'].fit(X_train_transformed, y_train)
y_pred = pipeline.named_steps['classifier'].predict(X_test_transformed)

# Comparing y_pred with y_test to evaluate the predictions


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: could not convert string to float: 'male'