In [48]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

np.random.seed(0)

In [49]:
import pandas as pd
data = pd.read_csv(r"C:\Users\mk744\OneDrive - Poornima University\Desktop\Data Files\test.csv")
data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [50]:
data.columns


Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [51]:
data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [52]:
X = data[['Age', 'Fare']]
X

Unnamed: 0,Age,Fare
0,34.5,7.8292
1,47.0,7.0000
2,62.0,9.6875
3,27.0,8.6625
4,22.0,12.2875
...,...,...
413,,8.0500
414,39.0,108.9000
415,38.5,7.2500
416,,8.0500


In [53]:
y = data[['Embarked', 'Sex', 'Pclass']]
y

Unnamed: 0,Embarked,Sex,Pclass
0,Q,male,3
1,S,female,3
2,Q,male,2
3,S,male,3
4,S,female,3
...,...,...,...
413,S,male,3
414,C,female,1
415,S,male,3
416,S,male,3


In [55]:
numeric_transformer = Pipeline(steps=[
     ("num", SimpleImputer(strategy="median")),
     ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown='ignore')),
                                          ("selector", SelectPercentile(chi2, percentile=50))])
preprocessor = ColumnTransformer(
     transformers = [
          ("num", numeric_transformer, X),
          ("cat", categorical_transformer, y)
     ]
)
clf = Pipeline(steps=[
     ("preprocessor", preprocessor),
     ("classfier", LogisticRegression())
])

clf

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Define your feature columns
numeric_features = ['Age', 'Fare']  # Replace with actual numeric column names
categorical_features = ['Embarked', 'Sex']  # Replace with actual categorical column names

# Define transformations
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
    ("selector", SelectPercentile(chi2, percentile=50))
])

# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Build the final pipeline
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['Pclass']), data['Pclass'], test_size=0.20, random_state=0)

# Fit and evaluate the model
clf.fit(X_train, y_train)
print("Model score: %.3f" % clf.score(X_test, y_test))


Model score: 0.702


# DictVectorizer

In [59]:
measurements = [{'city':'Dubai',
                 "temperature":33., 
                 'city':'London',
                 'temperature':12.,
                 'city':'San Francisco',
                 'temperature':18.}]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
vec.fit_transform(measurements).toarray()


array([[ 1., 18.]])

In [60]:
vec.get_feature_names_out()

array(['city=San Francisco', 'temperature'], dtype=object)