First step, import libraries and classes.

In [34]:
import pandas as pd # library
import openml # library

from sklearn.preprocessing import StandardScaler # class... and so on.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from category_encoders.binary import BinaryEncoder
from category_encoders.hashing import HashingEncoder

Next download and import data set from open source, and create dataframe.

In [35]:
dataset = openml.datasets.get_dataset(1590)
adult_census, *_ = dataset.get_data()
adult_census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802.0,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40,United-States,<=50K
1,38,Private,89814.0,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50,United-States,<=50K
2,28,Local-gov,336951.0,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40,United-States,>50K
3,44,Private,160323.0,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40,United-States,>50K
4,18,,103497.0,Some-college,10,Never-married,,Own-child,White,Female,0.0,0.0,30,United-States,<=50K


now create a schema to easily read data and meta data 

In [36]:
schema = [
    {
    'Columns' : feature.name,
    'Data Type': feature.data_type,
    'Is Categorical': "Yes" if feature.data_type == "nominal" else "No",
    'Possible Values' : feature.nominal_values if feature.data_type == "nominal" else "N/A",
} for feature in dataset.features.values() 
]


In [60]:
schema_df = pd.DataFrame(schema)
schema_df



Unnamed: 0,Columns,Data Type,Is Categorical,Possible Values
0,age,numeric,No,
1,workclass,nominal,Yes,"[Federal-gov, Local-gov, Never-worked, Private..."
2,fnlwgt,numeric,No,
3,education,nominal,Yes,"[10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, ..."
4,education-num,numeric,No,
5,marital-status,nominal,Yes,"[Divorced, Married-AF-spouse, Married-civ-spou..."
6,occupation,nominal,Yes,"[Adm-clerical, Armed-Forces, Craft-repair, Exe..."
7,relationship,nominal,Yes,"[Husband, Not-in-family, Other-relative, Own-c..."
8,race,nominal,Yes,"[Amer-Indian-Eskimo, Asian-Pac-Islander, Black..."
9,sex,nominal,Yes,"[Female, Male]"


In [38]:
target_name = 'class'
target = adult_census[target_name]  # Extract the target column first
adult_census = adult_census.drop(columns=[target_name])  # Drop the column without inplace=True
print(adult_census.columns)  # Verify that 'class' is removed
target


Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')


0        <=50K
1        <=50K
2         >50K
3         >50K
4        <=50K
         ...  
48837    <=50K
48838     >50K
48839    <=50K
48840    <=50K
48841     >50K
Name: class, Length: 48842, dtype: category
Categories (2, object): ['>50K' < '<=50K']

Extract all existing numerical columns and create an array of type, these will be scaled and applied to updated dataframe.

In [39]:
numeric_cols = adult_census.select_dtypes(include=['int64', 'float64']).columns.tolist() # use select_dtypes to extract columns with float and integer values. 

Remove redundent cols, e.g education and education nums can be merged and intepreted as the same.

In [40]:
adult_census_data = adult_census.drop(columns= ['education'])
print(type(adult_census_data))  # Should be <class 'pandas.DataFrame'>
print(adult_census_data.shape)  # Ensure it has more than one column
print(type(target))  # Should be <class 'pandas.Series'>
print(target.shape)  # Should be (n_samples,)



<class 'pandas.core.frame.DataFrame'>
(48842, 13)
<class 'pandas.core.series.Series'>
(48842,)


In [41]:
adult_census_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

require hashing encoder wrapped in function transforms. and for frequency encoding

In [57]:
# need to revist and learn all this properly!!!!

def hash_encode(X):
    encoder = HashingEncoder(n_components=8)  # Adjust the number of hash features,check hashing formula and table for collisions against computains required.
    return encoder.fit_transform(X)

hash_transformer = FunctionTransformer(hash_encode, validate=False)

# Function for Frequency Encoding
# def frequency_encode(X):
#     # Compute frequency of each category
#     freq_map = X.value_counts(normalize=True).to_dict()
#     encoded = X.map(freq_map)
#     # Ensure the result is a DataFrame with the same index and a proper column name
#     return pd.DataFrame(encoded, columns=[X.name], index=X.index)


# # Wrap in FunctionTransformer
# freq_transformer = FunctionTransformer(lambda X: X.apply(frequency_encode), validate=False)

#adult_census_data = adult_census_data.drop(columns= ['native-country'])
adult_census_data



Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week
0,25,Private,226802.0,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40
1,38,Private,89814.0,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50
2,28,Local-gov,336951.0,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40
3,44,Private,160323.0,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40
4,18,,103497.0,10,Never-married,,Own-child,White,Female,0.0,0.0,30
...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302.0,12,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38
48838,40,Private,154374.0,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40
48839,58,Private,151910.0,9,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40
48840,22,Private,201490.0,9,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20


In [61]:
'''Creat an object *preprocesser_transformer* of class type ColumnTransformer, 
pass an argument of transformer method with all the encoder objects for each column and scaler objects '''

preprocessor_transformer = ColumnTransformer(transformers=
[
    ('workclass', BinaryEncoder(), ['workclass']),
    ('marital-status', OneHotEncoder(), ['marital-status']),
    ('occupation', hash_transformer, ['occupation']),
    ('relationship', OneHotEncoder(), ['relationship']),
    ('race', OneHotEncoder(), ['race']),
    ('sex', OneHotEncoder(), ['sex']),
    ('scaler', StandardScaler(), numeric_cols)
],
    remainder='passthrough'  # Keep any remaining columns unchanged
)

In [62]:
# transformed_data = preprocessor_transformer.fit_transform(adult_census_data)
# print(type(transformed_data))  # Should be numpy.ndarray or DataFrame
# print(transformed_data.shape)  # Should be (48842, n_transformed_features)
for name, transformer, cols in preprocessor_transformer.transformers:
    print(f"Testing transformer: {name}")
    try:
        transformed = transformer.fit_transform(adult_census_data[cols])
        print(f"Output type: {type(transformed)}")
        print(f"Output shape: {transformed.shape}\n")
    except Exception as e:
        print(f"Error in transformer {name}: {e}\n")


Testing transformer: workclass
Output type: <class 'pandas.core.frame.DataFrame'>
Output shape: (48842, 4)

Testing transformer: marital-status
Output type: <class 'scipy.sparse._csr.csr_matrix'>
Output shape: (48842, 7)

Testing transformer: occupation
Output type: <class 'pandas.core.frame.DataFrame'>
Output shape: (48842, 8)

Testing transformer: relationship
Output type: <class 'scipy.sparse._csr.csr_matrix'>
Output shape: (48842, 6)

Testing transformer: race
Output type: <class 'scipy.sparse._csr.csr_matrix'>
Output shape: (48842, 5)

Testing transformer: sex
Output type: <class 'scipy.sparse._csr.csr_matrix'>
Output shape: (48842, 2)

Testing transformer: scaler
Output type: <class 'numpy.ndarray'>
Output shape: (48842, 3)



instantiate object *model using pipe line class type

In [75]:
model = Pipeline(steps= [
    ('preprocessor', preprocessor_transformer),
    ('classifier', LogisticRegression(max_iter=5000))
])

cross validate using training data.


In [76]:
cv_results = cross_validate(model, adult_census_data, target)
cv_results

{'fit_time': array([4.76848412, 2.0944612 , 4.67209649, 4.67712665, 4.5478096 ]),
 'score_time': array([0.08010674, 0.07871556, 0.0790782 , 0.08057475, 0.07948351]),
 'test_score': array([0.84563415, 0.84420104, 0.84449222, 0.84797297, 0.8519656 ])}

In [77]:
scores = cv_results["test_score"]
# scores.mean give the probiltiy of prediction to 3 decimal places,  with scores.std giving the standard deviation to 3 decimal places.
print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")

The accuracy is: 0.847 ± 0.003
