In [1]:
import pandas as pd

df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [2]:
df['income'].value_counts()

income
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [10]:
df = df.replace('?', pd.NA)
df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [4]:
df = df.dropna()

In [5]:
from sklearn.preprocessing import LabelEncoder

# Zakoduj zmienną docelową
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
# Wybór X i y
X = df.drop('income', axis=1)
y = df['income']
# One-hot encoding (dla zmiennych kategorycznych)
X = pd.get_dummies(X)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid Search z walidacją krzyżową
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1')
grid.fit(X_train, y_train)

# Ocena na danych testowych
y_pred = grid.predict(X_test)
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.89      0.93      0.91      6842\n           1       0.75      0.63      0.68      2203\n\n    accuracy                           0.86      9045\n   macro avg       0.82      0.78      0.80      9045\nweighted avg       0.85      0.86      0.85      9045\n'

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Split numerical and categorical features
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

# Preprocessing
numeric_transformer = Pipeline([
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features)
])

In [9]:
import numpy as np
from sklearn.model_selection import cross_validate

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(kernel='rbf'),
    "Gradient Boosting": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier()
}

# Scoring
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Evaluate models
results = {}

for name, model in models.items():
    clf = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    
    scores = cross_validate(clf, X, y, cv=5, scoring=scoring)
    results[name] = {
        metric: np.mean(scores[f'test_{metric}']) for metric in scoring
    }
    results[name]['fit_time'] = np.mean(scores['fit_time'])
    results[name]['score_time'] = np.mean(scores['score_time'])

# Display results
results_df = pd.DataFrame(results).T
results_df.sort_values(by="f1", ascending=False)

Unnamed: 0,accuracy,precision,recall,f1,fit_time,score_time
Gradient Boosting,0.835302,0.78982,0.457262,0.579153,3.709749,0.041945
Random Forest,0.801601,0.618847,0.51945,0.564791,6.071874,0.220432
KNN,0.798616,0.621776,0.478497,0.540768,0.103619,1.0993
SVM,0.819601,0.78256,0.376872,0.508618,36.876849,15.659439
Logistic Regression,0.809053,0.705918,0.393467,0.505247,0.070572,0.032638
