In [1]:
import pandas as pd

# Указать путь к данным
data_path = "C:/Users/anya8/05_src/data/adult/adult.data"

# Загрузить данные в DataFrame
columns = ["age", "workclass", "fnlwgt", "education", "education-num", 
           "marital-status", "occupation", "relationship", "race", 
           "sex", "capital-gain", "capital-loss", "hours-per-week", 
           "native-country", "income"]

df = pd.read_csv(data_path, header=None, names=columns, na_values=" ?")

# Проверить загруженные данные
print(df.head())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [5]:
import pandas as pd

# Define the column names for the dataset
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country', 'income'
]

# Load the adult data from the CSV file located at the specified path
adult_dt = (pd.read_csv('C:/Users/anya8/05_src/data/adult/adult.data', header=None, names=columns)
              .assign(income=lambda x: (x.income.str.strip() == '>50K') * 1))

# Display the first few rows of the dataset to verify the loading process
print(adult_dt.head())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States       

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Create the features dataframe (X) and target dataframe (Y)
X = adult_dt.drop(columns=['income'])  # Features dataframe excluding the target variable
Y = adult_dt['income']  # Target variable (income)

# Split the data into training and testing sets (70-30%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Display the shapes of the resulting datasets
print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")

# Comments:
# The random state of the splitting function is a parameter that controls the shuffling applied to the data
# before splitting. Setting it to a specific number (e.g., 42) ensures that the results are reproducible.
# It is useful because it allows for consistent splitting of the data across different runs, 
# making it easier to debug and compare results.

Training data shape: (22792, 14), Test data shape: (9769, 14)


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer  # Import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define numerical and categorical columns
numerical_cols = [
    'age', 'fnlwgt', 'education-num', 'capital-gain', 
    'capital-loss', 'hours-per-week'
]
categorical_cols = [
    'workclass', 'education', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'native-country'
]

# Create a Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 
         Pipeline(steps=[
             ('imputer', KNNImputer(n_neighbors=7, weights='distance')),
             ('scaler', StandardScaler())
         ]), numerical_cols),
        
        ('cat', 
         Pipeline(steps=[
             ('imputer', SimpleImputer(strategy='most_frequent')),  # Most frequent imputation for categorical
             ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))  # One-hot encoding
         ]), categorical_cols)
    ]
)

# Fit and transform the training data
X_train_transformed = preprocessor.fit_transform(X_train)
# Transform the test data (without fitting again)
X_test_transformed = preprocessor.transform(X_test)

# Check the shape of the transformed data
print(f"Transformed training data shape: {X_train_transformed.shape}")
print(f"Transformed testing data shape: {X_test_transformed.shape}")

Transformed training data shape: (22792, 100)
Transformed testing data shape: (9769, 100)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [10]:
# Assuming you have X and Y defined earlier
X = adult_dt.drop(columns=['income'])  # Features dataframe
Y = adult_dt['income']  # Target variable

# Split the data into training and testing sets (70-30%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [11]:
# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

# Fit the model on the training data
model.fit(X_train_transformed, Y_train)

In [12]:
# Make predictions on the test set
Y_pred = model.predict(X_test_transformed)

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate a classification report
print("Classification Report:")
print(classification_report(Y_test, Y_pred))

# Generate a confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))

Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      7455
           1       0.74      0.60      0.66      2314

    accuracy                           0.86      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.86      0.85      9769

Confusion Matrix:
[[6958  497]
 [ 915 1399]]


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# Define numerical and categorical columns
numerical_cols = [
    'age', 'fnlwgt', 'education-num', 'capital-gain', 
    'capital-loss', 'hours-per-week'
]
categorical_cols = [
    'workclass', 'education', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'native-country'
]

# Create a Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 
         Pipeline(steps=[
             ('imputer', KNNImputer(n_neighbors=7, weights='distance')),
             ('scaler', RobustScaler())
         ]), numerical_cols),

        ('cat', 
         Pipeline(steps=[
             ('imputer', SimpleImputer(strategy='most_frequent')),  # Most frequent imputation for categorical
             ('onehot', OneHotEncoder(handle_unknown='ignore', drop='if_binary'))  # One-hot encoding
         ]), categorical_cols)
    ]
)


In [17]:
# Fit and transform the training data
X_train_transformed = preprocessor.fit_transform(X_train)
# Transform the test data (without fitting again)
X_test_transformed = preprocessor.transform(X_test)

# Model fitting and evaluation (as before)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_transformed, Y_train)
Y_pred = model.predict(X_test_transformed)

accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(Y_test, Y_pred))
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))

Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      7455
           1       0.74      0.61      0.67      2314

    accuracy                           0.86      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.86      0.85      9769

Confusion Matrix:
[[6952  503]
 [ 909 1405]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train_transformed, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# Define numerical and categorical columns
numerical_cols = [
    'age', 'fnlwgt', 'education-num', 'capital-gain', 
    'capital-loss', 'hours-per-week'
]
categorical_cols = [
    'workclass', 'education', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'native-country'
]

# Create the numerical and categorical transformers
num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=7, weights='distance')),
    ('scaler', RobustScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# Combine into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transforms', num_transformer, numerical_cols),
        ('cat_transforms', cat_transformer, categorical_cols)
    ]
)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit the preprocessor to the training data and transform
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Initialize LogisticRegression with increased max_iter
model = LogisticRegression(max_iter=2000, solver='saga')  # 'saga' solver is robust for larger datasets
model.fit(X_train_transformed, Y_train)

# Predict and evaluate
Y_pred = model.predict(X_test_transformed)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(Y_test, Y_pred))
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))

Accuracy: 0.77
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      7455
           1       0.52      0.40      0.45      2314

    accuracy                           0.77      9769
   macro avg       0.68      0.64      0.65      9769
weighted avg       0.76      0.77      0.76      9769

Confusion Matrix:
[[6617  838]
 [1389  925]]




In [22]:
model = LogisticRegression(max_iter=5000, solver='saga')

In [23]:
model = LogisticRegression(max_iter=5000, solver='newton-cg')

In [24]:
from sklearn.preprocessing import StandardScaler

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=7, weights='distance')),
    ('scaler', StandardScaler())
])


In [25]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train_transformed, Y_train)

In [27]:
print(preprocessor)

ColumnTransformer(transformers=[('num_transforms',
                                 Pipeline(steps=[('imputer',
                                                  KNNImputer(n_neighbors=7,
                                                             weights='distance')),
                                                 ('scaler', RobustScaler())]),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('cat_transforms',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'))]),
                           

In [None]:
model = LogisticRegression(max_iter=5000, solver='saga', class_weight='balanced')


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Создаем модельный конвейер
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),  # Используем ColumnTransformer, созданный ранее
    ('classifier', RandomForestClassifier(random_state=42))  # RandomForestClassifier
])

# Обучаем модель на тренировочных данных
model_pipeline.fit(X_train, Y_train)

# Делаем предсказания на тестовых данных
Y_pred = model_pipeline.predict(X_test)

# Оцениваем модель
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(Y_test, Y_pred))

print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))

Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7455
           1       0.73      0.63      0.68      2314

    accuracy                           0.86      9769
   macro avg       0.81      0.78      0.79      9769
weighted avg       0.85      0.86      0.85      9769

Confusion Matrix:
[[6909  546]
 [ 856 1458]]


In [29]:
from sklearn.model_selection import cross_validate

# Оценка модели с помощью кросс-валидации
cv_results = cross_validate(model_pipeline, X_train, Y_train, cv=5, 
                             scoring=['neg_log_loss', 'roc_auc', 'accuracy', 'balanced_accuracy'],
                             return_train_score=True)

# Преобразуем результаты в pandas DataFrame для удобства визуализации
cv_results_df = pd.DataFrame(cv_results)

# Сортируем результаты по negative log loss на тестовом (валидационном) наборе
cv_results_df_sorted = cv_results_df.sort_values(by='test_neg_log_loss')

# Отображаем отсортированные результаты
print(cv_results_df_sorted)



   fit_time  score_time  test_neg_log_loss  train_neg_log_loss  test_roc_auc  \
2  7.950742    0.186199          -0.399232           -0.081144      0.903361   
1  7.917442    0.188098          -0.387317           -0.081072      0.901907   
4  8.030086    0.212924          -0.379476           -0.081183      0.902318   
3  8.083104    0.192532          -0.363069           -0.081288      0.907232   
0  7.835253    0.191042          -0.341857           -0.081550      0.905032   

   train_roc_auc  test_accuracy  train_accuracy  test_balanced_accuracy  \
2            1.0       0.855200        0.999890                0.775818   
1            1.0       0.847993        1.000000                0.767523   
4            1.0       0.856516        1.000000                0.776379   
3            1.0       0.863098        1.000000                0.788723   
0            1.0       0.852380        0.999945                0.776872   

   train_balanced_accuracy  
2                 0.999851  
1         

In [30]:
# Вычисляем среднее значение каждой метрики
mean_metrics = cv_results_df.mean()
print("Mean metrics:")
print(mean_metrics)

Mean metrics:
fit_time                   7.963325
score_time                 0.194159
test_neg_log_loss         -0.374190
train_neg_log_loss        -0.081247
test_roc_auc               0.903970
train_roc_auc              1.000000
test_accuracy              0.855037
train_accuracy             0.999967
test_balanced_accuracy     0.777063
train_balanced_accuracy    0.999948
dtype: float64


In [31]:
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score, balanced_accuracy_score

# Получаем вероятности предсказаний
Y_pred_proba = model_pipeline.predict_proba(X_test)

# Рассчитываем метрики производительности на тестовых данных
neg_log_loss_test = log_loss(Y_test, Y_pred_proba)
roc_auc_test = roc_auc_score(Y_test, Y_pred_proba[:, 1])
accuracy_test = accuracy_score(Y_test, model_pipeline.predict(X_test))
balanced_accuracy_test = balanced_accuracy_score(Y_test, model_pipeline.predict(X_test))

# Создаем словарь с результатами
results_dict = {
    'Negative Log Loss': neg_log_loss_test,
    'ROC AUC': roc_auc_test,
    'Accuracy': accuracy_test,
    'Balanced Accuracy': balanced_accuracy_test
}

# Отображаем результаты
print("Test performance metrics:")
print(results_dict)

Test performance metrics:
{'Negative Log Loss': 0.38733674303408167, 'ROC AUC': 0.9018825137514803, 'Accuracy': 0.8564847988535162, 'Balanced Accuracy': 0.7784191753807199}


Recoding the target variable income immediately after loading the data is convenient because it allows for straightforward processing of the categorical variable into a binary format (0 and 1). This transformation simplifies subsequent modeling tasks since many machine learning algorithms require numerical input. By doing this during data loading, we ensure that the dataset is ready for analysis and modeling without requiring additional preprocessing steps later on.