# Census Income

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Step 1: Load the datasets
train_file_path = 'Census_Income_Data/adult.csv'  # Update with your actual path
test_file_path = 'Census_Income_Data/adult.test.csv'  # Update with your actual path

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

In [3]:
# Step 2: Clean column names by removing leading/trailing spaces
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

In [4]:
# Step 3: Check for 'Income' column in both train and test datasets
print("Train DataFrame Columns after cleaning:", train_df.columns)
print("Test DataFrame Columns after cleaning:", test_df.columns)

# Ensure the 'Income' column exists in both datasets
if 'Income' not in test_df.columns:
    print("Income column not found in test_df, renaming the last column.")
    test_df.columns = train_df.columns

Train DataFrame Columns after cleaning: Index(['Age', 'Workclass', 'Final Weight', 'Education', 'EducationNum',
       'Marital Status', 'Occupation', 'Relationship', 'Race', 'Gender',
       'Capital Gain', 'capital loss', 'Hours per Week', 'Native Country',
       'Income'],
      dtype='object')
Test DataFrame Columns after cleaning: Index(['25', 'Private', '226802', '11th', '7', 'Never-married',
       'Machine-op-inspct', 'Own-child', 'Black', 'Male', '0', '0.1', '40',
       'United-States', '<=50K.'],
      dtype='object')
Income column not found in test_df, renaming the last column.


In [5]:
# Step 4: Inspect the 'Income' column values
print("Train DataFrame 'Income' column unique values before mapping:", train_df['Income'].unique())
print("Test DataFrame 'Income' column unique values before mapping:", test_df['Income'].unique())

Train DataFrame 'Income' column unique values before mapping: [' <=50K' ' >50K']
Test DataFrame 'Income' column unique values before mapping: [' <=50K.' ' >50K.']


In [6]:
# Step 5: Clean 'Income' values (remove periods) and map to binary values (<=50K: 0, >50K: 1)
train_df['Income'] = train_df['Income'].str.replace('.', '', regex=False).str.strip()
test_df['Income'] = test_df['Income'].str.replace('.', '', regex=False).str.strip()

train_df['income'] = train_df['Income'].map({'>50K': 1, '<=50K': 0})
test_df['income'] = test_df['Income'].map({'>50K': 1, '<=50K': 0})

In [7]:
# Step 6: Check if there are any NaN values in the 'income' columns after mapping
print(f"Missing values in train target (y_train) after mapping: {train_df['income'].isnull().sum()}")
print(f"Missing values in test target (y_test) after mapping: {test_df['income'].isnull().sum()}")

Missing values in train target (y_train) after mapping: 0
Missing values in test target (y_test) after mapping: 0


In [8]:
# Step 7: Handle any NaN values in y_test (fill with the mode)
if test_df['income'].isnull().sum() > 0:
    mode_y_test = test_df['income'].mode()[0]  # Get the most frequent value
    test_df['income'].fillna(mode_y_test, inplace=True)

# Step 8: Verify no missing values in the target variables
print(f"Missing values in train target (y_train) after filling: {train_df['income'].isnull().sum()}")
print(f"Missing values in test target (y_test) after filling: {test_df['income'].isnull().sum()}")

Missing values in train target (y_train) after filling: 0
Missing values in test target (y_test) after filling: 0


In [9]:
# Step 9: Prepare the features (X) and target (y)
X_train = train_df.drop('income', axis=1)  # Features (X) - dropping the target column
y_train = train_df['income']  # Target (y)
X_test = test_df.drop('income', axis=1)  # Features (X) - dropping the target column
y_test = test_df['income']  # Target (y)

# Ensure features in X_train and X_test match
X_test = X_test[X_train.columns]  # Reorder columns in X_test to match X_train

In [10]:
# Step 10: Preprocess the data (scaling numerical features, one-hot encoding categorical features)
categorical_cols = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Gender', 'Native Country']

# Create a preprocessing pipeline for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'Final Weight', 'EducationNum', 'Capital Gain', 'capital loss', 'Hours per Week']),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ])

# Apply the preprocessing pipeline to both training and testing data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [11]:
# Step 11: Verify the shapes of processed data
print("Processed Train feature matrix shape:", X_train_processed.shape)
print("Processed Test feature matrix shape:", X_test_processed.shape)

Processed Train feature matrix shape: (32561, 100)
Processed Test feature matrix shape: (16280, 100)


In [12]:
# Step 12: Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Train the model
model.fit(X_train_processed, y_train)

In [13]:
# Step 13: Make predictions on the test set
y_pred = model.predict(X_test_processed)

In [14]:
# Step 14: Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.8091523341523341
Confusion Matrix:
[[9937 2497]
 [ 610 3236]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.80      0.86     12434
           1       0.56      0.84      0.68      3846

    accuracy                           0.81     16280
   macro avg       0.75      0.82      0.77     16280
weighted avg       0.85      0.81      0.82     16280

