In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC


In [2]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 

# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

df = X.join(y)

# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables)

# drop rows with missing values
df = df.dropna()

# drop duplicates
df = df.drop_duplicates()

# drop redundant columns
df = df.drop('education', axis=1)

# Format column, get rid of variations
df['income'] = df['income'].replace('>50K.', '>50K')
df['income'] = df['income'].replace('<=50K.', '<=50K')

# Binarize column
df['income'] = np.where(df['income'] == '>50K', 1, 0)
df.rename(columns={'income':'income>50K'}, inplace=True)

# Extract features and target
df2 = df.copy()
y2 = df2.iloc[:,-1:]
X2 = df2.iloc[:,1:-1]

# Create encoder
encoder = LabelEncoder()

# Encode categorical columns
X2['workclass'] = encoder.fit_transform(X2['workclass'])
X2['occupation'] = encoder.fit_transform(X2['occupation'])
X2['marital-status'] = encoder.fit_transform(X2['marital-status'])
X2['relationship'] = encoder.fit_transform(X2['relationship'])
X2['race'] = encoder.fit_transform(X2['race'])
X2['native-country'] = encoder.fit_transform(X2['native-country'])
X2['sex'] = encoder.fit_transform(X2['sex'])

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Initialize the Support Vector Machine model
model = SVC(kernel='linear')  # Using a linear kernel; you can try 'rbf' or others

# Train the model
model.fit(X_train_scaled, y_train)

# Predict on the testing set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

  y = column_or_1d(y, warn=True)


Accuracy: 0.8076478621703961
Confusion Matrix:
[[6993  181]
 [1650  695]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.97      0.88      7174
           1       0.79      0.30      0.43      2345

    accuracy                           0.81      9519
   macro avg       0.80      0.64      0.66      9519
weighted avg       0.81      0.81      0.77      9519

