In [1]:
import pandas as pd

df = pd.read_csv("../data/income/adult.data")

In [2]:
df.shape

(32560, 15)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   39              32560 non-null  int64 
 1    State-gov      32560 non-null  object
 2    77516          32560 non-null  int64 
 3    Bachelors      32560 non-null  object
 4    13             32560 non-null  int64 
 5    Never-married  32560 non-null  object
 6    Adm-clerical   32560 non-null  object
 7    Not-in-family  32560 non-null  object
 8    White          32560 non-null  object
 9    Male           32560 non-null  object
 10   2174           32560 non-null  int64 
 11   0              32560 non-null  int64 
 12   40             32560 non-null  int64 
 13   United-States  32560 non-null  object
 14   <=50K          32560 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [5]:
# เนื่องจากข้อมูลก่อนหน้าไม่มี feature label จะทำการเรียกข้อมูลใหม่โดยใส่ feature label จาก adult.name
def read_adult_names_file(file_path):
    with open(file_path, "r") as file:
        content = file.read()

    feature_section = content.split(">50K, <=50K.")[1].strip()

    features = []

    for line in feature_section.split("\n"):
        if line.strip():
            parts = line.split(": ")
            if len(parts) == 2:
                feature_name = parts[0].strip()
                features.append(feature_name)

    features.append("income")

    return features


column_names = read_adult_names_file("../data/income/adult.names")
df = pd.read_csv(
    "../data/income/adult.data",
    header=None,
    names=column_names,
    sep=", ",
    engine="python",
)

In [6]:
df.head()
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [7]:
for column in df.columns:
    j = df[column].value_counts(dropna=False)
    print(column, ":", j)
    print("----------------------------------------")

age : age
36    898
31    888
34    886
23    877
35    876
     ... 
83      6
88      3
85      3
86      1
87      1
Name: count, Length: 73, dtype: int64
----------------------------------------
workclass : workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
----------------------------------------
fnlwgt : fnlwgt
164190    13
203488    13
123011    13
148995    12
121124    12
          ..
232784     1
325573     1
140176     1
318264     1
257302     1
Name: count, Length: 21648, dtype: int64
----------------------------------------
education : education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school      

In [8]:
for column in df.columns:
    df.drop(df[df[column] == "?"].index, inplace=True)
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [9]:
def map_data(x):
    convert = x.unique()
    return x.map(dict(zip(convert, range(1, len(convert) + 1))))


for column in df.columns:
    if df[column].dtype == "object":
        df[column] = map_data(df[column])
df.astype(int)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,1,77516,1,13,1,1,1,1,1,2174,0,40,1,1
1,50,2,83311,1,13,2,2,2,1,1,0,0,13,1,1
2,38,3,215646,2,9,3,3,1,1,1,0,0,40,1,1
3,53,3,234721,3,7,2,3,2,2,1,0,0,40,1,1
4,28,3,338409,1,13,2,4,3,2,2,0,0,40,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,3,257302,7,12,2,10,3,1,2,0,0,38,1,1
32557,40,3,154374,2,9,2,9,2,1,1,0,0,40,1,2
32558,58,3,151910,2,9,7,1,5,1,2,0,0,40,1,1
32559,22,3,201490,2,9,1,1,4,1,1,0,0,20,1,1


In [10]:
df_test = pd.read_csv(
    "../data/income/adult.test",
    header=None,
    names=column_names,
    sep=", ",
    engine="python",
)
df_test.isnull().sum()
df_test.dropna(inplace=True)

In [11]:
for column in df_test.columns:
    df_test.drop(df_test[df_test[column] == "?"].index, inplace=True)

for column in df_test.columns:
    if df_test[column].dtype == "object":
        df_test[column] = map_data(df_test[column])
df_test.astype(int)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
1,1,1,226802,1,7,1,1,1,1,1,0,0,40,1,1
2,2,1,89814,2,9,2,2,2,2,1,0,0,50,1,1
3,3,2,336951,3,12,2,3,2,2,1,0,0,40,1,2
4,4,1,160323,4,10,2,1,2,1,1,7688,0,40,1,2
6,5,1,198693,5,6,1,4,3,2,1,0,0,30,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,30,1,245211,8,13,1,5,1,2,1,0,0,40,1,1
16277,25,1,215419,8,13,5,5,3,2,2,0,0,36,1,1
16279,2,1,374983,8,13,2,5,2,2,1,0,0,50,1,1
16280,4,1,83891,8,13,5,7,1,5,1,5455,0,40,1,1


In [12]:
X_train = df.drop("income", axis=1)
y_train = df["income"]

X_test = df_test.drop("income", axis=1)
y_test = df_test["income"]

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import joblib

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


svm = SVC(kernel="rbf", C=1.0, gamma="scale", probability=True, random_state=42)


if len(y_train.shape) > 1:
    y_train = y_train.flatten()
if len(y_test.shape) > 1:
    y_test = y_test.flatten()

svm.fit(X_train_scaled, y_train)
y_pred = svm.predict(X_test_scaled)


accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


probabilities = svm.predict_proba(X_test_scaled)


joblib.dump(svm, "../exported_models/svm/svm_income_model.pkl")
joblib.dump(scaler, "../exported_models/svm/svm_income_scaler.pkl")