In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/future warnings raised
# by the libraries used below, so API breakage can go unnoticed until an upgrade.
warnings.filterwarnings("ignore")


In [None]:
# Read the CSV file and preview its first 20 rows with head(20).

adult_df = pd.read_csv("/kaggle/input/adult-census-income/adult.csv")
adult_df.head(20)

In [None]:
# Dimensions as (rows, columns). The author recorded 32561 rows and 15 columns;
# one of those columns is the 'income' target, the rest are features.
adult_df.shape

In [None]:
# Show the statistical summary for this dataset.
adult_df.describe()

In [None]:
# Treat the literal '?' placeholder as a missing value so that pandas' null-aware
# tools (isnull, fillna) pick it up.
# `np.nan` is used directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0, where `pd.np.nan` raises AttributeError.
adult_df = adult_df.replace('?', np.nan)
# Count the NaN values in each column.
adult_df.isnull().sum()

In [None]:
# Preview the data type of each column.
adult_df.dtypes

In [None]:
# One count plot per categorical column: 'sex' on the x-axis, bars split by the
# column named in each entry. The order matches the 2x4 grid read row by row.
hue_columns = [
    "workclass", "education", "marital.status", "occupation",
    "relationship", "race", "native.country", "income",
]

fig, ax = plt.subplots(2, 4, figsize=(30, 15))

# ax.flat walks the grid left-to-right, top-to-bottom, so each column lands in
# the same panel as an explicit ax[row, col] lookup would give it.
for panel, hue_column in zip(ax.flat, hue_columns):
    sns.countplot(x="sex", hue=hue_column, data=adult_df, ax=panel)

# Add space between the subplots
fig.subplots_adjust(wspace=0.5, hspace=1.2)

# Rebuild every panel's legend from its own handles and pin it at the right edge.
for panel in ax.flat:
    handles, labels = panel.get_legend_handles_labels()
    panel.legend(handles, labels, loc='center right', bbox_to_anchor=(1.3, 0.7), ncol=1)

plt.show()


In [None]:
# List the distinct values found in the 'workclass' column.
adult_df["workclass"].unique()

In [None]:
# Convert non-numeric values.
# Integer code assigned to each category, keyed by column name. The codes are
# arbitrary labels; their numeric order carries no meaning.
CATEGORY_CODES = {
    "sex": {'Male': 1, 'Female': 0},
    "workclass": {
        'Private': 0, 'State-gov': 1, 'Federal-gov': 2, 'Self-emp-not-inc': 3,
        'Self-emp-inc': 4, 'Local-gov': 5, 'Without-pay': 6, 'Never-worked': 7,
    },
    "education": {
        'HS-grad': 0, '7th-8th': 1, 'Some-college': 2, '10th': 3,
        'Doctorate': 4, 'Prof-school': 5, 'Bachelors': 6, 'Masters': 7,
        '11th': 8, 'Assoc-voc': 9, '1st-4th': 10, '5th-6th': 11,
        'Assoc-acdm': 12, '12th': 13, '9th': 14, 'Preschool': 15,
    },
    "marital.status": {
        'Widowed': 0, 'Divorced': 1, 'Separated': 2, 'Never-married': 3,
        'Married-civ-spouse': 4, 'Married-spouse-absent': 5, 'Married-AF-spouse': 6,
    },
    "occupation": {
        'Exec-managerial': 0, 'Machine-op-inspct': 1, 'Prof-specialty': 2,
        'Other-service': 3, 'Adm-clerical': 4, 'Transport-moving': 5, 'Sales': 6,
        'Craft-repair': 7, 'Farming-fishing': 8, 'Tech-support': 9,
        'Protective-serv': 10, 'Handlers-cleaners': 11, 'Armed-Forces': 12,
        'Priv-house-serv': 13,
    },
    "relationship": {
        'Not-in-family': 0, 'Unmarried': 1, 'Own-child': 2, 'Other-relative': 3,
        'Husband': 4, 'Wife': 5,
    },
    "native.country": {
        'United-States': 0, 'Mexico': 1, 'Greece': 2, 'Vietnam': 3, 'China': 4,
        'Taiwan': 5, 'India': 6, 'Philippines': 7, 'Trinadad&Tobago': 8,
        'Canada': 9, 'South': 10, 'Holand-Netherlands': 11, 'Puerto-Rico': 12,
        'Poland': 13, 'Iran': 14, 'England': 15, 'Germany': 16, 'Italy': 17,
        'Japan': 18, 'Hong': 19, 'Honduras': 20, 'Cuba': 21, 'Ireland': 22,
        'Cambodia': 23, 'Peru': 24, 'Nicaragua': 25, 'Dominican-Republic': 26,
        'Haiti': 27, 'Hungary': 28, 'Columbia': 29, 'Guatemala': 30,
        'El-Salvador': 31, 'Jamaica': 32, 'Ecuador': 33, 'France': 34,
        'Yugoslavia': 35, 'Portugal': 36, 'Laos': 37, 'Thailand': 38,
        'Outlying-US(Guam-USVI-etc)': 39, 'Scotland': 40,
    },
    "income": {'<=50K': 0, '>50K': 1},
    "race": {
        'White': 0, 'Black': 1, 'Asian-Pac-Islander': 2, 'Other': 3,
        'Amer-Indian-Eskimo': 4,
    },
}

for column, codes in CATEGORY_CODES.items():
    # Assign the encoded column back to the frame instead of calling
    # replace(inplace=True) on the `adult_df[column]` selection: that chained
    # in-place form acts on an intermediate object and does not update the
    # frame once pandas' Copy-on-Write behaviour is active.
    # pd.to_numeric pins a numeric dtype no matter how replace() infers the
    # result dtype (NaN is kept, giving a float column), and raises if a
    # category is missing from the mapping instead of leaving stray strings.
    adult_df[column] = pd.to_numeric(adult_df[column].replace(codes))

adult_df.head()

In [None]:
# Count the NaN values in each column after the categorical encoding.
adult_df.isnull().sum()

In [None]:
# Replace null values with the most frequent value (mode) of the same column.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the `adult_df[...]` selection: that chained in-place form acts on an
# intermediate object and does not update the frame once pandas' Copy-on-Write
# behaviour is active, which would leave the NaNs in place.
for column in ("workclass", "occupation", "native.country"):
    most_frequent = adult_df[column].mode()[0]
    adult_df[column] = adult_df[column].fillna(most_frequent)

In [None]:
# Re-check the NaN count per column after the mode imputation.
adult_df.isnull().sum()

In [None]:
# Generate a correlation matrix across the columns of adult_df and draw it as a
# heatmap; annot=True writes each coefficient inside its cell.
corr_matrix = adult_df.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Separate the target from the predictors: Y is the 'income' column and X is
# every other column of adult_df.

Y = adult_df['income']
X = adult_df.drop(columns='income')
X.head()

In [None]:
# Hold out 10% of the rows as the test set with train_test_split(). The split is
# stratified on Y and uses a fixed random_state so it is repeatable.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=2,
)
# Shapes of the full, training and test feature sets.
print(X.shape, X_train.shape, X_test.shape)

In [None]:
# Re-inspect the column data types before fitting the model.
adult_df.dtypes

In [None]:
# Create an SVM model with a linear kernel.
# The features are standardized first because SVMs are not scale invariant:
# here integer category codes sit next to raw numeric columns, and fitting a
# linear-kernel SVC on unscaled inputs trains very slowly and lets the
# largest-valued columns dominate the margin.
# The scaler lives in the same pipeline as the classifier, so it is fitted on the
# training data only and re-applied automatically by model.predict()/score().
# NOTE(review): `model` is now a Pipeline; reach the fitted SVC via model[-1] if
# its attributes (e.g. coef_) are needed.

model = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
model.fit(X_train, Y_train)