# In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Shared preprocessing objects and the classifier used by the rest of the script.
le = LabelEncoder()    # integer-encodes one categorical column at a time
ss = StandardScaler()  # standardises the feature matrix
# Seed the forest so the reported metrics are reproducible from run to run;
# the train/test split in this script is already seeded with the same value.
rfc = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42)

# Load the dataset and give every column a descriptive name.
# NOTE(review): read_csv treats the first line of the file as a header by
# default, and that header is then overwritten below. If adult_data.csv has
# no header row, the first record is silently lost -- confirm, and if so pass
# header=None / names=... to read_csv instead.
df = pd.read_csv('adult_data.csv')
df.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]

# Cells holding the literal string ' ?' are treated as missing values.
# A single vectorised replace avoids the per-cell Python loop and the chained
# assignment (df[col][row] = ...), which raises SettingWithCopyWarning and
# has no effect at all when pandas Copy-on-Write is enabled.
df = df.replace(' ?', np.nan)

# Drop every row that has at least one missing value, then renumber the rows.
df.dropna(axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
# Integer-encode every categorical column, including the 'salary' target.
# The single LabelEncoder is refitted per column, so after the loop `le`
# only remembers the classes of the last column ('salary').
cat_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'salary',
]
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Separate the features from the target.
features = df.drop('salary', axis=1)
y = df['salary']

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# test rows' mean/variance into the training pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Keep `x` available as the scaled full feature matrix (a NumPy array, as
# before), now expressed with the training-set statistics.
x = ss.transform(features)

# Fit the classifier and predict the held-out rows.
model = rfc.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Scalar metrics: each is called as metric(y_test, y_pred) and printed with
# its label, in the order listed.
scalar_metrics = (
    ('\nAccuracy score = ', accuracy_score),
    ('\nPrecision score = ', precision_score),
    ('\nRecall score = ', recall_score),
    ('\nf1 score = ', f1_score),
)
for label, metric in scalar_metrics:
    print(label, metric(y_test, y_pred))

# The confusion matrix is computed once and reused for the text output and
# the heatmap.
cm = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n\n', cm)
print('\nClassification report:\n\n', classification_report(y_test, y_pred))

# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='magma')
plt.show()