# Importing the libraries

In [648]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import CategoricalNB
from ucimlrepo import fetch_ucirepo

In [649]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 

# Reading The Dataset

In [650]:
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
dataset=X.copy()
dataset['income']=y['income']
# metadata 
print(adult.metadata) 

# variable information 
print(adult.variables) 


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

# Info of the Dataset

In [651]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


## Head of the dataset

In [652]:
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Replace commas to make two classes not four

In [653]:
dataset['income']=[i.replace('.','') for i in dataset['income']]

## Show duplicated rows 

In [654]:
dataset.duplicated().sum()

48

**# Drop duplicates #**

In [655]:
dataset.drop_duplicates(inplace=True)

In [656]:
dataset.duplicated().sum()

0

# Apply one-hot encoding to categorical variables

In [657]:
print(dataset.isnull().sum().to_string())

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0


**#  Identify columns with missing values for different data types #**

In [658]:
num_cols = dataset.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = dataset.select_dtypes(include=['object']).columns
categorical_cols=[i for i in categorical_cols if i != 'income']

**# Drop Na values #** 

In [659]:
dataset.dropna(inplace=True)
dataset=dataset.reset_index()
print(dataset.isnull().sum().to_string())

index             0
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0


**# Apply one hot coding #**

In [660]:
inputs = pd.get_dummies(dataset[categorical_cols])
print(inputs)

       workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0            False                  False                False   
1            False                  False                False   
2            False                  False                False   
3            False                  False                False   
4            False                  False                False   
...            ...                    ...                  ...   
47568        False                  False                False   
47569        False                  False                False   
47570        False                  False                False   
47571        False                  False                False   
47572        False                  False                False   

       workclass_Never-worked  workclass_Private  workclass_Self-emp-inc  \
0                       False              False                   False   
1                       False              False       

**# Concatination of data #** 

In [661]:
dataset=pd.concat([dataset,inputs],axis=1)
print(dataset)

       index  age         workclass  fnlwgt  education  education-num  \
0          0   39         State-gov   77516  Bachelors             13   
1          1   50  Self-emp-not-inc   83311  Bachelors             13   
2          2   38           Private  215646    HS-grad              9   
3          3   53           Private  234721       11th              7   
4          4   28           Private  338409  Bachelors             13   
...      ...  ...               ...     ...        ...            ...   
47568  48836   33           Private  245211  Bachelors             13   
47569  48837   39           Private  215419  Bachelors             13   
47570  48839   38           Private  374983  Bachelors             13   
47571  48840   44           Private   83891  Bachelors             13   
47572  48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation   relationship  \
0           Never-married       Adm-clerical  Not-in-family 

# Drop Categorical Columns

In [662]:
dataset=dataset.drop(categorical_cols ,axis=1)
dataset.drop('index',axis=1,inplace=True)
dataset.dropna(inplace=True)
print(dataset)


       age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0       39   77516             13          2174             0              40   
1       50   83311             13             0             0              13   
2       38  215646              9             0             0              40   
3       53  234721              7             0             0              40   
4       28  338409             13             0             0              40   
...    ...     ...            ...           ...           ...             ...   
47568   33  245211             13             0             0              40   
47569   39  215419             13             0             0              36   
47570   38  374983             13             0             0              50   
47571   44   83891             13          5455             0              40   
47572   35  182148             13             0             0              60   

      income  workclass_?  

# Split the data into training and testing sets

In [663]:

X_train, X_test, y_train, y_test = train_test_split(dataset.drop('income',axis=1), dataset['income'], test_size=0.3, random_state=16)

**# Our Model Categorical Naive Bayes #**

In [664]:
model=CategoricalNB()
model.fit(X_train,y_train)
model.score(X_train,y_train)

0.865649680189784

In [665]:
y_predict=model.predict(X_test)
print(y_predict)

['>50K' '<=50K' '>50K' ... '<=50K' '>50K' '<=50K']


# accuracy score

In [666]:
print(accuracy_score(y_test,y_predict))

0.8346412556053812


# Confusion Matrix

In [667]:

conf_mat = confusion_matrix(y_test,y_predict)
print("Confusion matrix:",conf_mat)

tn, fp, fn, tp = conf_mat.ravel()

senstivity = tp / (tp+fn)
specifity = tn / (tn+fp)

print("Senstivity: ", senstivity)
print("Specifity: ", specifity)

Confusion matrix: [[9335 1402]
 [ 958 2577]]
Senstivity:  0.728995756718529
Specifity:  0.8694234888702617


# prosterior prob of all data

In [668]:
prosterior_prob=model.predict_proba(dataset.drop('income',axis=1))
print(prosterior_prob[:,1])

[2.19961668e-04 9.93890237e-01 9.52643138e-05 ... 9.88406098e-01
 9.07526400e-05 9.98903827e-01]


# prosterior prob of test data

In [669]:
prosterior_prob=model.predict_proba(X_test)
print(prosterior_prob[:,1])

[8.77374026e-01 1.66193737e-04 9.99921784e-01 ... 5.01416997e-07
 9.99981450e-01 1.49405881e-01]
