# Importing the libraries

In [296]:

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,confusion_matrix

from sklearn.naive_bayes import GaussianNB


In [297]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 

# Reading The Dataset

In [298]:
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

# Info of the Dataset

In [299]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


# Replace commas to make two classes not four

In [300]:
y['income']=[i.replace('.','') for i in y['income']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income']=[i.replace('.','') for i in y['income']]


# Apply one-hot encoding to categorical variables

In [301]:

categorical_cols = X.select_dtypes(include=['object']).columns
inputs = pd.get_dummies(X[categorical_cols])


In [302]:
X=pd.concat([X,inputs],axis=1)
print(X)


       age         workclass  fnlwgt  education  education-num  \
0       39         State-gov   77516  Bachelors             13   
1       50  Self-emp-not-inc   83311  Bachelors             13   
2       38           Private  215646    HS-grad              9   
3       53           Private  234721       11th              7   
4       28           Private  338409  Bachelors             13   
...    ...               ...     ...        ...            ...   
48837   39           Private  215419  Bachelors             13   
48838   64               NaN  321403    HS-grad              9   
48839   38           Private  374983  Bachelors             13   
48840   44           Private   83891  Bachelors             13   
48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation    relationship  \
0           Never-married       Adm-clerical   Not-in-family   
1      Married-civ-spouse    Exec-managerial         Husband   
2              

# Drop Categorical Columns

In [303]:
X=X.drop(categorical_cols ,axis=1)
data=pd.concat([X,y],axis=1)
data.dropna(inplace=True)
print(data)

       age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0       39   77516             13          2174             0              40   
1       50   83311             13             0             0              13   
2       38  215646              9             0             0              40   
3       53  234721              7             0             0              40   
4       28  338409             13             0             0              40   
...    ...     ...            ...           ...           ...             ...   
48837   39  215419             13             0             0              36   
48838   64  321403              9             0             0              40   
48839   38  374983             13             0             0              50   
48840   44   83891             13          5455             0              40   
48841   35  182148             13             0             0              60   

       workclass_?  workcla

# Split the data into training and testing sets

In [304]:

X_train, X_test, y_train, y_test = train_test_split(data.drop('income',axis=1), data['income'], test_size=0.3, random_state=42)

In [305]:
model=GaussianNB()
model.fit(X_train,y_train)
print(X_test)

       age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
7762    18  423024              9             0             0              20   
23881   17  178953              8             0             0              20   
30507   25  348986              9             0             0              40   
28911   20  218215             10             0             0              30   
19484   47  244025              9             0             0              56   
...    ...     ...            ...           ...           ...             ...   
15938   53  158284              9             0             0              70   
27828   37  286146              9             0             0              40   
28449   61  227468              9             0             0              40   
5647    30  164190              9             0             0              38   
27058   30   48542              8             0             0              40   

       workclass_?  workcla

In [306]:
y_predict=model.predict(X_test)
print(y_predict)

['<=50K' '<=50K' '<=50K' ... '<=50K' '<=50K' '<=50K']


# accuracy score

In [307]:
print(accuracy_score(y_test,y_predict))


0.7956732409745445


# Confusion Matrix

In [308]:

conf_mat = confusion_matrix(y_test,y_predict)
print("Confusion matrix:",conf_mat)

tn, fp, fn, tp = conf_mat.ravel()

senstivity = tp / (tp+fn)
specifity = tn / (tn+fp)

print("Senstivity: ", senstivity)
print("Specifity: ", specifity)

Confusion matrix: [[10575   534]
 [ 2460  1084]]
Senstivity:  0.3058690744920993
Specifity:  0.9519308668647043


# prosterior prob of all data

In [309]:
prosterior_prob=model.predict_proba(data.drop('income',axis=1))
print(prosterior_prob[:,1])

[0.12458152 0.0042687  0.00940912 ... 0.02299824 0.99998551 0.02830668]


# prosterior prob of test data

In [310]:
prosterior_prob=model.predict_proba(X_test)
print(prosterior_prob[:,1])

[0.0003614  0.00033922 0.00311577 ... 0.01368053 0.00481139 0.00488423]
