## Import necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

## Load the dataset

In [2]:
# Assuming 'train.csv' and 'test.csv' are the file names for your training and testing datasets
train_data = pd.read_csv('SalaryData_Train.csv')
test_data = pd.read_csv('SalaryData_test.csv')
print(train_data)

       age          workclass    education  educationno        maritalstatus  \
0       39          State-gov    Bachelors           13        Never-married   
1       50   Self-emp-not-inc    Bachelors           13   Married-civ-spouse   
2       38            Private      HS-grad            9             Divorced   
3       53            Private         11th            7   Married-civ-spouse   
4       28            Private    Bachelors           13   Married-civ-spouse   
...    ...                ...          ...          ...                  ...   
30156   27            Private   Assoc-acdm           12   Married-civ-spouse   
30157   40            Private      HS-grad            9   Married-civ-spouse   
30158   58            Private      HS-grad            9              Widowed   
30159   22            Private      HS-grad            9        Never-married   
30160   52       Self-emp-inc      HS-grad            9   Married-civ-spouse   

               occupation    relationsh

## remove unwanted columns and continous data features from dataset

In [3]:
train_data = train_data.drop(columns = ['age','educationno','capitalgain','capitalloss','hoursperweek'])
test_data = test_data.drop(columns = ['age','educationno','capitalgain','capitalloss','hoursperweek'])

## Split data for training and testing

In [4]:
# Extract features (X) and target variable (y) from the training data
X_train = train_data.drop('Salary', axis=1)
y_train = train_data['Salary']

# Extract features (X) and target variable (y) from the testing data
X_test = test_data.drop('Salary', axis=1)
y_test = test_data['Salary']

## using DictVectorizer

In [5]:
# Convert categorical features to numerical using DictVectorizer
vectorizer = DictVectorizer(sparse=False)
X_train_encoded = vectorizer.fit_transform(X_train.to_dict(orient='records'))
X_test_encoded = vectorizer.transform(X_test.to_dict(orient='records'))

# Initialize and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_encoded, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test_encoded)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the evaluation metrics
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 0.7940239043824702
Classification Report:
               precision    recall  f1-score   support

       <=50K       0.91      0.81      0.86     11360
        >50K       0.56      0.74      0.64      3700

    accuracy                           0.79     15060
   macro avg       0.73      0.78      0.75     15060
weighted avg       0.82      0.79      0.80     15060



## using FeatureHasher

In [6]:
from sklearn.feature_extraction import FeatureHasher

# Convert categorical features to numerical using FeatureHasher
hasher = FeatureHasher(n_features=9, input_type='dict')
X_train_encoded = hasher.transform(X_train.to_dict(orient='records'))
X_test_encoded = hasher.transform(X_test.to_dict(orient='records'))

# Initialize and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(abs(X_train_encoded.toarray()), y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(abs(X_test_encoded.toarray()))

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the evaluation metrics
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)


Accuracy: 0.7737715803452855
Classification Report:
               precision    recall  f1-score   support

       <=50K       0.79      0.95      0.86     11360
        >50K       0.61      0.22      0.33      3700

    accuracy                           0.77     15060
   macro avg       0.70      0.59      0.59     15060
weighted avg       0.75      0.77      0.73     15060

