<a href="https://colab.research.google.com/github/Manith-Ratnayake/Machine_Learning/blob/main/Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
file_paths = [
    '/content/drive/My Drive/Machine Learning/old.adult.names',
    '/content/drive/My Drive/Machine Learning/Index',
    '/content/drive/My Drive/Machine Learning/adult.test',
    '/content/drive/My Drive/Machine Learning/adult.names',
    '/content/drive/My Drive/Machine Learning/adult.data'
]

for file_path in file_paths:
    with open(file_path, 'r') as file:
        content = file.read()
        print(f"Content of {file_path}: successful")


Content of /content/drive/My Drive/Machine Learning/old.adult.names: successful
Content of /content/drive/My Drive/Machine Learning/Index: successful
Content of /content/drive/My Drive/Machine Learning/adult.test: successful
Content of /content/drive/My Drive/Machine Learning/adult.names: successful
Content of /content/drive/My Drive/Machine Learning/adult.data: successful


### File Reading

In [25]:
import pandas as pd

# Replace 'adult.data' with the actual path to your file if it's not in the working directory
file_path = '/content/drive/My Drive/Machine Learning/adult.data'
file_data = pd.read_csv(file_path, delimiter=',', header=None)  # Adjust delimiter as necessary

# Since the file may not contain headers, you might need to manually specify them
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                'hours-per-week', 'native-country', 'income']
file_data.columns = column_names

In [26]:
# Checking the first few rows
print(file_data.head())

# Summarize statistics for numerical features
print(file_data.describe())

# Distribution of the target variable
print(file_data['income'].value_counts())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [27]:
# Replace '?' with NaN and then check for missing values
import numpy as np

file_data = file_data.replace('?', np.nan)

# Checking for missing values
print(file_data.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [29]:
from sklearn.preprocessing import OneHotEncoder

# Assuming 'file_data' is your DataFrame and 'income' is the target variable
categorical_cols = file_data.columns[file_data.dtypes==object].tolist()
categorical_cols.remove('income')

# Apply one-hot encoding with the updated parameter
# Use 'sparse_output=False' if you're using scikit-learn 1.2 or newer.
# This will ensure your code is forward-compatible.
one_hot_encoder = OneHotEncoder(sparse_output=False)
categorical_data_encoded = one_hot_encoder.fit_transform(file_data[categorical_cols])

# Convert to DataFrame and add column names
encoded_columns = one_hot_encoder.get_feature_names_out(categorical_cols)
categorical_data_encoded_df = pd.DataFrame(categorical_data_encoded, columns=encoded_columns)

# Drop original categorical columns and concat encoded columns
file_data = file_data.drop(categorical_cols, axis=1).reset_index(drop=True)
file_data_encoded = pd.concat([file_data, categorical_data_encoded_df], axis=1)


In [30]:
from sklearn.model_selection import train_test_split

X = file_data_encoded.drop('income', axis=1)
y = file_data_encoded['income'].apply(lambda x: 1 if x == '>50K' else 0)  # Encoding the target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

naive_base_model = GaussianNB()
naive_base_model.fit(X_train, y_train)
naive_base_predictions = naive_base_model.predict(X_test)

print(f'Naïve Bayes Accuracy: {accuracy_score(y_test, naive_base_predictions)}')
print(classification_report(y_test, naive_base_predictions))

Naïve Bayes Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6513

    accuracy                           1.00      6513
   macro avg       1.00      1.00      1.00      6513
weighted avg       1.00      1.00      1.00      6513



In [32]:
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
random_forest_model_predictions = random_forest_model.predict(X_test)

print(f'Random Forest Accuracy: {accuracy_score(y_test, random_forest_model_predictions)}')
print(classification_report(y_test, random_forest_model_predictions))

Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6513

    accuracy                           1.00      6513
   macro avg       1.00      1.00      1.00      6513
weighted avg       1.00      1.00      1.00      6513



### Evaluation Metrics

In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Evaluation for Naïve Bayes
naive_bayes_accuracy = accuracy_score(y_test, naive_base_predictions)
naive_bayes_precision = precision_score(y_test, naive_base_predictions)
naive_bayes_recall = recall_score(y_test, naive_base_predictions)
naive_bayes_f1 = f1_score(y_test, naive_base_predictions)
# naive_bayes_auc = roc_auc_score(y_test, naive_base_predictions)

print("Naïve Bayes model Performance \n")
print(f"Accuracy: {naive_bayes_accuracy:.4f}")
print(f"Precision: {naive_bayes_precision:.4f}")
print(f"Recall: {naive_bayes_recall:.4f}")
print(f"F1 Score: {naive_bayes_f1:.4f}")
# print(f"AUC: {naive_bayes_auc:.4f}\n")

Naïve Bayes Performance 

Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Evaluation for Random Forest
random_forest_accuracy = accuracy_score(y_test, random_forest_model_predictions)
random_forest_precision = precision_score(y_test, random_forest_model_predictions)
random_forest_recall = recall_score(y_test, random_forest_model_predictions)
random_forest_f1 = f1_score(y_test, random_forest_model_predictions)
# random_forest_auc = roc_auc_score(y_test, random_forest_model_predictions)

print("Random Forest model Performance \n")
print(f"Accuracy: {random_forest_accuracy:.4f}")
print(f"Precision: {random_forest_precision:.4f}")
print(f"Recall: {random_forest_recall:.4f}")
print(f"F1 Score: {random_forest_f1:.4f}")
# print(f"AUC: {random_forest_auc:.4f}")

Random Forest model Performance 

Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
