<a href="https://colab.research.google.com/github/Minyyyyyyyy/Minyyyyyyyy/blob/main/MLCW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# load dataset

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# `names=` is passed without `header=`, so the first line of the file is read
# as data and these labels become the column index.
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
           "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

# skipinitialspace=True strips the blank that follows each comma, so fields
# reach the NA comparison as "?" -- the marker must be "?" (not " ?"), otherwise
# no value is ever recognised as missing and the later dropna() removes nothing.
dataset = pd.read_csv(url, names=columns, na_values="?", skipinitialspace=True)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appends a stray "None" line to the output.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [None]:
# cleaning

# Rows whose absolute z-score on `age` reaches this value are treated as outliers.
AGE_Z_THRESHOLD = 3

# missing values, then duplicate entries (rows identical in every column).
# Rebinding instead of inplace=True keeps each step an explicit new frame.
dataset = dataset.dropna().drop_duplicates()

# converting to lowercase (object/text columns only; other columns pass through)
dataset = dataset.apply(lambda col: col.str.lower() if col.dtype == "object" else col)

# Remove outliers in 'age' feature.
# NOTE: z-scores are computed from the frame as it is now, so re-running this
# cell on an already-filtered `dataset` can drop additional rows.
age_z = np.abs(stats.zscore(dataset['age']))
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning on a filtered slice.
dataset = dataset[age_z < AGE_Z_THRESHOLD].copy()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than through print().
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32417 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32417 non-null  int64 
 1   workclass       32417 non-null  object
 2   fnlwgt          32417 non-null  int64 
 3   education       32417 non-null  object
 4   education-num   32417 non-null  int64 
 5   marital-status  32417 non-null  object
 6   occupation      32417 non-null  object
 7   relationship    32417 non-null  object
 8   race            32417 non-null  object
 9   sex             32417 non-null  object
 10  capital-gain    32417 non-null  int64 
 11  capital-loss    32417 non-null  int64 
 12  hours-per-week  32417 non-null  int64 
 13  native-country  32417 non-null  object
 14  income          32417 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB
None


In [None]:
# feature engineering

# Bin edges and labels for `age_group`. pd.cut uses right-inclusive intervals
# by default, i.e. (0, 25], (25, 40], (40, 60], (60, 100]; ages outside that
# range would come out as NaN.
AGE_BINS = [0, 25, 40, 60, 100]
AGE_LABELS = ['young', 'adult', 'middle-aged', 'elderly']

# .assign() returns a new frame instead of writing columns onto a frame that
# was produced by boolean filtering, and gives the same result on re-run.
dataset = dataset.assign(
    # Despite the name this is the *net* capital: gain minus loss.
    total_capital=dataset['capital-gain'] - dataset['capital-loss'],
    age_group=pd.cut(dataset['age'], bins=AGE_BINS, labels=AGE_LABELS),
)

# Bare last expression so the notebook renders the preview as a table.
dataset.head()

   age  workclass  fnlwgt  education  education-num  marital-status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   
2           6             1     4    1             0             0   
3           6             0     2    1             0             0   
4          10             5     2    0             0             0   

   hours-per-week  native-country  income  total_capital    age_group  
0              40              39       0           2174        adult  
1       

In [None]:
# test train split

# Separate the label column from the predictor columns.
y = dataset['income']
X = dataset.drop(columns='income')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Preview the first rows of each feature partition.
for heading, preview in (("Training set:", X_train.head()), ("\nTest set:", X_test.head())):
    print(heading)
    print(preview)

Training set:
       age  workclass  fnlwgt  education  education-num  marital-status  \
5063    44          4  111067         15             10               2   
26714   27          1  257124          9             13               4   
13711   56          0  154537         15             10               0   
30573   34          4  112564         11              9               2   
6454    40          4  104719         15             10               2   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
5063            3             0     4    1             0             0   
26714          14             2     4    1             0             0   
13711           0             4     4    0             0             0   
30573          14             0     4    1             0             0   
6454            6             0     4    1             0             0   

       hours-per-week  native-country  
5063               40              39  
26714     