<a href="https://colab.research.google.com/github/MpRonald/Machine-Learning/blob/main/Attributes_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
census = pd.read_csv('https://raw.githubusercontent.com/MpRonald/datasets/main/census.csv')
census.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
col = census.columns[:-1]
col

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

In [5]:
X_census = census.iloc[:,0:14].values
y_census = census.iloc[:,14].values

In [6]:
# label encoder
lb = LabelEncoder()
lb_work = lb
lb_education = lb
lb_marital = lb
lb_occupation = lb
lb_relationship = lb
lb_race = lb
lb_sex = lb
lb_country = lb

In [7]:
X_census[:,1] = lb_work.fit_transform(X_census[:,1])
X_census[:,3] = lb_education.fit_transform(X_census[:,3])
X_census[:,5] = lb_marital.fit_transform(X_census[:,5])
X_census[:,6] = lb_occupation.fit_transform(X_census[:,6])
X_census[:,7] = lb_relationship.fit_transform(X_census[:,7])
X_census[:,8] = lb_race.fit_transform(X_census[:,8])
X_census[:,9] = lb_sex.fit_transform(X_census[:,9])
X_census[:,13] = lb_country.fit_transform(X_census[:,13])

In [8]:
scaler = MinMaxScaler()
X_census_scaler = scaler.fit_transform(X_census)

# Low Variance

In [9]:
for i in range(X_census.shape[1]):
    print(X_census_scaler[:,i].var())

0.034913808595952486
0.03312115190663569
0.005138537590667898
0.06657103564450892
0.029416385024073417
0.06301761677301636
0.09123816653931152
0.10326534394406342
0.04502805169292987
0.22136950173699113
0.00545419549240862
0.008557270623428908
0.015874043397822807
0.03641266114220053


In [10]:
# bellow we're looking for variance among variables
selection = VarianceThreshold(threshold=0.05)
X_var = selection.fit_transform(X_census_scaler)
X_var.shape

(32561, 5)

In [11]:
X_var

array([[0.6       , 0.66666667, 0.07142857, 0.2       , 1.        ],
       [0.6       , 0.33333333, 0.28571429, 0.        , 1.        ],
       [0.73333333, 0.        , 0.42857143, 0.2       , 1.        ],
       ...,
       [0.73333333, 1.        , 0.07142857, 0.8       , 0.        ],
       [0.73333333, 0.66666667, 0.07142857, 0.6       , 1.        ],
       [0.73333333, 0.33333333, 0.28571429, 1.        , 0.        ]])

In [12]:
# this ceel make the same thing that other cells above
selection.variances_

array([0.03491381, 0.03312115, 0.00513854, 0.06657104, 0.02941639,
       0.06301762, 0.09123817, 0.10326534, 0.04502805, 0.2213695 ,
       0.0054542 , 0.00855727, 0.01587404, 0.03641266])

In [13]:
idx = np.where(selection.variances_ > 0.05)
idx

(array([3, 5, 6, 7, 9]),)

In [14]:
# here we see the columns more important to model
col[idx]

Index(['education', 'marital-status', 'occupation', 'relationship', 'sex'], dtype='object')

In [15]:
# next, we'll go train a new model only this volumns above
census_var = census[['education', 'marital-status', 'occupation', 'relationship', 'sex', 'income']]
census_var.head()

Unnamed: 0,education,marital-status,occupation,relationship,sex,income
0,Bachelors,Never-married,Adm-clerical,Not-in-family,Male,<=50K
1,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Male,<=50K
2,HS-grad,Divorced,Handlers-cleaners,Not-in-family,Male,<=50K
3,11th,Married-civ-spouse,Handlers-cleaners,Husband,Male,<=50K
4,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Female,<=50K


In [16]:
type(census_var)

pandas.core.frame.DataFrame

In [17]:
X = census_var.iloc[:,0:5].values
y = census_var.iloc[:,5].values

In [18]:
# label encoder
lb = LabelEncoder()
lb_education = lb
lb_marital = lb
lb_occupation = lb
lb_relationship = lb
lb_sex = lb

In [19]:
X[:,0] = lb_education.fit_transform(X[:,0])
X[:,1] = lb_marital.fit_transform(X[:,1])
X[:,2] = lb_occupation.fit_transform(X[:,2])
X[:,3] = lb_relationship.fit_transform(X[:,3])
X[:,4] = lb_sex.fit_transform(X[:,4])

In [20]:
X

array([[9, 4, 1, 1, 1],
       [9, 2, 4, 0, 1],
       [11, 0, 6, 1, 1],
       ...,
       [11, 6, 1, 4, 0],
       [11, 4, 1, 3, 1],
       [11, 2, 4, 5, 0]], dtype=object)

In [21]:
# one hot encoder
one_hot = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), 
                                          [0,1,2,3,4])], 
                                          remainder='passthrough')

In [22]:
X = one_hot.fit_transform(X).toarray()
X

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [23]:
scaler = MinMaxScaler()
X_scaler = scaler.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=123)

In [25]:
X_train.shape, y_train.shape

((26048, 46), (26048,))

In [26]:
X_test.shape, y_test.shape

((6513, 46), (6513,))

In [27]:
random_var = RandomForestClassifier(criterion='entropy', min_samples_split=5, n_estimators=100)
random_var.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', min_samples_split=5)

In [28]:
y_pred = random_var.predict(X_test)
accuracy_score(y_test, y_pred)

0.8198986642100414

# Extra Tree Classifier

In [29]:
select = ExtraTreesClassifier().fit(X_census_scaler, y_census)

In [30]:
# bellow we're checking the more important attribute 
importance = select.feature_importances_
print(importance)

[0.15440021 0.04505816 0.16413943 0.03674252 0.08850698 0.0749592
 0.0765165  0.08817046 0.01490339 0.02710244 0.09117964 0.0282735
 0.09297083 0.01707675]


In [31]:
index = []
for i in range(len(importance)):
    if importance[i] >= 0.05:
        index.append(i)

In [32]:
index

[0, 2, 4, 5, 6, 7, 10, 12]

In [33]:
col[index]

Index(['age', 'final-weight', 'education-num', 'marital-status', 'occupation',
       'relationship', 'capital-gain', 'hour-per-week'],
      dtype='object')

In [34]:
X_census_extra = X_census[:, index]

In [35]:
# one hot encoder
one_hot = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), 
                                          [5,6,7])], 
                                          remainder='passthrough')

In [36]:
X = one_hot.fit_transform(X_census_extra).toarray()
X

array([[ 0.,  1.,  0., ..., 13.,  4.,  1.],
       [ 1.,  0.,  0., ..., 13.,  2.,  4.],
       [ 0.,  1.,  0., ...,  9.,  0.,  6.],
       ...,
       [ 0.,  0.,  0., ...,  9.,  6.,  1.],
       [ 0.,  0.,  0., ...,  9.,  4.,  1.],
       [ 0.,  0.,  0., ...,  9.,  2.,  4.]])

In [37]:
X.shape

(32561, 224)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [39]:
X_train.shape, y_train.shape

((26048, 224), (26048,))

In [40]:
random_var = RandomForestClassifier(criterion='entropy', min_samples_split=5, n_estimators=100)
random_var.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', min_samples_split=5)

In [41]:
y_pred = random_var.predict(X_test)
accuracy_score(y_test, y_pred)

0.8518347919545525