# Classification

#### Loading and inspecting the data

In [1]:
import pandas as pd
from sklearn import preprocessing 

In [2]:
adult = pd.read_csv('adult.csv')
adult = adult.drop(['fnlwgt'], axis=1)
adult.info() # inspecting the data
adult.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   education       48842 non-null  object
 3   education-num   48842 non-null  int64 
 4   marital-status  48842 non-null  object
 5   occupation      46033 non-null  object
 6   relationship    48842 non-null  object
 7   race            48842 non-null  object
 8   sex             48842 non-null  object
 9   capitalgain     48842 non-null  int64 
 10  capitalloss     48842 non-null  int64 
 11  hoursperweek    48842 non-null  int64 
 12  native-country  47985 non-null  object
 13  class           48842 non-null  object
dtypes: int64(5), object(9)
memory usage: 5.2+ MB


(48842, 14)

#### 1. Create a table in the report stating the following information about the adult data set: 
(i) number of instances, (ii) number of missing values, (iii) fraction of missing values over all attribute values, (iv) number of instances with missing values and (v) fraction of instances with missing values over all instances.

In [3]:
# i) Number of instances:
adult.shape[0]

48842

In [4]:
# ii) Number of missing values
display(adult.isnull().sum()) #missing values per column
display(adult.isnull().sum().sum()) #total

age                  0
workclass         2799
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capitalgain          0
capitalloss          0
hoursperweek         0
native-country     857
class                0
dtype: int64

6465

In [5]:
# iii) fraction of missing values over all attributes --> Double check!!!
adult.isnull().sum() # null values per attribute
adult.columns # column names
for i in range(len(adult.columns)):
    print(adult.isnull().sum()[i]/adult.shape[0])

0.0
0.05730723557593874
0.0
0.0
0.0
0.05751197739650301
0.0
0.0
0.0
0.0
0.0
0.0
0.017546374022357807
0.0


In [6]:
# iv) number of instances with missing values
adult.shape[0]-adult.dropna().shape[0]

3620

In [7]:
# v) fraction of instances with missing values over all instances
(adult.shape[0]-adult.dropna().shape[0])/adult.shape[0]

0.07411653904426518

#### 2. Convert all 13 attributes into nominal using a Scikit-learn LabelEncoder. Then, print the set of all possible discrete values for each attribute.

In [8]:
# convert all attributes to nominal
le = preprocessing.LabelEncoder()
df = adult.apply(le.fit_transform)
df.head() # inspecting

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,6,9,12,4,0,1,4,1,1,0,2,38,0
1,3,5,9,12,2,3,0,4,1,0,0,0,38,0
2,2,3,11,8,0,5,1,4,1,0,0,2,38,0
3,3,3,1,6,2,5,0,2,1,0,0,2,38,0
4,1,3,9,12,2,9,5,2,0,0,0,2,4,0


In [20]:
# get the list of all possible discrete values for each attribute
column_names = df.columns.values.tolist()
for i in column_names:
    print(i, ':', df[i].unique())

age : [2 3 1 0 4]
workclass : [6 5 3 0 1 8 4 7 2]
education : [ 9 11  1 12  6 15  7  8  5 10 14  4  0  3 13  2]
education-num : [12  8  6 13  4  9 11 10  3 15 14  2  5  1  0  7]
marital-status : [4 2 0 3 5 1 6]
occupation : [ 0  3  5  9  7 11  2 13  4  6 12 14 10  1  8]
relationship : [1 0 5 3 4 2]
race : [4 2 1 0 3]
sex : [1 0]
capitalgain : [1 0 4 2 3]
capitalloss : [0 3 1 2 4]
hoursperweek : [2 0 3 4 1]
native-country : [38  4 22 18 41 25 34 32 15  8  1 10 19 29 21 30  3  0 36  6 24 35 13 31
  5  7  9 12  2 23 40 28 27 33 37 11 26 39 16 20 17 14]
class : [0 1]


#### 3. Ignore any instance with missing value(s) and use Scikit-learn to build a decision tree for classifying an individual to one of the <= 50K and > 50K categories. Compute the error rate of the resulting tree.

In [10]:
adult1 = adult.dropna() #dataframe without missing values 
df1 = adult1.apply(le.fit_transform)
df1.head() # dataframe with attributes as nominal

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,5,9,12,4,0,1,4,1,1,0,2,38,0
1,3,4,9,12,2,3,0,4,1,0,0,0,38,0
2,2,2,11,8,0,5,1,4,1,0,0,2,38,0
3,3,2,1,6,2,5,0,2,1,0,0,2,38,0
4,1,2,9,12,2,9,5,2,0,0,0,2,4,0


In [11]:
# split the data into training and test sets
import sklearn.model_selection as model_select
import sklearn.tree as tree
import sklearn.metrics as metrics
X_train, X_test, y_train, y_test = model_select.train_test_split( df1.iloc[:,:df1.shape[1]-1], df1['class'], random_state=0 )
M_train = len( X_train )
M_test = len( X_test )

In [12]:
# initialise the decision tree
clf = tree.DecisionTreeClassifier( random_state = 0 )

In [13]:
# fit the tree model to the training data
clf.fit( X_train, y_train )

DecisionTreeClassifier(random_state=0)

In [14]:
# predict the labels for the test set
y_hat = clf.predict( X_test )

In [15]:
# Compute the error rate
from sklearn.metrics import zero_one_loss
zero_one_loss(y_test, y_hat)

0.18105430744737305

In [21]:
# alternative way for computing the error rate
1-metrics.accuracy_score( y_test, y_hat )

0.18105430744737305

### 4. The aim of this question is to investigate two basic approaches for handling missing values. Initially, construct a smaller data set D' from the original data set D, containing (i) all instances with at least one missing value and (ii) an equal number of randomly selected instances without missing values. That is, if the number of instances with missing values is u in D, then D' should contain these u instances and additional u instances without any missing values, which are randomly selected from D. Then, using D', construct two modified data sets D'1 and D'2 to handle missing values. In particular:
#### a)construct D'1 by creating a new value "missing" for each attribute and using this value for every missing value in D', 
#### b)construct D'2 by using the most popular value for all missing values of each attribute. 

#### Train two decision trees with these two data sets and compare their error rates using instances from D for testing. Briefly comment on the obtained results.

In [17]:
# instances with at least one missing value
adult_NaN = adult[adult.isnull().any(axis=1)]
# Randomly selected 3620 rows with no missing values
adult_clean = adult.dropna().sample(adult_NaN.shape[0])
D_prime = pd.concat([adult_NaN, adult_clean]) # dataframe D'

In [18]:
## a) Constructing D1' dataset and train the decision tree

# Filling all NaN values with 'missing'
D1_prime = D_prime.fillna('missing')
## Decision tree process for D1_prime ('dataset filled with "missing" when missing value')
# Using label encoder
D1_prime_le = D1_prime.apply(le.fit_transform)
D1_prime_le.head()
# split the data into training and test sets
X1_train, X1_test, y1_train, y1_test = model_select.train_test_split( D1_prime_le.iloc[:,:D1_prime_le.shape[1]-1], D1_prime_le['class'], random_state=0 )
M1_train = len( X1_train )
M1_test = len( X1_test )
# fit the tree model to the training data
clf.fit( X1_train, y1_train )
# predict the labels for the test set
y1_hat = clf.predict( X1_test )
# Compute the error rate
zero_one_loss(y1_test, y1_hat)

0.1486187845303868

In [19]:
## b) Constructing D2' dataset and train the decision tree

# Filling all NaN values with the most popular value ('mode')
D2_prime = D_prime.copy()
D2_prime_columns_with_na = D2_prime.columns[D2_prime.isnull().any()].tolist()  # get a list of columns with null values
for column in D2_prime_columns_with_na:
    D2_prime[column].fillna(D2_prime[column].mode()[0], inplace=True)
## Decision tree process for D2_prime ('dataset filled with the <mode> when value was missing')
# Using label encoder
D2_prime_le = D2_prime.apply(le.fit_transform)
D2_prime_le.head()
# split the data into training and test sets
X2_train, X2_test, y2_train, y2_test = model_select.train_test_split( D2_prime_le.iloc[:,:D2_prime_le.shape[1]-1], D2_prime_le['class'], random_state=0 )
M2_train = len( X2_train )
M2_test = len( X2_test )
# fit the tree model to the training data
clf.fit( X2_train, y2_train )
# predict the labels for the test set
y2_hat = clf.predict( X2_test )
# Compute the error rate
zero_one_loss(y2_test, y2_hat)

0.16132596685082878