# Assignment 1B by Group 28
John Lindblad and Olle Lindgren

In [1]:
# imports
import pandas as pd
import pprint

# scikit learn tools
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# scikit learn classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz

## Task 1: Working with a dataset with categorical features

### Step 1: Reading the data

In [2]:
adult_train = pd.read_csv('adult_train.csv')
adult_test = pd.read_csv('adult_test.csv')
#print(adult_train)
features = ['age', 'workclass', 'fnlwgt','education', 'education-num', 
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
target = 'target'
Xtrain = adult_train[features]
#print(Xtrain)
Ytrain = adult_train[target]
#print(Ytrain)
Xtest = adult_test[features]
#print(Xtest)
Ytest = adult_test[target]
#print(Ytest)

### Step 2: Encoding the features as numbers

In [3]:
#convert into lists of dictionaries
Xtrain_dict = Xtrain.to_dict('records')
#print(Xtrain_dict[:5])
Xtest_dict = Xtest.to_dict('records')
#print(Xtest_dict[:5])
#Yrain_dict = Ytrain.to_dict('records')

# make and apply a DictVectorizer to training data
dv = DictVectorizer()
Xtrain_encoded = dv.fit_transform(Xtrain_dict)
#print(Xtrain_encoded)
#print(type(Xtrain_encoded))

# apply the same DictVectorizer to the test data
Xtest_encoded = dv.fit_transform(Xtest_dict)
#print(Xtrain_encoded)
#print(type(Xtrain_encoded))

In [4]:
Ytrain_dict = Xtrain.to_dict('records')
Ytrain_encoded = dv.fit_transform(Ytrain_dict)

### Step 3: Combining the steps 

In [5]:
# building a pipeline
pipeline = make_pipeline(
    DictVectorizer(),
    DecisionTreeClassifier()
)

In [6]:
# testing the pipeline

#pipeline.score(Xtrain_dict, Ytrain)
# it works but I have no idea what the score tells me

pipeline.fit(Xtrain_dict, Ytrain)
# no errors given, I assume that it works

Pipeline(memory=None,
         steps=[('dictvectorizer',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort=False, random_state=None,
                                        splitter='best'))],
         verbose=False)

## Task 2: Decision trees and random forests

### Underfitting and overfitting in decision tree classifiers

In [9]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(Xtrain_encoded, Ytrain)
cvs_tree = cross_val_score(clf_tree, Xtrain_encoded, Ytrain, cv=5, error_score='raise').mean()
print(f'The cross-validation score on the test data is: {cvs_tree}')

The cross-validation score on the test data is: 0.816651840903338


In [10]:
clf_tree.predict(Xtest_encoded)

ValueError: Number of features of the model must match the input. Model n_features is 108 and input n_features is 107 

### Underfitting and overfitting in random forest classifiers

In [11]:
clf_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf_forest.fit(Xtrain_encoded, Ytrain)
cvs_forest = cross_val_score(clf_forest, Xtrain_encoded, Ytrain, cv=5, error_score='raise').mean()
print(f'The cross-validation score on the test data is: {cvs_forest}')

The cross-validation score on the test data is: 0.8562084487234187


In [12]:
clf_forest.predict(Xtest_encoded)

ValueError: Number of features of the model must match the input. Model n_features is 108 and input n_features is 107 

## Task 3: Feature importance in random forest classifiers

The code below prints a sorted list of the importance scores for all features starting with the most important one.

In [28]:
# for the random forest classifier

importances = clf_forest.feature_importances_
names = dv.feature_names_
output = list(zip(names, importances))
#print(output)
output_sorted = sorted(output, key=lambda tup: tup[1], reverse=True)
pp = pprint.PrettyPrinter()
pp.pprint(output_sorted)

[('fnlwgt', 0.16317149714170215),
 ('age', 0.15054726082921696),
 ('capital-gain', 0.09637212074042766),
 ('hours-per-week', 0.08091017186889544),
 ('marital-status=Married-civ-spouse', 0.06585321397601554),
 ('education-num', 0.06434497224560137),
 ('relationship=Husband', 0.036384604689572976),
 ('capital-loss', 0.029816983326129975),
 ('marital-status=Never-married', 0.02216510335421959),
 ('occupation=Exec-managerial', 0.018619016184303033),
 ('occupation=Prof-specialty', 0.014394471097159417),
 ('sex=Male', 0.013966717817257761),
 ('education=Bachelors', 0.012165048038995522),
 ('relationship=Own-child', 0.012091939035980034),
 ('relationship=Not-in-family', 0.010316620015908457),
 ('workclass=Private', 0.010261330515381093),
 ('relationship=Wife', 0.009552587483594747),
 ('education=Masters', 0.00921115957579494),
 ('education=HS-grad', 0.008083910824218154),
 ('workclass=Self-emp-not-inc', 0.007862014244281916),
 ('sex=Female', 0.007333877544942146),
 ('occupation=Sales', 0.0068

In [32]:
# for the decision tree classifier

importances = clf_tree.feature_importances_
names = dv.feature_names_
output = list(zip(names, importances))
#print(output)
output_sorted = sorted(output, key=lambda tup: tup[1], reverse=True)
pp = pprint.PrettyPrinter()
pp.pprint(output_sorted)

[('marital-status=Married-civ-spouse', 0.19777128189698667),
 ('fnlwgt', 0.1859276956949833),
 ('education-num', 0.1154073860373249),
 ('age', 0.10884294239565086),
 ('capital-gain', 0.10542664735421763),
 ('hours-per-week', 0.06710770995490783),
 ('capital-loss', 0.03805798444297408),
 ('workclass=Private', 0.009601909781509264),
 ('occupation=Exec-managerial', 0.009564695509640155),
 ('occupation=Sales', 0.009537340293728267),
 ('workclass=Self-emp-not-inc', 0.009006343712499419),
 ('occupation=Prof-specialty', 0.007461160132492061),
 ('occupation=Craft-repair', 0.006966715115477867),
 ('occupation=Machine-op-inspct', 0.0057508687350900185),
 ('workclass=Local-gov', 0.005562530825141868),
 ('occupation=Transport-moving', 0.005306526229640213),
 ('occupation=Adm-clerical', 0.004885297665694993),
 ('workclass=Self-emp-inc', 0.004769948897851323),
 ('race=White', 0.004742232282809435),
 ('relationship=Wife', 0.004522885607051046),
 ('race=Black', 0.004473846575691656),
 ('workclass=Stat