In [62]:
#import dependencies 

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [63]:
# read in CSV

raw_economic_2017_data = pd.read_csv('Machine_Learning_Dataset_2015.csv')
raw_economic_2017_data.head(40)



Unnamed: 0,COU,Country,Avg. Work Hours (Annual),Avg. Wages (Annual),Mortality Causes,GDP (constant 2010 US$),GINI index (World Bank estimate),"Literacy rate, adult total (% of people ages 15 and above)",Population density (people per sq. km of land area),"Probability of dying at age 5-14 years (per 1,000 children age 5)",Year
0,AUS,Australia,1683.55,53946.26631,322108.1,1312200000000.0,0.0,0.0,3.100113,0.9,2015
1,AUT,Austria,1500.0,50373.81106,171362.0,413555000000.0,30.5,0.0,104.730793,0.9,2015
2,BEL,Belgium,1545.0,52120.10915,226377.6,509186000000.0,27.7,0.0,372.33144,0.9,2015
3,CAN,Canada,1712.0,48979.30251,532929.5,1802510000000.0,0.0,0.0,3.940449,1.1,2015
4,CHE,Switzerland,1589.5,64222.83942,139679.3,634045000000.0,32.3,0.0,209.596007,0.7,2015
5,CHL,Chile,1988.0,26759.21876,210741.0,264555000000.0,47.7,96.87413,23.889598,1.5,2015
6,CZE,Czech Republic,1756.0,23962.81256,228471.2,225493000000.0,25.9,0.0,136.589289,0.9,2015
7,DEU,Germany,1369.8,48034.59454,1856242.1,3718480000000.0,31.7,0.0,234.153669,0.8,2015
8,DNK,Denmark,1407.0,54196.8862,109631.9,343294000000.0,28.2,0.0,135.353251,0.6,2015
9,ESP,Spain,1699.6,40087.62768,849930.3,1420990000000.0,36.2,98.143257,92.952683,0.8,2015


In [64]:
# remove any columns that we don't want to use as features in this model

economic_2017_data = raw_economic_2017_data.drop(["Literacy rate, adult total (% of people ages 15 and above)",
                                                 "Country","COU","Year"], axis=1)
economic_2017_data.head()


Unnamed: 0,Avg. Work Hours (Annual),Avg. Wages (Annual),Mortality Causes,GDP (constant 2010 US$),GINI index (World Bank estimate),Population density (people per sq. km of land area),"Probability of dying at age 5-14 years (per 1,000 children age 5)"
0,1683.55,53946.26631,322108.1,1312200000000.0,0.0,3.100113,0.9
1,1500.0,50373.81106,171362.0,413555000000.0,30.5,104.730793,0.9
2,1545.0,52120.10915,226377.6,509186000000.0,27.7,372.33144,0.9
3,1712.0,48979.30251,532929.5,1802510000000.0,0.0,3.940449,1.1
4,1589.5,64222.83942,139679.3,634045000000.0,32.3,209.596007,0.7


In [65]:
# random trees can't take floats, so I'm changing all data types to integer

rev_economic_2017_data = economic_2017_data.astype('int64')


In [66]:
# assign X (data) and y (target)

X = rev_economic_2017_data.drop("Probability of dying at age 5-14 years (per 1,000 children age 5)", axis=1)
y = rev_economic_2017_data["Probability of dying at age 5-14 years (per 1,000 children age 5)"]
print(X.shape, y.shape)


(32, 6) (32,)


In [70]:
# establish variable to hold feature names (the names of the remaining columns)

feature_names = X.columns


In [67]:
# split data into training and testing 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [68]:
# create a random forest classifier

rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)


0.75

In [69]:
# calculate feature importance

importances = rf.feature_importances_
importances


array([0.32046109, 0.21499909, 0.08024446, 0.12213576, 0.11072782,
       0.15143178])

In [72]:
# sort the features by their importance

sorted(zip(rf.feature_importances_, feature_names), reverse=True)


[(0.32046109079819385, 'Avg. Work Hours (Annual)'),
 (0.21499909034378878, 'Avg. Wages (Annual)'),
 (0.15143177940054872, 'Population density (people per sq. km of land area)'),
 (0.12213575580228783, 'GDP (constant 2010 US$)'),
 (0.11072782490961557, 'GINI index (World Bank estimate)'),
 (0.08024445874556523, 'Mortality Causes')]