# Adult Income Dataset

In [28]:
import numpy as np 
import pandas as pd 
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [29]:
data = pd.read_csv("../../InputData/AdultDataset/adult.csv")
data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


## Data Cleaning

In [31]:
# find columns with "?" value
col_names = data.columns
num_data = data.shape[0]
for c in col_names:
    num_non = data[c].isin(["?"]).sum()
    if num_non > 0:
        print (c)
        print (num_non)
        print ("{0:.2f}%".format(float(num_non) / num_data * 100))
        print ("\n")

workclass
2799
5.73%


occupation
2809
5.75%


native-country
857
1.75%




In [32]:
data = data[data["workclass"] != "?"]
data = data[data["occupation"] != "?"]
data = data[data["native-country"] != "?"]

data.shape


(45222, 15)

In [34]:
data.to_csv(r"../../InputData/AdultDataset/ForClassification/CleanAdult.csv")

In [35]:
# change categorical values to numeric
# education and education-num are the same, one-to-one mapping
category_col =['workclass', 'race', 'education','marital-status', 'occupation',
               'relationship', 'gender', 'native-country', 'income'] 

for col in category_col:
    b, c = np.unique(data[col], return_inverse=True) 
    data[col] = c

data.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,38,0
1,38,2,89814,11,9,2,4,0,4,1,0,0,50,38,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,38,1
3,44,2,160323,15,10,2,6,0,2,1,7688,0,40,38,1
5,34,2,198693,0,6,4,7,1,4,1,0,0,30,38,0


In [48]:
data.to_csv(r"../../InputData/AdultDataset/ForClassification/CleanAdult_numerical.csv", index=False)


## Prediction

In [37]:
predictors = ['age','workclass','education','educational-num',
              'marital-status', 'occupation','relationship','race','gender',
              'capital-gain','capital-loss','hours-per-week', 'native-country']

high_income = data[data['income'] == 1]
low_income = data[data['income'] == 0]



X_train, X_test, y_train, y_test = train_test_split(data[predictors], data['income'], test_size=0.5, random_state=1)




In [38]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [39]:
num = 0
for index, row in X_test.iterrows():
    if clf.predict([row.to_list()]) != y_test.loc[index]:
        num = num + 1
        print ('-----------------')
        print (str(num) + '\n')
        print (row)
        
print(num)
print((1-num/len(X_test))*100)



-----------------
1

age                  40
workclass             2
education            15
educational-num      10
marital-status        4
occupation            3
relationship          1
race                  4
gender                0
capital-gain          0
capital-loss       1340
hours-per-week       40
native-country       38
Name: 9107, dtype: int64
-----------------
2

age                43
workclass           2
education          11
educational-num     9
marital-status      2
occupation         13
relationship        0
race                1
gender              1
capital-gain        0
capital-loss        0
hours-per-week     99
native-country     38
Name: 35162, dtype: int64
-----------------
3

age                34
workclass           2
education          11
educational-num     9
marital-status      2
occupation          2
relationship        0
race                4
gender              1
capital-gain        0
capital-loss        0
hours-per-week     50
native-country     38
Na

In [40]:
X_test_res = X_test.copy()

In [41]:
X_test_res['act'] = y_test

In [42]:
X_test_res

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,act
42093,64,2,15,10,3,3,4,4,0,0,0,40,38,0
31739,56,2,6,5,0,0,1,4,1,0,0,40,38,0
10773,36,2,6,5,2,5,0,4,1,0,0,60,38,0
15850,19,2,11,9,4,5,1,2,1,0,0,37,38,0
40645,32,2,7,12,2,9,0,4,1,0,0,45,38,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34408,36,2,11,9,0,6,4,4,1,0,0,40,38,0
13238,41,4,15,10,2,2,0,4,1,0,0,40,38,0
31432,27,0,7,12,2,0,0,4,1,4064,0,40,7,0
19093,40,2,11,9,2,11,5,4,0,5013,0,20,38,0


In [43]:
X_test_res['pred'] = X_test.apply(lambda x: clf.predict([x.to_list()]), axis=1)

In [44]:
mis_class=X_test_res.loc[X_test_res['act']!=X_test_res['pred']]

In [45]:
mis_class.describe()

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,act
count,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0
mean,43.093088,2.245161,10.697926,10.786175,2.127189,6.293779,0.842857,3.736636,0.812903,205.405069,52.154147,44.446083,36.249309,0.496774
std,11.101165,1.068214,3.183306,2.404362,1.07495,4.147445,1.588507,0.785613,0.390034,2076.692283,310.142974,11.73044,6.520064,0.500047
min,18.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
25%,35.0,2.0,9.0,9.0,2.0,3.0,0.0,4.0,1.0,0.0,0.0,40.0,38.0,0.0
50%,42.0,2.0,11.0,10.0,2.0,6.0,0.0,4.0,1.0,0.0,0.0,40.0,38.0,0.0
75%,51.0,2.0,12.0,13.0,2.0,10.0,1.0,4.0,1.0,0.0,0.0,50.0,38.0,1.0
max,90.0,6.0,15.0,16.0,6.0,13.0,5.0,4.0,1.0,99999.0,3175.0,99.0,39.0,1.0


In [47]:
# mis-classified data
mis_class.to_csv(r"../../InputData/AdultDataset/ForClassification/CleanAdult_numerical_mis.csv", index=False)


## show case of a group

In [15]:
X_test_min = X_test[(X_test['workclass'] == 2) & 
                    (X_test['race'] == 4) & 
                    (X_test['gender'] == 1)& 
                    (X_test['educational-num'] == 9)& 
                    (X_test['native-country'] == 38)]

In [16]:
mis_class_min = mis_class[(mis_class['workclass'] == 2) & 
                          (mis_class['race'] == 4) & 
                          (mis_class['gender'] == 1) &  
                          (mis_class['educational-num'] == 9)& 
                          (mis_class['native-country'] == 38)]

In [17]:
len(mis_class_min)

632

In [18]:
X_test_min

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
16139,41,2,11,9,2,11,0,4,1,0,0,62,38
34252,34,2,11,9,0,11,1,4,1,0,1669,45,38
30482,30,2,11,9,0,11,4,4,1,1151,0,30,38
9231,34,2,11,9,2,2,0,4,1,0,0,50,38
40753,40,2,11,9,2,11,0,4,1,0,0,40,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43111,53,2,11,9,2,3,0,4,1,0,0,40,38
42923,36,2,11,9,2,3,0,4,1,0,0,40,38
30778,24,2,11,9,4,3,4,4,1,0,0,40,38
47207,56,2,11,9,2,5,0,4,1,0,0,18,38


In [19]:
len(mis_class)

4298

In [20]:
acc = (1-len(mis_class)/len(X_test))*100

In [21]:
acc

80.99155278404317

In [22]:
X_test_min

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
16139,41,2,11,9,2,11,0,4,1,0,0,62,38
34252,34,2,11,9,0,11,1,4,1,0,1669,45,38
30482,30,2,11,9,0,11,4,4,1,1151,0,30,38
9231,34,2,11,9,2,2,0,4,1,0,0,50,38
40753,40,2,11,9,2,11,0,4,1,0,0,40,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43111,53,2,11,9,2,3,0,4,1,0,0,40,38
42923,36,2,11,9,2,3,0,4,1,0,0,40,38
30778,24,2,11,9,4,3,4,4,1,0,0,40,38
47207,56,2,11,9,2,5,0,4,1,0,0,18,38


In [23]:
X_test_res_min =  X_test_res[(X_test_res['workclass'] == 2) & 
                          (X_test_res['race'] == 4) & 
                          (X_test_res['gender'] == 1)]

In [24]:
X_test_res_min_mis_class = X_test_res_min.loc[X_test_res_min['act']!=X_test_res_min['pred']]

In [25]:
X_test_res_min_mis_class

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,act,pred
9231,34,2,11,9,2,2,0,4,1,0,0,50,38,1,[0]
42467,33,2,12,14,2,9,0,4,1,0,0,45,38,1,[0]
40753,40,2,11,9,2,11,0,4,1,0,0,40,38,1,[0]
37051,33,2,11,9,2,11,0,4,1,0,1672,55,38,0,[1]
46327,29,2,9,13,2,11,0,4,1,0,0,40,38,0,[1]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17586,37,2,9,13,2,6,0,4,1,0,0,43,38,1,[0]
461,36,2,11,9,2,2,0,4,1,0,0,40,32,0,[1]
42946,66,2,15,10,2,13,0,4,1,0,1258,20,38,0,[1]
1014,48,2,11,9,2,6,0,4,1,0,0,40,5,0,[1]


In [26]:
min_acc = (1 - len(mis_class_min)/len(X_test_min))*100

In [27]:
min_acc

79.99366888255777

## COMPAS

In [28]:
compas_data = pd.read_csv("RecidivismData_Original-categorized.csv")
compas_data.head(10)

Unnamed: 0,name,FirstName,LastName,sex,DateOfBirth,age,race,juv_fel_count,decile_score,juv_misd_count,...,end,event,two_year_recid,Violence_score,Recidivism_score,MarriageStatus,sexC,ageC,raceC,MC
0,miguel hernandez,miguel,hernandez,Male,4/18/1947,69,Other,0,1,0,...,327,0,0,-4.31,-2.78,Single,0,3,3,0
1,kevon dixon,kevon,dixon,Male,1/22/1982,34,African-American,0,3,0,...,159,1,1,-3.07,-0.76,Single,0,1,0,0
2,ed philo,ed,philo,Male,5/14/1991,24,African-American,0,4,0,...,63,0,1,-1.31,0.44,Single,0,1,0,0
3,marcu brown,marcu,brown,Male,1/21/1993,23,African-American,0,8,1,...,1174,0,0,-1.59,0.16,Single,0,1,0,0
4,bouthy pierrelouis,bouthy,pierrelouis,Male,1/22/1973,43,Other,0,1,0,...,1102,0,0,-3.23,-2.2,Married,0,2,3,1
5,marsha miles,marsha,miles,Male,8/22/1971,44,Other,0,1,0,...,853,0,0,-3.3,-1.93,Separated,0,2,3,2
6,edward riddle,edward,riddle,Male,7/23/1974,41,Caucasian,0,6,0,...,40,1,1,-2.89,-0.16,Single,0,2,1,0
7,steven stewart,steven,stewart,Male,2/25/1973,43,Other,0,4,0,...,265,0,0,-2.44,-0.72,Married,0,2,3,1
8,bo bradac,bo,bradac,Male,6/10/1994,21,Caucasian,0,3,0,...,428,1,1,-1.84,-0.79,Single,0,1,1,0
9,benjamin franc,benjamin,franc,Male,6/1/1988,27,Caucasian,0,4,0,...,857,0,0,-2.12,-0.69,Single,0,1,1,0


In [30]:
compas_data.shape

(6889, 30)

In [31]:
compas_data.columns

Index(['name', 'FirstName', 'LastName', 'sex', 'DateOfBirth', 'age', 'race',
       'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
       'priors_count', 'days_b_screening_arrest', 'c_days_from_compas',
       'c_charge_degree', 'is_recid', 'is_violent_recid', 'v_decile_score',
       'priors_count_C', 'start', 'end', 'event', 'two_year_recid',
       'Violence_score', 'Recidivism_score', 'MarriageStatus', 'sexC', 'ageC',
       'raceC', 'MC'],
      dtype='object')

In [64]:
compas_data['c_days_from_compas'].value_counts()

1.0       4248
0.0        857
2.0        206
3.0         91
4.0         78
          ... 
4696.0       1
1423.0       1
2994.0       1
389.0        1
3279.0       1
Name: c_days_from_compas, Length: 486, dtype: int64

In [65]:
features = ['sexC', 'ageC','raceC', 'MC','priors_count_C','is_violent_recid', 'v_decile_score', 'Violence_score']
X = compas_data[features].copy()
y = compas_data['is_recid'].copy()

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

In [79]:
X_test

Unnamed: 0,sexC,ageC,raceC,MC,priors_count_C,is_violent_recid,v_decile_score,Violence_score
1457,0,1,0,0,4,1,5,-1.22
1380,1,1,0,0,3,0,3,-2.26
2975,0,1,0,0,21,0,9,-2.03
6704,0,1,0,0,0,0,5,-1.96
4053,1,1,0,0,0,1,4,-2.02
...,...,...,...,...,...,...,...,...
251,0,1,0,0,4,0,4,-2.18
3646,0,1,0,0,11,0,5,-1.74
5594,0,1,1,0,3,0,5,-1.82
1090,1,1,0,0,14,0,8,-1.07


In [91]:
clf = DecisionTreeClassifier(max_depth=10)
clf = clf.fit(X_train, y_train)

In [92]:
num = 0
for index, row in X_test.iterrows():
    if clf.predict([row.to_list()]) != y_test.loc[index]:
        num = num + 1
        print ('-----------------')
        print (str(num) + '\n')
        print (row)
        
print(num)
print((1-num/len(X_test))*100)

-----------------
1

sexC                1.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      3.00
is_violent_recid    0.00
v_decile_score      3.00
Violence_score     -2.26
Name: 1380, dtype: float64
-----------------
2

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      2.00
is_violent_recid    0.00
v_decile_score      4.00
Violence_score     -2.07
Name: 2406, dtype: float64
-----------------
3

sexC                0.00
ageC                1.00
raceC               1.00
MC                  0.00
priors_count_C      0.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -2.82
Name: 2478, dtype: float64
-----------------
4

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      0.00
is_violent_recid    0.00
v_decile_score      8.00
Violence_score     -1.15
Name: 2973, dtype: float64
--------

-----------------
156

sexC                0.00
ageC                2.00
raceC               1.00
MC                  0.00
priors_count_C      0.00
is_violent_recid    0.00
v_decile_score      2.00
Violence_score     -2.88
Name: 6636, dtype: float64
-----------------
157

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      2.00
is_violent_recid    0.00
v_decile_score      5.00
Violence_score     -1.98
Name: 1309, dtype: float64
-----------------
158

sexC                0.00
ageC                3.00
raceC               1.00
MC                  0.00
priors_count_C      0.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -4.01
Name: 553, dtype: float64
-----------------
159

sexC                0.00
ageC                1.00
raceC               0.00
MC                  1.00
priors_count_C      2.00
is_violent_recid    0.00
v_decile_score      2.00
Violence_score     -2.92
Name: 4853, dtype: float64
-

Name: 5205, dtype: float64
-----------------
291

sexC                1.0
ageC                1.0
raceC               0.0
MC                  0.0
priors_count_C      1.0
is_violent_recid    0.0
v_decile_score      4.0
Violence_score     -2.2
Name: 1014, dtype: float64
-----------------
292

sexC                1.00
ageC                2.00
raceC               0.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -3.81
Name: 1409, dtype: float64
-----------------
293

sexC                0.00
ageC                1.00
raceC               1.00
MC                  0.00
priors_count_C      2.00
is_violent_recid    0.00
v_decile_score      7.00
Violence_score     -1.51
Name: 2654, dtype: float64
-----------------
294

sexC                0.00
ageC                1.00
raceC               0.00
MC                  1.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -3.19
Name: 32

Name: 870, dtype: float64
-----------------
422

sexC                0.00
ageC                1.00
raceC               1.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -3.29
Name: 2363, dtype: float64
-----------------
423

sexC                0.00
ageC                1.00
raceC               1.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -3.34
Name: 5579, dtype: float64
-----------------
424

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      3.00
Violence_score     -2.39
Name: 3948, dtype: float64
-----------------
425

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      8.00
is_violent_recid    0.00
v_decile_score      6.00
Violence_score     -1.59
N

-----------------
555

sexC                0.0
ageC                1.0
raceC               0.0
MC                  0.0
priors_count_C      0.0
is_violent_recid    0.0
v_decile_score      6.0
Violence_score     -1.6
Name: 3271, dtype: float64
-----------------
556

sexC                 1.00
ageC                 2.00
raceC                0.00
MC                   0.00
priors_count_C      10.00
is_violent_recid     0.00
v_decile_score       6.00
Violence_score      -1.69
Name: 2968, dtype: float64
-----------------
557

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      7.00
Violence_score     -1.31
Name: 6795, dtype: float64
-----------------
558

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      2.00
is_violent_recid    0.00
v_decile_score      2.00
Violence_score     -2.56
Name: 5702, dtype: float64


-----------------
673

sexC                0.00
ageC                1.00
raceC               0.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      5.00
Violence_score     -1.87
Name: 2336, dtype: float64
-----------------
674

sexC                1.00
ageC                2.00
raceC               1.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -3.61
Name: 12, dtype: float64
-----------------
675

sexC                 0.00
ageC                 3.00
raceC                0.00
MC                   0.00
priors_count_C      15.00
is_violent_recid     0.00
v_decile_score       3.00
Violence_score      -2.32
Name: 597, dtype: float64
-----------------
676

sexC                 0.0
ageC                 2.0
raceC                0.0
MC                   0.0
priors_count_C      10.0
is_violent_recid     0.0
v_decile_score       1.0
Violence_score      -3.3
Name: 1021, dtype: flo

-----------------
793

sexC                0.00
ageC                1.00
raceC               2.00
MC                  0.00
priors_count_C      2.00
is_violent_recid    0.00
v_decile_score      3.00
Violence_score     -2.47
Name: 1427, dtype: float64
-----------------
794

sexC                 0.00
ageC                 2.00
raceC                0.00
MC                   0.00
priors_count_C      11.00
is_violent_recid     0.00
v_decile_score       5.00
Violence_score      -1.82
Name: 749, dtype: float64
-----------------
795

sexC                0.0
ageC                1.0
raceC               3.0
MC                  0.0
priors_count_C      1.0
is_violent_recid    0.0
v_decile_score      6.0
Violence_score     -1.5
Name: 83, dtype: float64
-----------------
796

sexC                0.00
ageC                1.00
raceC               3.00
MC                  0.00
priors_count_C      0.00
is_violent_recid    0.00
v_decile_score      6.00
Violence_score     -1.66
Name: 6270, dtype: float64
---

-----------------
933

sexC                0.00
ageC                2.00
raceC               2.00
MC                  0.00
priors_count_C      9.00
is_violent_recid    0.00
v_decile_score      6.00
Violence_score     -1.73
Name: 3289, dtype: float64
-----------------
934

sexC                1.00
ageC                0.00
raceC               0.00
MC                  0.00
priors_count_C      1.00
is_violent_recid    0.00
v_decile_score      7.00
Violence_score     -1.33
Name: 2905, dtype: float64
-----------------
935

sexC                 0.0
ageC                 1.0
raceC                0.0
MC                   0.0
priors_count_C      20.0
is_violent_recid     0.0
v_decile_score       5.0
Violence_score      -1.8
Name: 1361, dtype: float64
-----------------
936

sexC                1.00
ageC                0.00
raceC               0.00
MC                  0.00
priors_count_C      0.00
is_violent_recid    0.00
v_decile_score      7.00
Violence_score     -1.49
Name: 3946, dtype: float64


In [82]:
X_test

Unnamed: 0,sexC,ageC,raceC,MC,priors_count_C,is_violent_recid,v_decile_score,Violence_score
1457,0,1,0,0,4,1,5,-1.22
1380,1,1,0,0,3,0,3,-2.26
2975,0,1,0,0,21,0,9,-2.03
6704,0,1,0,0,0,0,5,-1.96
4053,1,1,0,0,0,1,4,-2.02
...,...,...,...,...,...,...,...,...
251,0,1,0,0,4,0,4,-2.18
3646,0,1,0,0,11,0,5,-1.74
5594,0,1,1,0,3,0,5,-1.82
1090,1,1,0,0,14,0,8,-1.07


In [83]:
HF_test = X_test[(X_test['raceC'] == 2) & (X_test['sexC'] == 1)]

In [75]:
HF_test

Unnamed: 0,name,FirstName,LastName,sex,DateOfBirth,age,race,juv_fel_count,decile_score,juv_misd_count,...,end,event,two_year_recid,Violence_score,Recidivism_score,MarriageStatus,sexC,ageC,raceC,MC
50,graciela quevedo,graciela,quevedo,Female,8/15/1952,63,Hispanic,0,1,0,...,800,0,0,-4.23,-2.51,Widowed,1,3,2,3
89,sharon muriente,sharon,muriente,Female,5/12/1971,44,Hispanic,0,1,0,...,955,0,0,-3.71,-2.20,Married,1,2,2,1
127,jessica ruiz,jessica,ruiz,Female,6/15/1985,30,Hispanic,0,5,0,...,37,1,1,-1.91,-0.77,Single,1,1,2,0
243,andrea rojas,andrea,rojas,Female,10/16/1984,31,Hispanic,0,2,0,...,1119,0,0,-2.88,-1.55,Married,1,1,2,1
319,helen carrillo,helen,carrillo,Female,6/9/1992,23,Hispanic,0,3,0,...,824,1,0,-2.10,-1.03,Single,1,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6718,cony bonilla,cony,bonilla,Female,6/21/1991,24,Hispanic,0,3,0,...,1117,0,0,-2.32,-1.20,Single,1,1,2,0
6736,jazmin gonzalez,jazmin,gonzalez,Female,10/12/1993,22,Hispanic,0,4,0,...,1033,0,0,-1.99,-1.01,Single,1,1,2,0
6741,priscilla chavez,priscilla,chavez,Female,3/20/1990,26,Hispanic,0,4,0,...,842,0,0,-2.40,-0.52,Single,1,1,2,0
6809,euridice mullen,euridice,mullen,Female,2/27/1966,50,Hispanic,0,3,0,...,12,1,1,-3.62,-1.30,Separated,1,2,2,2


In [84]:
HF_train = X_train[(X_train['raceC'] == 2) & (X_train['sexC'] == 1)]

In [86]:
HF_train.shape

(57, 8)

In [93]:
num = 0
for index, row in X_test.iterrows():
    if int(row['sexC']) == 1 and int(row['raceC']) == 2:
        if clf.predict([row.to_list()]) != y_test.loc[index]:
            num = num + 1
            print ('-----------------')
            print (str(num) + '\n')
            print (row)
        
print(num)
print((1-num/len(HF_test))*100)

-----------------
1

sexC                1.00
ageC                2.00
raceC               2.00
MC                  1.00
priors_count_C      2.00
is_violent_recid    0.00
v_decile_score      1.00
Violence_score     -4.13
Name: 2692, dtype: float64
-----------------
2

sexC                1.00
ageC                1.00
raceC               2.00
MC                  0.00
priors_count_C      5.00
is_violent_recid    0.00
v_decile_score      2.00
Violence_score     -2.89
Name: 1626, dtype: float64
-----------------
3

sexC                1.00
ageC                1.00
raceC               2.00
MC                  1.00
priors_count_C      3.00
is_violent_recid    0.00
v_decile_score      2.00
Violence_score     -2.63
Name: 2015, dtype: float64
-----------------
4

sexC                1.00
ageC                1.00
raceC               2.00
MC                  0.00
priors_count_C      0.00
is_violent_recid    0.00
v_decile_score      7.00
Violence_score     -1.41
Name: 910, dtype: float64
---------