# EXTRA CREDIT HOMEWORK 

In [115]:
from __future__ import print_function, division
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import multivariate_normal
import seaborn as sns; sns.set()
from demo import fairness_demo

from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Problem 1  [25 points]
Consider the four datasets returned from the function get_dataset($d$) for $d=2,3,4, \& \;5$. Here $d$ is the dimensionality of the non-sensitive covariates, which are returned in the matrix $X$, whereas the vectors $y$ and $x_{\rm{sensitive}}$ store the target labels and sensitive covariate, respectively. (As such the structure of the data is exactly analagous to what we have in the fairness demo notebook from the lecture.)

In [14]:
def get_gaussian_data(mean, cov, class_label, n_samples):
    nv = multivariate_normal(mean = mean, cov = cov)
    X = nv.rvs(n_samples)
    y = np.ones(n_samples, dtype=float) * class_label
    return X,y

### this function returns the dataset
### e.g. X, y, x_sensitive = get_dataset(2)

def get_dataset(d):
    np.random.seed(5)
    mu1 = 0.5*np.ones(d)
    mu2 = -0.5*np.ones(d)
    sigma1 = np.eye(d)
    sigma2 = np.eye(d)
    X1, y1 = get_gaussian_data(mu1, sigma1, 1, 10000*d) # positive class
    X2, y2 = get_gaussian_data(mu2, sigma2, -1, 10000*d) # negative class

    X = np.vstack((X1, X2)) # non-sensitive covariates
    y = np.hstack((y1, y2)) # class labels
    x_sensitive = np.ones(X.shape[0])
    x_sensitive[X[:,0]<0.0] = 0 # sensitive covariate; 
                                # 0 is the protected class, 1 is the non-protected class
    return X, y, x_sensitive

# part a)  [10 points]
Using the logistic regression classifier provided by the python class fairness_demo (just like in the lecture notebook) calculate the accuracy and p%-rule ratio for all four datasets using the unconstrained classifier (i.e. no fairness constraints are imposed).

# part b) [10 points]
Note that for all four datasets the "four-fifths rule" is very much not satisfied. For each dataset impose the minimum fairness constraint such that the four-fiths rule is satisfied. What is the loss in accuracy for each dataset as compared to the unconstrained classifier performance?

# part c) [5 points]
Notice that (as least as far as the four datasets for $d=2,3,4, \& \;5$ are concerned) as the dimension $d$ increases  the following things happen:

- the accuracy increases
- the p%-rule ratio for the unconstrained classifier increases
- the accuracy losses as calculated in part b decrease (at least approximately up to fluctuations)

Look at the function get_dataset($d$) and consider how the generated dataset changes as a function of $d$. Do you expect the behavior described above to continue for all values of $d>5$? If so, explain why. If not, explain why not.
<br><br>

## (a)

In [30]:
def train_unconstrained(X,y,x_sensitive):
    fd_unconstrained=fairness_demo()
    w_unconstrained, p_rule_unconstrained, accuracy_unconstrained = fd_unconstrained.train(X,y,x_sensitive,-1.0)
    print ("accuracy = ", accuracy_unconstrained)
    print ("p%-rule ratio = ", p_rule_unconstrained)

d = [2,3,4,5]

for i in d:
    print ("when d =", i)
    train_unconstrained(get_dataset(i)[0],get_dataset(i)[1],get_dataset(i)[2])
    print ("")
    

when d = 2
accuracy =  75.6025
p%-rule ratio =  26.794234903

when d = 3
accuracy =  80.7233333333
p%-rule ratio =  33.6578812069

when d = 4
accuracy =  84.12375
p%-rule ratio =  36.8799868285

when d = 5
accuracy =  86.947
p%-rule ratio =  39.2738406591



## (b)

In [28]:
def train_constrained(X,y,x_sensitive,fairness_constraint):
    
    fd_unconstrained=fairness_demo()
    w_unconstrained, p_rule_unconstrained, accuracy_unconstrained = fd_unconstrained.train(X,y,x_sensitive,-1.0)
    
    fd_constrained=fairness_demo()
    w_constrained, p_rule_constrained, accuracy_constrained = fd_constrained.train(X,y,x_sensitive,fairness_constraint)
    
    print ("p%-rule ratio constrained =", p_rule_constrained)
    print ("accuracy constrained = ", accuracy_constrained)
    print ("accuracy lose = ", accuracy_unconstrained-accuracy_constrained)
    
for i in d:
    print ("when d =", i)
    train_constrained(get_dataset(i)[0],get_dataset(i)[1],get_dataset(i)[2],0.01)
    print ("")

when d = 2
p%-rule ratio constrained = 94.3642038571
accuracy constrained =  65.2675
accuracy lose =  10.335

when d = 3
p%-rule ratio constrained = 93.5174069628
accuracy constrained =  70.7333333333
accuracy lose =  9.99

when d = 4
p%-rule ratio constrained = 91.3127257692
accuracy constrained =  74.12125
accuracy lose =  10.0025

when d = 5
p%-rule ratio constrained = 89.4580749414
accuracy constrained =  76.831
accuracy lose =  10.116



## (c) Answer:

I think when d is more than 5, the three condition will keep the same but the speed of increase or decrease will be slow and the value will be similar. The reason is that the gap between protected value and unprotect value is getting smaller and smaller when dimension increase. 

# Problem 2  [10 points]

Read the following review by Barocas and Selbst (or as much of the review as you find interesting):

http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2477899

Answer the following questions on the basis of what you've read.

# part a)

Consider the logic underlying the "fairness aware" classifier we explored in the previous problem. Consider the principle of nondiscrimination versus the principle of antisubordination. Which (if any) of the two principles is more in line with the approach taken by the algorithm? Why?

# part b)

Consider the "fairness aware" classifier we explored in the previous problem. In itself does it offer a solution to the problem of "masking" as described in the review?
<br><br>

## (a) Answer: 
It is stated that nondiscrimination aims to eliminate the unfairness between individuals due to the decision makers choose in a protected group. On the contrary, anti-subordination theory preserve the principles in a core perspective that try to eliminate the individuals status on being a member of different groups. Both of them try to decrease the gap between the elimination of the individuals from their group features. Once the gap between the protected and the unprotected values become closer.

## (b) Answer:

It is stated that 'masks efforts to engage in intentional discrimination by abstracting to a level of analysis that fails to capture lower level variations that might otherwise make certain members of protected classes into more attractive candidates.' It shows us that masking provide to ignore lower level changes or variations in data. It tries to eliminate fluctuation in data. It also enables to gap between the protected and unprotected value discrimination.

# Problem 3  [5 points]

Consider the following illustration of a dataset in which the positive target labels are marked with plus signs, the green points constitute the non-protected class, and the blue points constitute the protected class. The distribution of the non-protected class is illustrated on the left, the distribution of the protected class is illustrated in the middle, and the graphic on the right shows the combined dataset.

<img src="dataset.png">

Consider applying the "fairness aware" classifier in Problem 1 to the combined dataset, imposing fairness constraints such that the four-fifths rule is satisfied. Do you expect the loss of accuracy as you go from the unconstrained to the constrained classifier to be large or small? Why?


<br><br>

## Answer:

In the satisfaction of fairness or four-fifths rule and relte them with the given illustrated figure, it is obvious that majority and minority and overall population has different dynamics. In this perspective when thinking about the accuracy and constrained or unconstrained calssifiers, some biases in itself creates an accuracy problem. However loss of accuracy will decrease.

# Problem 4  [15 points]
#### (This is question 2.2 from Dunning's book)

In a study of the effect of police presence on the incidence of crime, Di Tella and Schargrodsky (2004) write:

“Following a terrorist attack on the main Jewish center in Buenos Aires, Argentina, in July
1994, all Jewish institutions received police protection… Because the geographical
distribution of these institutions can be presumed to be exogenous in a crime regression, this hideous event constitutes a natural experiment.”

The authors find that blocks which were allocated extra police forces due to the presence of a Jewish institution experienced lower motor vehicle theft rates.  The control group consists of blocks in the same neighborhoods that do not have Jewish institutions.

Answer the following three questions __in at least 6-10 sentences__.

### part a) 

What do the authors mean by “presumed exogenous in a crime regression” and what is the relationship to as-if random assignment?  
### part b) 
What are some potential threats to as-if random assignment?  [give at least two examples of potential threats]
### part c) 
How might these threats be evaluated empirically?
<br><br><br><br>

## Answer:

Presumed Exogenous will preserve some royality interms of the data analysis period or creating th perior beliefs to the data. This adresses to the selecting data in a predefined condition and their affects on the categorization. In addition to that in a af -if scenerio there will be real selected group and another group for validating it. The aim will be creating relation between the studied and tested group. However there will be some error in validation process so as if scenerio has includes some biases in its nature. As a result, controled data can be indicator of assesign the succes of the data which is selected as a as-if method.



# Problem 5  [45 points]
Consider the Titanic dataset below

In [125]:
data=pd.read_csv("https://serv.cusp.nyu.edu/classes/ML_2016_Spring/Bonus/titanic3.csv");
print("Here are the first three rows:")
data.iloc[0:3,:]

Here are the first three rows:


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## Data dictionary
NAME: titanic3<br>
SIZE: 1309 Passengers, 14 Variables<br><br>

VARIABLE DESCRIPTIONS<br>
Pclass: Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) <br>
survival: Survival (0 = No; 1 = Yes)<br>
name: Name<br>
sex: Sex<br>
age: Age<br>
sibsp: Number of Siblings/Spouses Aboard<br>
parch: Number of Parents/Children Aboard<br>
ticket: Ticket Number<br>
fare: Passenger Fare (British pound)<br>
cabin: Cabin<br>
embarked: Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)<br>
boat: Lifeboat<br>
body: Body Identification Number<br>
home.dest: Home/Destination

# part a) [30 points]

Your goal is to train a classifier for the binary attribute “survival" using age, sex, pclass, sibsp, and parch as features. You will do so using three different machine learning techniques:

i) Naive Bayes Classification. [10 points]

ii) Support Vector Machine. Try a linear SVM with soft margins as well as kernel SVM with polynomial and Gaussian kernels. Make sure to use a validation set to choose hyperparameters for each model where applicable. [10 points]

iii) Random Forest Classification. [10 points]

For each of the three models report out-of-sample accuracy--in order to do so, you will of course need to split the dataset into a training dataset and a test dataset.

# part b)  [15 points]

Repeat the exercise in part a, this time using cross validation. Report the mean accuracy for each model after doing 10 random splits of the data into train and test sets.

## (a)

In [126]:
data = data.dropna(subset=['age'])

target = data['survived']

X = data[['age', 'sex', 'pclass', 'sibsp', 'parch']]
X = pd.get_dummies(X)
X_train, X_test, target_train, target_test = train_test_split(X, target, test_size=0.3, random_state=1)

In [131]:
# Naive Bayes Classification

gnb = GaussianNB()
gnb.fit(X_train, target_train)
pred = gnb.predict(X_test)

print("Accuracy by Naive Bayes =", 1.0*sum(target_test==pred)/len(pred))

Accuracy by Naive Bayes = 0.837579617834


In [128]:
# SVM

svc = SVC(kernel='linear')
svc.fit(X_train, target_train)
pred = svc.predict(X_test)
print("Accuracy by SVM with linear =", 1.0*sum(target_test==pred)/len(pred))

svc = SVC(kernel='poly', max_iter=-1, tol=.01, degree=2)
svc.fit(X_train, target_train)
pred = svc.predict(X_test)
print("Accuracy by SVM with poly =", 1.0*sum(target_test==pred)/len(pred))

svc = SVC(kernel='rbf')
svc.fit(X_train, target_train)
pred = svc.predict(X_test)
print("Accuracy by SVM with rbf =", 1.0*sum(target_test==pred)/len(pred))

Accuracy by SVM with linear = 0.828025477707
Accuracy by SVM with poly = 0.859872611465
Accuracy by SVM with rbf = 0.837579617834


In [129]:
# Random Forest

clf = RandomForestClassifier(n_jobs=-1, n_estimators=1000)
clf = clf.fit(X_train, target_train)
pred = clf.predict(X_test)
print("Accuracy by Random Forest =", 1.0*sum(target_test==pred)/len(pred))

Accuracy by Random Forest = 0.792993630573


## (b)

In [132]:
for clf, name in [
    (GaussianNB(), 'Naive Bayes'), 
    (SVC(kernel='linear'), 'SVM Linear'),
    (SVC(kernel='poly', tol=.1, degree=2), 'SVM Poly'),
    (SVC(kernel='rbf'), 'SVM RBF'),
    (RandomForestClassifier(n_jobs=-1, n_estimators=500), 'Random Forest')
]:
    scores = cross_val_score(clf, X, target, cv=10, scoring='accuracy')
    print("Average Accuracy of", name, "=", scores.mean())

Average Accuracy of Naive Bayes = 0.783547779082
Average Accuracy of SVM Linear = 0.778813346847
Average Accuracy of SVM Poly = 0.804610761407
Average Accuracy of SVM RBF = 0.781606031509
Average Accuracy of Random Forest = 0.735707173086
