## The Data: Pima Indian Classification

### Problem statement 
Using the UCI PIMA Indian Diabetes dataset to predict a person has diabetes or not using the medical attributes provided. (Target is column 8)

### Assumptions

This is enough data to split and reliably predict if the patient has diabetes, the dataset has only 786 data points
Just these attributes are enough to diagnose the ailment
Similar Problems 
This is very much like some common 2 class classification problems like classifying mail into spam and ham based on the contents of the email. Obviously the attributes there would be strings and not numbers like this dataset, therefore the way in which we process at least some of the features will be different.

In [1]:
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [5]:
X=df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']]
y=df['Outcome']

In [6]:
len(df)

768

In [7]:
sum(df['Outcome'])

268

### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [10]:
log_reg=LogisticRegression()

#### Cross Validation

In [11]:
from sklearn.model_selection import cross_val_score
cross_val_score(log_reg,X,y,cv=4,scoring="recall").mean()



0.5447761194029851

In [12]:
log_reg.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### K-vecinos

In [13]:
from sklearn.neighbors import KNeighborsClassifier 
regk = KNeighborsClassifier()

#### GridSearchCV

In [14]:
from sklearn.model_selection import GridSearchCV

clf_test = GridSearchCV(KNeighborsClassifier(),
                       param_grid={"n_neighbors":np.arange(2,20)},scoring='recall',cv=3)
# Fit will test all of the combinations
clf_test.fit(X,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [15]:
clf_test.best_params_

{'n_neighbors': 9}

In [16]:
clf_test.best_score_

0.5671180880669997

#### Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf_test.best_estimator_,X,y,cv=3,scoring="recall").mean()

0.5670828131502289

### SVM

In [18]:
# Load the library
from sklearn.svm import SVC
# Create an instance of the classifier
clf = SVC(kernel="linear",C=10)
# Fit the data
clf.fit(X,y)
cross_val_score(clf,X,y,scoring="recall").mean()



0.563337494798169

```python
clfSVC=GridSearchCV(SVC(kernel='linear'),param_grid={'C':[1,10,100,1000,100000],'degree':[2,3,4,5]},scoring='recall')
clfSVC.fit(X,y)
```

```python
cross_val_score(clfSVC,X,y,scoring="recall").mean()
```

In [20]:
from sklearn.model_selection import RandomizedSearchCV

In [24]:
clfSVC=RandomizedSearchCV(SVC(),param_distributions={'C':np.arange(5,20.1)},cv=3,n_iter=20,scoring='recall')

In [25]:
cross_val_score(clfSVC,X,y,scoring="recall").mean()









0.0

### Decision Tree

In [54]:
# Import library
from sklearn.tree import DecisionTreeClassifier
# Create instance
clf_tree = DecisionTreeClassifier(min_samples_leaf=35,max_depth=4)
# Fit the data
clf_tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=35, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [55]:
clf_test=GridSearchCV(DecisionTreeClassifier(),param_grid={'min_samples_leaf':np.arange(30,50),'max_depth':np.arange(1,20)},scoring='recall')
clf_test.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
 

In [56]:
clf_test.best_params_

{'max_depth': 4, 'min_samples_leaf': 35}

In [57]:
cross_val_score(clf_test.best_estimator_,X,y,cv=3,scoring="recall").mean()

0.6422388680815647

## Ahora hacemos el Ensemble

In [61]:
from sklearn.ensemble import RandomForestClassifier

In [63]:
clf_rf=RandomForestClassifier(max_depth=3,
                               min_samples_leaf=3,
                               n_estimators=100,
                               n_jobs=-1)

```python
clf_grid_rf=GridSearchCV(clf_rf,param_grid={'max_depth':np.arange(2,20),'min_samples_leaf':np.arange(10,50)},cv=3,scoring='recall')
```

```python
clf_grid_rf.fit(X_train,y_train)
```

```python
cross_val_score(clf_grid_rf.best_estimator_,X,y,cv=3,scoring="recall").mean()
```