In [1]:
import pandas as pd
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
import pydotplus

from IPython.display import Image

diabetes_df = pd.read_csv("diabetes.csv")
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

#Standardize
sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

### Tuning Model to optimize

#### Updated the max_depth to 5 (changed between 0-10 to make sure I selected optimized value)

In [30]:
# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 5,random_state=42)

In [31]:
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [32]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[83 17]
 [14 40]]


In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.83      0.84       100
           1       0.70      0.74      0.72        54

    accuracy                           0.80       154
   macro avg       0.78      0.79      0.78       154
weighted avg       0.80      0.80      0.80       154



In [70]:
from graphviz import Digraph
import pydotplus

dot_data = tree.export_graphviz(
    model, out_file =None, feature_names=X.columns, class_names = ["0","1"], filled = True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

ModuleNotFoundError: No module named 'graphviz'

### Including oversampling in preprocessing before running model

#### The recall and precision decreased here so I'm pretty sure I did something obviously wrong, but I don't have time right now to investigate. If you see where I went wrong and feel like adding a comment, that would be awesome.

In [77]:
#our previous model had a recall of .5
#Following code taken from in-class discussion/lecture


#Setting X and y values
X = diabetes_df.drop('Outcome',axis = 1)
y = diabetes_df['Outcome']

#Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

#Standardize
sc = StandardScaler()
X_train_scaler = sc.fit_transform(X_train)
X_test_scaler = sc.fit_transform(X_test)

In [78]:
#Resample the training data with RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state = 42)
X_resampled, y_resampled = ros.fit_resample(X_train_scaler, y_train)

In [79]:
#train using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [96]:
# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 4,random_state=42)

In [97]:
model = model.fit(X_resampled, y_resampled)
y_pred_resampled = model.predict(X_test)

In [98]:
cm = confusion_matrix(y_test, y_pred_resampled)
print(cm)

[[49 51]
 [21 33]]


In [99]:
print(classification_report(y_test, y_pred_resampled))

              precision    recall  f1-score   support

           0       0.70      0.49      0.58       100
           1       0.39      0.61      0.48        54

    accuracy                           0.53       154
   macro avg       0.55      0.55      0.53       154
weighted avg       0.59      0.53      0.54       154



### 2. Create a function that accepts an array of names and returns a string formatted as a list of names separated by commas EXCEPT for the last two names, which are separated by an ampersand (and sign - &)
#### Example input:
#### [ {'name': 'Nichole'}, {'name': 'Tanisha'}, {'name': 'Maggie'} ] Example output:
#### Nichole, Tanisha & Maggie

In [106]:
def name_combo(n_array):
    name_list=[]
    for n in n_array:
        name = n['name']
        name_list.append(name)
    if len(name) == 1:
        return name
    elif len(name) == 2:
        return(name[0] + " & " + name[1])
    #print(name_list)
    else:
        name_len_minus1 = len(name_list)-1
        name_len_minus2 = len(name_list)-2
        #print(name)
        new_list=[]
        for a in name_list:
            if name_list.index(a) < name_len_minus2:
                new_list.append(a + ", ")
            if name_list.index(a) == name_len_minus2:
                new_list.append(a + " & ")
            elif name_list.index(a) == name_len_minus1:
                new_list.append(a)
        return ''.join(new_list)

name_array =[{'name': 'Nichole'}, {'name': 'Tanisha'}, {'name': 'Maggie'}]       
print(name_combo(name_array))

name_array_test = [{'name': 'Nichole'}, {'name': 'Tanisha'}, {'name': 'Maggie'}, {'name': 'Jenny'}]
print(name_combo(name_array_test))

name_array_test2 = [{'name': 'Nichole'}]
print(name_combo(name_array_test2))

name_array_test3 = [{'name': 'Nichole'}, {'name': 'Tanisha'}]
print(name_combo(name_array_test3))

Nichole, Tanisha & Maggie
Nichole, Tanisha, Maggie & Jenny
Nichole
Nichole & Tanisha
