In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Let's use the iris dataset 

In [4]:
# Pull seaborn's bundled "iris" example dataset and preview its first two rows.
iris = sns.load_dataset(name='iris')
iris.head(n=2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


#### There are three kinds of irises in the data, and they are evenly represented, as we can see by calling `iris.species.value_counts()`

In [5]:
# Row count per species, to check how balanced the classes are.
iris['species'].value_counts()

virginica     50
versicolor    50
setosa        50
Name: species, dtype: int64

#### If we want to predict whether a particular iris in the dataset is a member of the virginica class, how would we do that?
 - create a label
 - divide into train and test
 - train a classifier
 - evaluate 

In [8]:
# Binary target: 1 for virginica rows, 0 for every other species.
# The comparison is vectorised over the whole column instead of looping in Python;
# the explicit int64 cast keeps the integer dtype independent of the platform default.
iris['virginica_member'] = (iris['species'] == 'virginica').astype('int64')
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,virginica_member
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0


In [9]:
# Class balance of the new binary label (virginica vs. everything else).
iris['virginica_member'].value_counts()

0    100
1     50
Name: virginica_member, dtype: int64

#### [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html) preserves the ratio of classes 
 - create the splitter
 - split the data

In [15]:
# A single stratified 80/20 shuffle; the fixed seed makes the split repeatable.
splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.20,
    random_state=42,
)
# Stratify on the binary label so both subsets keep its class ratio.
splits = splitter.split(X=iris, y=iris['virginica_member'])
train_indices, test_indices = next(splits)
# The indices are used as row positions, hence .iloc.
train, test = iris.iloc[train_indices], iris.iloc[test_indices]

In [16]:
# Sanity check: (rows, columns) of the train and test subsets, one per line.
print(train.shape, test.shape, sep='\n')

(120, 6)
(30, 6)


#### Data for the model must be numeric
 - drop the species column (the label is derived directly from it, so encoding it as a feature would not make sense)
 - keep the original `train`/`test` frames, which still have that column, in case I want to dig into which species were incorrectly classified later

In [17]:
# Copies of each subset without the text 'species' column;
# `train` and `test` themselves are left untouched.
train2 = train.drop('species', axis=1)
test2 = test.drop('species', axis=1)

#### Need to further split each set into predictor variables (`X`) and labels (`y`)

In [18]:
# Predictors (X) are every column except the label; the label column is y.
X_train, y_train = train2.drop('virginica_member', axis=1), train2['virginica_member']
X_test, y_test = test2.drop('virginica_member', axis=1), test2['virginica_member']

#### We'll try the simplest approach first: a logistic regression model
 - instantiate the model
 - fit it to the training data
 - use it to predict with the test data
 - evaluate accuracy