In [63]:
import numpy as np
import pandas as pd
from sklearn import linear_model

In [73]:
def create_synthetic_dataset(N, shuffle, seed=None):
    """Build a toy pain-reliever dataset with and without a neutral class.

    Construction, in order:

    1. ``prescription`` starts as half 'ibuprofen' / half 'acetominophen'
       and is shuffled, i.e. the label is arbitrary for ordinary patients.
    2. ``prescription_with_neutral`` starts as 'Neutral' everywhere.
    3. The first ``N // 10`` rows are flagged ``jaundice`` and forced to
       'ibuprofen' in both label columns.
    4. The rows from ``(9 * N) // 10`` onward are flagged ``ulcers`` and
       forced to 'acetominophen' in both label columns.

    Parameters
    ----------
    N : int
        Number of rows to generate.
    shuffle : bool
        If True, return the rows in random order with a fresh 0..N-1 index.
        If False, rows stay in construction order (jaundice rows first,
        ulcer rows last).
    seed : int or None, optional
        Seed for a private ``numpy.random.RandomState`` that drives both the
        label shuffle and the optional row shuffle. ``None`` (the default)
        keeps the previous behaviour of drawing from NumPy's global random
        state.

    Returns
    -------
    pandas.DataFrame
        Columns ``jaundice``, ``ulcers``, ``prescription`` and
        ``prescription_with_neutral``.
    """
    print("---------------- array and neutral class creation ----------------")

    # A private generator makes the result reproducible without touching the
    # caller's global NumPy random state; None means "use the global state".
    rng = None if seed is None else np.random.RandomState(seed)

    history_end = N // 10        # rows [0, history_end) have a liver history
    ulcer_start = (9 * N) // 10  # rows [ulcer_start, N) have stomach problems

    # Arbitrary 50/50 split between the two drugs, in random order.
    prescription = np.full(N, fill_value='acetominophen', dtype='U20')
    prescription[:N // 2] = 'ibuprofen'
    if rng is None:
        np.random.shuffle(prescription)
    else:
        rng.shuffle(prescription)

    # Alternative labelling: the arbitrary cases collapse into one class.
    p_neutral = np.full(N, fill_value='Neutral', dtype='U20')

    # Patients with a history of liver disease always get ibuprofen.
    jaundice = np.zeros(N, dtype=bool)
    jaundice[:history_end] = True
    prescription[:history_end] = 'ibuprofen'
    p_neutral[:history_end] = 'ibuprofen'

    # Patients with a history of stomach problems always get acetominophen.
    ulcers = np.zeros(N, dtype=bool)
    ulcers[ulcer_start:] = True
    prescription[ulcer_start:] = 'acetominophen'
    p_neutral[ulcer_start:] = 'acetominophen'

    df = pd.DataFrame({
        'jaundice': jaundice,
        'ulcers': ulcers,
        'prescription': prescription,
        'prescription_with_neutral': p_neutral,
    })

    if shuffle:
        # random_state=None makes pandas fall back to NumPy's global state.
        return df.sample(frac=1, random_state=rng).reset_index(drop=True)
    return df
        

In [81]:
# Unshuffled build: rows stay in construction order, so the jaundice rows sit
# at the start of the frame and the ulcer rows at the end.
df = create_synthetic_dataset(N=1000, shuffle=False)

---------------- array and neutral class creation ----------------


In [82]:
# Fit one logistic regression per labelling scheme on the first 80% of rows
# and score it on the remaining 20%.
# The split is purely positional (.iloc), so it does not depend on df having
# a default 0..N-1 index; the split point and feature frames are the same for
# both labels and are therefore computed once.
features = ['jaundice', 'ulcers']
ntrain = 8*len(df)//10 # 80% of data for training
X_train = df[features].iloc[:ntrain]
X_test = df[features].iloc[ntrain:]

for label in ['prescription', 'prescription_with_neutral']:
    y_train = df[label].iloc[:ntrain]
    y_test = df[label].iloc[ntrain:]
    lm = linear_model.LogisticRegression()
    lm = lm.fit(X_train, y_train)
    acc = lm.score(X_test, y_test)
    print('label={} accuracy={}'.format(label, acc))


label=prescription accuracy=0.215
label=prescription_with_neutral accuracy=0.5


In [83]:
# Same experiment as the unshuffled run, but with the rows shuffled so the
# jaundice and ulcer rows are spread over both the train and test portions.
df = create_synthetic_dataset(1000, True)
print(df)

# Positional 80/20 split (.iloc) that does not rely on the index labels; the
# split point and feature frames are shared by both labelling schemes.
features = ['jaundice', 'ulcers']
ntrain = 8*len(df)//10 # 80% of data for training
X_train = df[features].iloc[:ntrain]
X_test = df[features].iloc[ntrain:]

for label in ['prescription', 'prescription_with_neutral']:
    y_train = df[label].iloc[:ntrain]
    y_test = df[label].iloc[ntrain:]
    lm = linear_model.LogisticRegression()
    lm = lm.fit(X_train, y_train)
    acc = lm.score(X_test, y_test)
    print('label={} accuracy={}'.format(label, acc))


---------------- array and neutral class creation ----------------
     jaundice  ulcers   prescription prescription_with_neutral
0       False   False      ibuprofen                   Neutral
1       False   False      ibuprofen                   Neutral
2       False   False  acetominophen                   Neutral
3       False   False      ibuprofen                   Neutral
4       False   False      ibuprofen                   Neutral
..        ...     ...            ...                       ...
995     False   False      ibuprofen                   Neutral
996     False   False      ibuprofen                   Neutral
997     False    True  acetominophen             acetominophen
998     False   False      ibuprofen                   Neutral
999     False   False  acetominophen                   Neutral

[1000 rows x 4 columns]
label=prescription accuracy=0.595
label=prescription_with_neutral accuracy=1.0


## In the real world

In [88]:
%%bigquery
-- NOTE(review): this cell needs the BigQuery cell magic to be registered by an
-- earlier cell (typically: load_ext google.cloud.bigquery -- confirm the
-- extension name for the installed client library). Without it IPython
-- rejects the cell with a "Cell magic not found" UsageError.
--
-- Two-class baseline: logistic regression that predicts a binary health label
-- derived from the 1-minute Apgar score.
CREATE OR REPLACE MODEL mlpatterns.neutral_2classes
OPTIONS (
  model_type = 'logistic_reg',
  input_label_cols = ['health']
) AS

SELECT
  -- A score of 9 or 10 is labelled Healthy; everything else NeedsAttention.
  CASE
    WHEN apgar_1min >= 9 THEN 'Healthy'
    ELSE 'NeedsAttention'
  END AS health,
  plurality,
  mother_age,
  gestation_weeks,
  ever_born
FROM `bigquery-public-data.samples.natality`
-- Keep only rows with a score of at most 10 (presumably drops sentinel or
-- unknown codes above the Apgar scale -- confirm against the dataset docs).
WHERE apgar_1min <= 10

UsageError: Cell magic `%%bigquery` not found.
