In [2]:
import pickle

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [8]:
# Read the modeling CSV into a DataFrame; later cells refer to it as `df`.
csv_path = 'data/modeling_data/no_sw_mtf.csv'
df = pd.read_csv(csv_path)

In [26]:
# Display the column names of the loaded frame (features plus `binary_drug`).
df.columns

Index(['yr_adminst', 'region', 'msa', 'smsa', 'evr_smoked_cig',
       'cig_month_freq', 'alcohol_lifetime_freq', 'alcohol_yr_freq',
       'alcohol_2weeks', 'sex', 'area_type', 'marital_status', 'has_father',
       'has_mother', 'has_siblings', 'father_educ_lvl', 'mother_educ_lvl',
       'mother_employed', 'political_value_type', 'relig_attd',
       'relig_importance', 'academic_self_rating', 'intelligence_self_rating',
       'school_missed_illness', 'school_missed_ditched', 'school_missed_other',
       'skipped_class', 'avg_grade', 'tech_school_after_hs',
       'military_after_hs', '2yrcoll_after_hs', '4yrcoll_after_hs',
       'gradsch_after_hs', 'desire_tech_school', 'desire_military',
       'desire_2yrcoll', 'desire_4yrcoll', 'desire_gradsch', 'desire_none',
       'work_hrs', 'work_pay', 'other_income', 'rec_time', 'date_freq',
       'drive_freq', '12mo_r_tcktd', '12mo_accidents', 'binary_drug'],
      dtype='object')

In [14]:
# Print the frequency table of every column, one after another, separated by
# blank lines so the individual tables are easy to tell apart.
for column_name in df.columns:
    frequencies = df[column_name].value_counts()
    print(frequencies, '\n\n')

1995    8776
1996    8077
1997    7187
2007    6986
1998    6965
2006    6961
2003    6934
2005    6796
2004    6745
2010    6318
2002    6190
2008    6188
2009    6096
2011    6085
2012    5738
2019    4531
2015    4461
2014    4414
1999    4154
2001    3789
2000    3644
Name: yr_adminst, dtype: int64 


South        51586
Midwest      41042
NorthEast    31374
West          3033
Name: region, dtype: int64 


0    86344
1    40691
Name: msa, dtype: int64 


1    98995
0    28040
Name: smsa, dtype: int64 


Never                64010
1-2x                 25249
Occasionally         16650
Regularly Now        14235
Regularly in Past     6891
Name: evr_smoked_cig, dtype: int64 


None              96914
<1 Cigarettes     12400
1-5 Cigarettes     8856
2 Packs            5067
1 Pack             2995
1.5 Packs           578
2+ Packs            225
Name: cig_month_freq, dtype: int64 


0         30756
40+       27832
10-19X    16308
20-39X    14748
3-5X      14194
6-9X      12665
1-2X      105

In [9]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127035 entries, 0 to 127034
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   yr_adminst                127035 non-null  int64 
 1   region                    127035 non-null  object
 2   msa                       127035 non-null  int64 
 3   smsa                      127035 non-null  int64 
 4   evr_smoked_cig            127035 non-null  object
 5   cig_month_freq            127035 non-null  object
 6   alcohol_lifetime_freq     127035 non-null  object
 7   alcohol_yr_freq           127035 non-null  object
 8   alcohol_2weeks            127035 non-null  object
 9   sex                       127035 non-null  object
 10  area_type                 127035 non-null  object
 11  marital_status            127035 non-null  object
 12  has_father                127035 non-null  object
 13  has_mother                127035 non-null  object
 14  has_

In [16]:
# Target vector: the `binary_drug` column.
y = df["binary_drug"]

# Feature matrix: every remaining column except `smsa`, which is also left out
# of the model inputs (NOTE(review): the reason is not recorded in this notebook).
X = df.drop(columns=["binary_drug", "smsa"])

In [19]:
# Note: no train-test split happens here. The "final" model was already chosen
# based on an earlier train-test split; we want the best possible model, so the
# chosen configuration is fitted on the entire dataset.

# The keyword that makes OneHotEncoder return a dense array is `sparse_output`
# in scikit-learn >= 1.2 and `sparse` in older releases (`sparse` was removed
# in 1.4). Try the current spelling first and fall back to the legacy one, so
# this cell runs unchanged on both old and new scikit-learn versions.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older scikit-learn: `sparse_output` is an unexpected keyword argument.
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Instantiate a pipeline that performs all preprocessing steps: every feature
# column is one-hot encoded (handle_unknown="ignore" keeps categories unseen
# during fitting from raising at prediction time), and the encoded matrix is
# passed to a logistic regression with C=0.1 and up to 10000 iterations.
pipe = Pipeline(steps=[
       ("ohe", ohe),
       ("logistic_model", LogisticRegression(max_iter=10000, C=0.1))])

In [20]:
# Fit the pipeline on the full dataset (no held-out rows): the encoder learns
# its categories from X and the logistic regression is trained on the encoded
# features against y. The fitted pipeline is returned and displayed below.
pipe.fit(X, y)

Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('logistic_model', LogisticRegression(C=0.1, max_iter=10000))])

In [21]:
# Sanity check only: showing the learned coefficients demonstrates that the
# logistic regression step has actually been fitted.
logistic_step = pipe.named_steps["logistic_model"]
print("coefficients")
print(logistic_step.coef_)

# Persist the whole fitted pipeline (encoder + classifier) as a single pickle
with open("src/models/model.pkl", 'wb') as model_file:
    pickle.dump(pipe, model_file)

coefficients
[[-5.32244034e-01 -4.40360028e-01 -3.87914409e-01 -4.86894276e-01
  -4.40103072e-01 -4.01147876e-01 -2.18115751e-01 -1.54255002e-01
  -1.64660110e-01 -1.44238627e-01 -1.81662304e-02 -9.26911163e-02
  -9.88541061e-02 -5.00211355e-02  6.05503536e-02  2.24383662e-01
   3.99503193e-01  4.06440068e-01  6.90329160e-01  7.25483334e-01
   1.12594879e+00 -1.22516601e-01  5.15633538e-02 -4.69261883e-02
   1.20852221e-01  2.21197166e-02 -1.91469311e-02 -1.56055554e-01
  -9.93653371e-01  1.91954660e-01  6.11081114e-01  3.49645937e-01
   1.48650413e-01  3.02527389e-02  9.40603601e-02  4.37542565e-02
   2.07034897e-01 -1.01005086e-01 -4.19774795e-01 -5.09432483e-01
  -3.02252568e-01  1.64224847e-01  3.34104555e-01 -1.39973740e-01
   4.08531539e-01  4.77706355e-02 -1.06198996e+00 -4.66697957e-01
   3.34267973e-01  4.87084168e-01 -9.51064645e-02  6.59355601e-01
   1.46059422e-01 -8.16709748e-02  1.64601497e-01  1.20509580e-01
  -2.38751037e-01 -1.54084195e-02  5.36921404e-02 -4.07305737e-

In [22]:
# Predict on the same X the pipeline was fitted on, so any metric computed
# from `preds` is a training-set figure, not a held-out estimate.
preds = pipe.predict(X)

In [23]:
from sklearn.metrics import accuracy_score, recall_score

In [24]:
# Accuracy of the fitted pipeline on its own training data (y vs. preds);
# this is a fit sanity check, not an estimate of generalization.
accuracy_score(y, preds)

0.8019443460463652

In [25]:
# Recall of the fitted pipeline on its own training data (y vs. preds);
# like the accuracy above it, this is measured on rows the model has seen.
recall_score(y, preds)

0.7076151219918394