# Machine Learning with Scikit-Learn

#### Import Libraries

In [30]:
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#### Load data from BigQuery table

In [5]:
%%bigquery df
-- Read every column of the census table; the cell magic stores the result in the DataFrame `df`.
SELECT *
FROM `crazy-hippo-01.earnings_ml.census_data`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 371.21query/s] 
Downloading: 100%|██████████| 32461/32461 [00:01<00:00, 20350.12rows/s]


In [36]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,65,?,293385,Preschool,1,Married-civ-spouse,?,Husband,Black,Male,0,0,30,United-States,<=50K
1,64,?,140237,Preschool,1,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,<=50K
2,52,?,248113,Preschool,1,Married-spouse-absent,?,Other-relative,White,Male,0,0,40,Mexico,<=50K
3,54,?,148657,Preschool,1,Married-civ-spouse,?,Wife,White,Female,0,0,40,Mexico,<=50K
4,39,?,362685,Preschool,1,Widowed,?,Not-in-family,White,Female,0,0,20,El-Salvador,<=50K


#### Make Feature Selection and assign Target Column

In [129]:
# Columns used as model inputs; `income` is the column to predict.
feature_columns = [
    'age', 'workclass', 'sex', 'occupation',
    'education_num', 'marital_status', 'relationship', 'capital_gain',
]
X = df[feature_columns]
y = df[['income']]  # one-column DataFrame holding the target

#### Perform ML Data Preparation with One-hot Encoding and Normalization

In [None]:
# One-hot encode the categorical features with pandas get_dummies.
# Prefixes are given as a column -> prefix mapping instead of a positional list, so each
# prefix stays attached to its column even if the feature order changes.
# Note that the 'sex' column is emitted with the 'gender' prefix.
dummy_prefixes = {
    'workclass': 'workclass',
    'sex': 'gender',
    'occupation': 'occupation',
    'marital_status': 'marital_status',
    'relationship': 'relationship',
}
X = pd.get_dummies(X, prefix=dummy_prefixes)

# Min-max scale the numeric features with scikit-learn.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the train/test split,
# so the test rows influence the scaling. Consider fitting it on the training split only.
numeric_columns = ['age', 'education_num', 'capital_gain']
scaler = preprocessing.MinMaxScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

#### Check to see all the features that have been created during Data Preparation

In [181]:
# List every column of the prepared feature frame with its non-null count and dtype
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32461 entries, 0 to 32460
Data columns (total 42 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   age                                    32461 non-null  float64
 1   education_num                          32461 non-null  float64
 2   capital_gain                           32461 non-null  float64
 3   workclass_ ?                           32461 non-null  uint8  
 4   workclass_ Federal-gov                 32461 non-null  uint8  
 5   workclass_ Local-gov                   32461 non-null  uint8  
 6   workclass_ Never-worked                32461 non-null  uint8  
 7   workclass_ Private                     32461 non-null  uint8  
 8   workclass_ Self-emp-inc                32461 non-null  uint8  
 9   workclass_ Self-emp-not-inc            32461 non-null  uint8  
 10  workclass_ State-gov                   32461 non-null  uint8  
 11  wo

#### Perform Data Preparation for the y Labels (they are strings now)

In [131]:
# Change the income label strings into integers so they can be used in model training.
# The labels are read straight from `df` and encoded in a single step: the cell can be
# re-run without failing (y is always rebuilt from the source column) and nothing is
# written into a slice of `df`, which would raise pandas' SettingWithCopyWarning.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['income'])

LabelEncoder()

#### Split data into Training and Test data via Scikit-learn train_test_split function

In [144]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Initialize Logistic Regression classifier and fit (train) to your created data

In [146]:
# Fit a logistic-regression classifier on the training split.
# `multi_class` is intentionally not passed: the parameter is deprecated in recent
# scikit-learn releases, and for a two-class target the default fits the same model as 'ovr'.
# NOTE(review): assumes `income` has exactly two classes (the prediction helper treats the
# labels as 0 vs. non-zero) - confirm before reusing this cell on a multi-class target.
LR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    max_iter=200,
).fit(X_train, y_train)


#### Set up Prediction Function that takes in a number of entries from X_test 

In [163]:
def predict_func(data):
    """Print an income bracket for every row of ``data``.

    The rows are classified with the module-level ``LR`` model. One line is
    printed per prediction: 'Under 50K' when the predicted label equals 0,
    'Over 50K' for any other label. Nothing is returned.
    """
    for label in LR.predict(data):
        print('Under 50K' if label == 0 else 'Over 50K')


In [183]:
# Try the helper on two rows of the test split (slice 10:12)
predict_func(X_test[10:12])

Under 50K
Over 50K


#### Return the mean accuracy on the given test data and labels using the scikit-learn SCORE function

In [151]:
# Score the fitted model on the held-out test split
LR.score(X_test, y_test)

0.8410596026490066