# Core learning algorithms
This "chapter" covers linear regression: we train a linear classifier on the Titanic dataset to predict passenger survival.



In [1]:
# import stuffs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow.compat.v2.feature_column as fc


## Get data
The code below is what I found on GitHub:  
https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/structured/titanic.py

In [2]:
import tensorflow as tf
from tensorflow_datasets.core import lazy_builder_import

# Wrap the dataset name 'titanic' in tensorflow_datasets' LazyBuilderImport helper.
# NOTE(review): presumably the real builder is only imported on first use -- confirm
# against the tensorflow_datasets source. The cells shown below load the CSV files
# directly and do not use this object.
Titanic = lazy_builder_import.LazyBuilderImport('titanic')
type(Titanic) # tensorflow_datasets.core.lazy_builder_import.LazyBuilderImport

  from .autonotebook import tqdm as notebook_tqdm


tensorflow_datasets.core.lazy_builder_import.LazyBuilderImport

This is what the instructor used. The data is hosted remotely:  
https://storage.googleapis.com/tf-datasets/titanic/train.csv  
https://storage.googleapis.com/tf-datasets/titanic/eval.csv  
To avoid downloading the files each time I run the notebook, I saved local copies of the CSVs.

In [3]:
# load dataset from the locally downloaded CSV copies
dftrain = pd.read_csv("./data/titanic/train.csv") # training data
dfeval = pd.read_csv("./data/titanic/eval.csv") # evaluation (held-out) data
# pop() removes the "survived" column from the frame in place and returns it,
# so y_train / y_eval hold the labels while dftrain / dfeval keep only the features.
# Because pop() mutates the frames, the two read_csv calls above must be re-run
# together with these lines (a second pop on the same frame raises KeyError).
y_train = dftrain.pop("survived") # "survived" is a column's name
y_eval = dfeval.pop("survived")
# take a look at the first rows of the remaining feature columns
print(dftrain.head())

      sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  


For a quick statistical summary of the data, use pandas' `.describe()` method.

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max); the output below covers the numeric columns only.
dftrain.describe()

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


## Feature columns
There are two kinds of columns: numeric and categorical.  
Categorical data takes one of a fixed set of values (e.g. `sex`, `class`); in this notebook every column except `age` and `fare` is treated as categorical.  
Before training the model, we need to encode categorical data as numeric data.  
TensorFlow can do this automatically.  

In [5]:
# Start from every column name. list(dftrain.columns) gives a flat list of
# names, whereas [dftrain.columns] would wrap the whole pandas Index object
# in a one-element list.
CATEGORICAL_COLUMNS = list(dftrain.columns)
print("Column names are %s" % CATEGORICAL_COLUMNS)

# Move the continuous features out of the categorical list and into the
# numeric list; whatever remains is treated as categorical.
NUMERIC_COLUMNS = []
for numeric_name in ('age', 'fare'):
    CATEGORICAL_COLUMNS.remove(numeric_name)
    NUMERIC_COLUMNS.append(numeric_name)
print("Categorical columns are %s" % CATEGORICAL_COLUMNS)
print("Numeric columns are %s" % NUMERIC_COLUMNS)
print("  ")

# One vocabulary-backed column per categorical feature. The vocabulary is the
# set of distinct values seen in the training data for that column; this is
# where TensorFlow takes over the categorical -> numeric encoding.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, dftrain[column_name].unique()
    )
    for column_name in CATEGORICAL_COLUMNS
]

# Numeric features are passed through as float32 values.
feature_columns.extend(
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in NUMERIC_COLUMNS
)

print(' ')
print(feature_columns)

# WARNING:tensorflow:From C:\Users\eziod\AppData\Local\Temp\ipykernel_3880\535747530.py:21: categorical_column_with_vocabulary_list (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
# Instructions for updating:
# Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
# WARNING:tensorflow:From C:\Users\eziod\AppData\Local\Temp\ipykernel_3880\535747530.py:24: numeric_column (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
# Instructions for updating:
# Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


Column names are ['sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']
Categorical columns are ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
Numeric columns are ['age', 'fare']
  
Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
 
[VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=

## Batch and epochs
- Batch: the number of training examples used in one iteration, i.e. how many examples we feed to the model at once.
- Epochs: how many times the model sees the whole dataset. E.g. with 10 epochs, the model sees the same dataset 10 times.
  
## Input function
We need to build an input function to feed the model.

In [14]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32, shuffle_buffer_size=1000):
    """Build an input function that feeds features and labels to an estimator.

    Args:
        data_df: Feature table; converted with ``dict(data_df)`` so each
            column becomes one named feature.
        label_df: Labels, aligned row by row with ``data_df``.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: Whether to randomize the order of the examples.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the buffer used by ``Dataset.shuffle``.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        A zero-argument function that builds and returns the
        ``tf.data.Dataset`` each time it is called.
    """
    def input_function():
        # Pair every feature row with its label.
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(shuffle_buffer_size) # randomize order of data
        # Split into batches first, then repeat the batched data once per epoch.
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds # the whole batched, repeated dataset (not a single batch)
    return input_function

# Training input: the defaults apply (10 epochs, shuffled, batches of 32).
train_input_fn = make_input_fn(dftrain, y_train)
# make_input_fn returns the inner function itself, not a dataset.
print("Type of input_function is %s" % type(train_input_fn))
# Evaluation input: a single pass over the data in its original row order.
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

Type of ds (input_function) is <class 'function'>


## Train the model

In [15]:
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns) # "est" is short for estimator

linear_est.train(train_input_fn) # train; note we pass the input function itself, not its result
result = linear_est.evaluate(eval_input_fn) # metrics computed on the evaluation data

clear_output() # clear what the calls above printed so only the summary below is shown
print("Type of linear_est is %s " % type(linear_est))
print("Type of result is %s " % type(result))
print("The accuracy is %s " % str(result['accuracy'])) # the result variable is simply a dict of stats about our model

# There is a warning message telling me to use tf.keras instead.

# It takes 4 or 5 seconds to run this chunk of code.
# The accuracy varies from run to run (presumably because the training data is shuffled without a fixed seed -- TODO confirm).

Type of linear_est is <class 'tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2'> 
Type of result is <class 'dict'> 
The accuracy is 0.75757575 


A bit of information about the code above.  
`tf.estimator` is an API for training and evaluating TensorFlow models. It is deprecated.  

Explore the variable "result".



In [18]:
result # the dict of evaluation metrics returned by linear_est.evaluate()

accuracy 0.75757575
accuracy_baseline 0.625
auc 0.8345577
auc_precision_recall 0.7932865
average_loss 0.48431924
label/mean 0.375
loss 0.47795072
precision 0.6576577
prediction/mean 0.42756692
recall 0.7373737
global_step 200


## Predict
We still need to use an input function to feed the model.

In [23]:
# Collect all predictions into a list; each element is a dict describing one passenger.
# NOTE: this rebinds `result`, replacing the metrics dict returned by evaluate() earlier.
result = list(linear_est.predict(eval_input_fn))
clear_output()
# Passenger to inspect. eval_input_fn was built with shuffle=False, so
# prediction i lines up with row i of dfeval / y_eval.
i = 3
print('dfeval (information about this passenger):')
print(dfeval.loc[i])
print('\ny_eval (the correct answer):')
print(y_eval.loc[i])
print('\nthe result (the probability that this passenger survives):')
print(result[i]['probabilities'][1]) # index 1 = probability of class 1 (survived)

dfeval (information about this passenger):
sex                        female
age                          55.0
n_siblings_spouses              0
parch                           0
fare                         16.0
class                      Second
deck                      unknown
embark_town           Southampton
alone                           y
Name: 3, dtype: object

y_eval (the correct answer):
1

the result (the probability that this passenger survives):
0.70570225


In [35]:
# Look inside one element of the new "result" list: list every key of the
# prediction together with its value and the value's type.
i = 3
prediction = result[i]
print(type(prediction)) # dict
print('\nk v pair:')
for key, value in prediction.items():
    # sep reproduces the "<space><3 tabs><space>" gap between the three fields
    print(key, value, type(value), sep=' \t\t\t ')

<class 'dict'>

k v pair:
logits 			 [0.8746014] 			 <class 'numpy.ndarray'>
logistic 			 [0.70570225] 			 <class 'numpy.ndarray'>
probabilities 			 [0.29429775 0.70570225] 			 <class 'numpy.ndarray'>
class_ids 			 [1] 			 <class 'numpy.ndarray'>
classes 			 [b'1'] 			 <class 'numpy.ndarray'>
all_class_ids 			 [0 1] 			 <class 'numpy.ndarray'>
all_classes 			 [b'0' b'1'] 			 <class 'numpy.ndarray'>
