## Confidentiality

The programmatic cases in this notebook are adapted from various internet resources (in this notebook mainly from kaggle.com) and are for demonstration purposes only.

Please do not copy or distribute this notebook.

## Table of contents

Census Income Data

1. Programmatic case 1 
2. Programmatic case 2
3. Programmatic case 3

## Previous knowledge

For a good understanding of this notebook you should have a few years of data-science and programming experience and have studied the advanced programming notebooks.

## Introduction

A supercase is a case for a dataset on which multiple data-science methods and techniques can be applied.
There is no predefined goal. The goal is to explore cases for the dataset with multiple data-science methods, techniques and programs.
The goal is to build more specific cases with specific goals. A supercase contains information to build multiple new specific cases. 



#### Programmatic case 1

In [None]:
######################################################################################################################
######################################################################################################################
##1) Prediction 1 - Advanced Program 

#1.1) Keras regression

In [None]:
# Use seaborn for pairplot
!pip install -q seaborn

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

print(tf.__version__)


# Location of the census CSV file read by this case.
test_path = '/content/census_data.csv'

raw_dataset = pd.read_csv(test_path)

# Work on a copy so the frame as loaded stays available for inspection.
dataset = raw_dataset.copy()
dataset.tail()

# Count the missing values per column, then drop every incomplete row.
dataset.isna().sum()

dataset = dataset.dropna()

# NOTE(review): the other cases read an 'income_level_cat' column from this
# same file; it looks like the string form of the 'income_level_num' target.
# Left in place it would be one-hot encoded below and leak the label into the
# features, so it is removed here. errors='ignore' keeps this a no-op when the
# column is absent. Confirm against the actual CSV schema.
dataset = dataset.drop(columns=['income_level_cat'], errors='ignore')

# Turn the numeric education level into a categorical label so it is one-hot
# encoded. Every level gets a label ('level_1', 'level_2', ...); a fixed
# three-entry mapping would turn all other levels into NaN right after dropna.
dataset['education-num'] = 'level_' + dataset['education-num'].astype(int).astype(str)

# One-hot encode all string columns. dtype=float keeps the frame purely
# numeric so it can be converted with np.array() and fed to the normalizer
# (newer pandas versions otherwise produce boolean dummy columns).
dataset = pd.get_dummies(dataset, prefix='', prefix_sep='', dtype=float)
dataset.tail()

# Hold out 20% of the rows for testing: sample 80% for training with a fixed
# seed, the test set is every row whose index was not sampled.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# Pairwise scatter plots (KDE on the diagonal) of the target and three numeric columns.
sns.pairplot(train_dataset[['income_level_num', 'hours-per-week', 'capital-gain', 'capital-loss']], diag_kind='kde')

# Summary statistics, one row per column.
train_dataset.describe().transpose()

# Copy the splits before separating the label so train_dataset/test_dataset stay intact.
train_features = train_dataset.copy()
test_features = test_dataset.copy()

# pop() removes the target column from the feature frames and returns it as the label series.
train_labels = train_features.pop('income_level_num')
test_labels = test_features.pop('income_level_num')

# Mean and standard deviation per column (shows how different the feature scales are).
train_dataset.describe().transpose()[['mean', 'std']]

# Normalization layer applied to the full feature matrix.
normalizer = preprocessing.Normalization()

# adapt() computes the per-feature statistics from the training features.
normalizer.adapt(np.array(train_features))

# Learned per-feature means.
print(normalizer.mean.numpy())

# First training row, kept 2-D (one row) so it can be passed through the layer.
first = np.array(train_features[:1])

# Show the first row before and after normalization with compact float formatting.
with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalizer(first).numpy())
  
  
# Single-feature linear regression: predict income_level_num from Age only.
Age = np.array(train_features['Age'])

# Dedicated normalization layer for the single Age input, adapted on the training values.
Age_normalizer = preprocessing.Normalization(input_shape=[1,])
Age_normalizer.adapt(Age)


# Normalization followed by one Dense unit.
Age_model = tf.keras.Sequential([
    Age_normalizer,
    layers.Dense(units=1)
])

Age_model.summary()

# Run the untrained model on the first 10 values as a smoke test.
Age_model.predict(Age[:10])

Age_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error')


history = Age_model.fit(
    train_features['Age'], train_labels,
    epochs=100,
    # suppress logging
    verbose=0,
    # Calculate validation results on 20% of the training data
    validation_split = 0.2)


# Per-epoch metrics as a DataFrame, with the epoch number added as a column.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()


def plot_loss(history, y_max=10):
  """Plot the training and validation loss curves of a fitted model.

  Args:
    history: Object returned by ``model.fit``; its ``history`` dict must
      contain the 'loss' and 'val_loss' series (i.e. the model was fitted
      with validation data).
    y_max: Upper limit of the y-axis. Defaults to 10; pass a smaller value
      when the losses are far below 10 and the curves would otherwise be
      squashed against the x-axis.
  """
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  plt.ylim([0, y_max])
  plt.xlabel('Epoch')
  plt.ylabel('Error [income_level_num]')
  plt.legend()
  plt.grid(True)
  
  
# Loss curves of the single-feature Age model.
plot_loss(history)

# Collects the test-set loss of every model, keyed by model name.
test_results = {}

test_results['Age_model'] = Age_model.evaluate(
    test_features['Age'],
    test_labels, verbose=0)

# Predict on an evenly spaced grid of 251 values from 0 to 250 to draw the fitted line.
x = tf.linspace(0.0, 250, 251)
y = Age_model.predict(x)


def plot_Age(x, y):
  """Draw the training data and a model's predictions against Age.

  The scatter uses the module-level ``train_features`` / ``train_labels``;
  ``x`` and ``y`` are the grid and the matching predictions drawn as a
  black line on top.
  """
  observed_age = train_features['Age']
  plt.scatter(observed_age, train_labels, label='Data')
  plt.plot(x, y, label='Predictions', color='k')
  plt.ylabel('income_level_num')
  plt.xlabel('Age')
  plt.legend()
  
# Fitted line of the single-feature Age model over the training data.
plot_Age(x,y)


# Linear regression on all features: full-feature normalizer followed by one Dense unit.
linear_model = tf.keras.Sequential([
    normalizer,
    layers.Dense(units=1)
])

# Run the untrained model on the first 10 rows as a smoke test.
linear_model.predict(train_features[:10])

# Weight matrix of the Dense layer.
linear_model.layers[1].kernel

linear_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error')


history = linear_model.fit(
    train_features, train_labels, 
    epochs=100,
    # suppress logging
    verbose=0,
    # Calculate validation results on 20% of the training data
    validation_split = 0.2)


plot_loss(history)

test_results['linear_model'] = linear_model.evaluate(
    test_features, test_labels, verbose=0)
	
def build_and_compile_model(norm):
  """Return a compiled regression network that starts with ``norm``.

  The network is the given normalization layer, two 64-unit ReLU layers and
  a single linear output unit, compiled with mean absolute error loss and
  Adam at a learning rate of 0.001.
  """
  hidden_stack = [layers.Dense(64, activation='relu') for _ in range(2)]
  model = keras.Sequential([norm, *hidden_stack, layers.Dense(1)])

  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  model.compile(optimizer=optimizer, loss='mean_absolute_error')
  return model
  
# Deep network on the single Age feature; reuses the already adapted Age_normalizer.
dnn_Age_model = build_and_compile_model(Age_normalizer)

dnn_Age_model.summary()

history = dnn_Age_model.fit(
    train_features['Age'], train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

plot_loss(history)

# Same 0..250 grid as used for the linear Age model, to draw the fitted curve.
x = tf.linspace(0.0, 250, 251)
y = dnn_Age_model.predict(x)

plot_Age(x, y)

test_results['dnn_Age_model'] = dnn_Age_model.evaluate(
    test_features['Age'], test_labels,
    verbose=0)

# Deep network on all features; reuses the full-feature normalizer.
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

history = dnn_model.fit(
    train_features, train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

plot_loss(history)

test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)

# Test-set loss of every model so far, one row per model.
pd.DataFrame(test_results, index=['Mean absolute error [income_level_num]']).T

# Predictions of the full DNN on the test set, flattened to 1-D.
test_predictions = dnn_model.predict(test_features).flatten()

# Predicted vs. true values on equal-aspect axes; the diagonal marks a perfect prediction.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [income_level_num]')
plt.ylabel('Predictions [income_level_num]')
# NOTE(review): the fixed [0, 50] range may not match the value range of
# income_level_num - confirm against the data and adjust if the points
# end up clustered in one corner.
lims = [0, 50]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)


# Distribution of the prediction errors (prediction minus true value).
# NOTE(review): no new figure is created here, so outside a notebook with
# separate cells this presumably draws onto the axes above - confirm.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [income_level_num]')
_ = plt.ylabel('Count')


# Persist the trained full-feature DNN under the path 'dnn_model'.
dnn_model.save('dnn_model')

# Load it back and evaluate it on the same test data, so its loss can be
# compared with the 'dnn_model' entry in the results table.
reloaded = tf.keras.models.load_model('dnn_model')

test_results['reloaded'] = reloaded.evaluate(
    test_features, test_labels, verbose=0)

# Final comparison table including the reloaded model.
pd.DataFrame(test_results, index=['Mean absolute error [income_level_num]']).T


#### Programmatic case 2

In [None]:
######################################################################################################################
######################################################################################################################
##2) Prediction 2 - Advanced Program 

#2.1) DNN Classifier 

In [None]:

import tensorflow as tf
import pandas as pd

# Location of the census CSV file read by this case.
path = '/content/census_data.csv'

train = pd.read_csv(path)

train

# Columns used by this case: three numeric features plus the string label.
CSV_COLUMN_NAMES = ['Age', 'education-num', 'hours-per-week', 'income_level_cat']
# Class names, indexed by the integer class id when printing predictions.
# NOTE(review): assumes the label strings in the file are exactly these two
# values, so that the category codes below line up with this list - verify.
INCOME_LEVEL = ['<50K', '>50K']

# Select the columns once via CSV_COLUMN_NAMES and take an explicit copy:
# adding a column to a plain selection of another frame is chained assignment
# and triggers pandas' SettingWithCopyWarning.
train = train[CSV_COLUMN_NAMES].copy()

train.head()

train.dtypes

# Encode the string label as integer class ids in a new column:
# first as a pandas categorical, then replaced by its integer codes.
train["income_level_cat2"] = train["income_level_cat"].astype('category')
train.dtypes

train["income_level_cat2"] = train["income_level_cat2"].cat.codes
train.head()


# numpy is needed for the int32 label cast below; import it here so this
# cell does not depend on an import that only happens further down.
import numpy as np
from sklearn.model_selection import train_test_split
# split into train test sets (70% train, 30% test)
X = train
X_train, X_test= train_test_split(X, train_size=0.70)
print(X_train.shape, X_test.shape)

# split features and dependent: the integer class ids are the labels,
# cast to int32 for the estimator input pipeline
train_y = X_train["income_level_cat2"].astype(np.int32)
test_y = X_test["income_level_cat2"].astype(np.int32)

# The features are everything except the two label columns (string and encoded).
train = X_train.drop(["income_level_cat","income_level_cat2"], axis=1)
test = X_test.drop(["income_level_cat","income_level_cat2"], axis=1)

train.head()


import numpy as np
def input_evaluation_set():
    """Return a tiny hand-written (features, labels) sample.

    ``features`` maps each of the three feature names to a two-element
    array; ``labels`` holds two int32 class ids, both 0.
    """
    sample_values = (
        ('Age', [39, 38]),
        ('education-num', [13, 9]),
        ('hours-per-week', [40, 40]),
    )
    features = {name: np.array(values) for name, values in sample_values}
    labels = np.zeros(2, dtype=np.int32)
    return features, labels
	

def input_fn(features, labels, training=True, batch_size=256):
    """Build the batched tf.data pipeline used for training or evaluation.

    The features are converted to a dict and sliced together with the
    labels. In training mode the examples are shuffled (buffer of 1000)
    and repeated indefinitely before batching; otherwise they are only
    batched.
    """
    examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    if not training:
        return examples.batch(batch_size)

    return examples.shuffle(1000).repeat().batch(batch_size)
	
# Feature columns describe how to use the input: one numeric column per
# column of the training feature frame.
my_feature_columns = [
    tf.feature_column.numeric_column(key=key) for key in train.keys()
]

# Building a DNN with 2 hidden layers with 30 and 10 hidden nodes each.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 30 and 10 nodes respectively.
    hidden_units=[30, 10],
    # The model must choose between the 2 income classes.
    n_classes=2)
	
train.head()

# Training the Model.
# The input function is wrapped in a lambda because the estimator expects a
# zero-argument callable; training stops after 5000 steps.
classifier.train(
    input_fn=lambda: input_fn(train, train_y, training=True),
    steps=5000)

# Evaluate on the held-out test split (no shuffling, no repeat).
eval_result = classifier.evaluate(
    input_fn=lambda: input_fn(test, test_y, training=False))

# eval_result is unpacked as keyword arguments; only its 'accuracy' entry is printed.
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

# Generating predictions from the model
# Two unlabeled examples (one per list position) and the label expected for each.
# NOTE(review): these feature values look like placeholders rather than
# realistic ages / education levels / weekly hours, and the expected labels
# are not derived from the data - confirm or replace with real examples.
expected = ['<50K', '>50K']
predict_x = {
    'Age': [5.1, 5.9],
    'education-num': [3.3, 3.0],
    'hours-per-week': [1.7, 4.2],
}

def input_fn(features, batch_size=256):
    """Build the batched, label-free tf.data pipeline used for prediction.

    Note that this definition replaces the earlier training/evaluation
    ``input_fn`` of the same name from this point on.
    """
    feature_dict = dict(features)
    unlabeled = tf.data.Dataset.from_tensor_slices(feature_dict)
    return unlabeled.batch(batch_size)

# One line per example: predicted class name, its probability in percent,
# and the label that was expected.
report = 'Prediction is "{}" ({:.1f}%), expected "{}"'

predictions = classifier.predict(input_fn=lambda: input_fn(predict_x))

for pred_dict, expec in zip(predictions, expected):
    class_id = pred_dict['class_ids'][0]
    confidence = 100 * pred_dict['probabilities'][class_id]
    print(report.format(INCOME_LEVEL[class_id], confidence, expec))


In [None]:
#### Programmatic case 3

In [None]:
######################################################################################################################
######################################################################################################################
##3) Prediction 3 - Advanced Program 

#3.1) Tensorflow linear models

In [None]:

!pip install -q sklearn

import numpy as np
import pandas as pd
import json, math, os, sys

import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf

# Location of the census CSV file read by this case.
path = '/content/census_data.csv'

train = pd.read_csv(path)

# Encode the string label as integer class ids in a new column:
# first as a pandas categorical, then replaced by its integer codes.
train["income_level_cat2"] = train["income_level_cat"].astype('category')
train.dtypes

train["income_level_cat2"] = train["income_level_cat2"].cat.codes
train.head()

from sklearn.model_selection import train_test_split
# split into train test sets (70% train, 30% evaluation)
X = train
X_train, X_test= train_test_split(X, train_size=0.70)
print(X_train.shape, X_test.shape)

# split features and dependent
# NOTE(review): only the encoded label is dropped here; the original string
# label column 'income_level_cat' stays in dftrain/dfeval. The feature columns
# defined later in this case do not list it - keep it that way.
y_train = X_train["income_level_cat2"]
dftrain = X_train.drop("income_level_cat2", axis=1)
y_eval = X_test["income_level_cat2"]
dfeval = X_test.drop("income_level_cat2", axis=1)

# Quick look at the training features.
dftrain.head()

dftrain.describe()

# Number of training and evaluation examples.
dftrain.shape[0], dfeval.shape[0]

# Age distribution.
dftrain.Age.hist(bins=20)

# Number of examples per Gender value.
dftrain.Gender.value_counts().plot(kind='barh')

# Number of examples per workclass value.
dftrain['workclass'].value_counts().plot(kind='barh')

# Mean of the encoded label per Gender value.
# NOTE(review): the plotted value is a mean of the class ids (a fraction when
# the ids are 0/1), while the axis label says '%' - confirm the intended unit.
pd.concat([dftrain, y_train], axis=1).groupby('Gender').income_level_cat2.mean().plot(kind='barh').set_xlabel('% income_level_cat2')


# Names of the string-valued and of the numeric input columns.
CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "Clothing", "Gender", "native-country",
]
NUMERIC_COLUMNS = [
    "Age", "education-num", "capital-gain", "capital-loss",
    "hours-per-week",
]

# Categorical columns use the distinct values seen in the training data as vocabulary.
categorical_features = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        name, dftrain[name].unique())
    for name in CATEGORICAL_COLUMNS
]
# Numeric columns are read as float32.
numeric_features = [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
]
# Order matters: all categorical columns first, then all numeric columns.
feature_columns = categorical_features + numeric_features
  
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  """Return a zero-argument function that builds the estimator input dataset.

  The returned function slices ``data_df`` (as a dict of columns) together
  with ``label_df``, optionally shuffles with a buffer of 1000, batches by
  ``batch_size`` and then repeats the batched data ``num_epochs`` times.
  """
  def input_function():
    examples = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    ordered = examples.shuffle(1000) if shuffle else examples
    return ordered.batch(batch_size).repeat(num_epochs)
  return input_function

# Input functions for training (shuffled, default number of epochs) and for
# evaluation (single pass, original order).
train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

# Pull a single batch of 10 examples to inspect what the pipeline yields.
ds = make_input_fn(dftrain, y_train, batch_size=10)()
for feature_batch, label_batch in ds.take(1):
  print('Some feature keys:', list(feature_batch.keys()))
  print()
  print('A batch of class:', feature_batch['workclass'].numpy())
  print()
  print('A batch of Labels:', label_batch.numpy())

# feature_columns holds the categorical columns first and the numeric columns
# after them, each in the order of its name list. Look the positions up by
# name instead of hard-coding them, so that editing either name list cannot
# silently select a different column.
age_column = feature_columns[len(CATEGORICAL_COLUMNS) + NUMERIC_COLUMNS.index('Age')]
# Dense representation of the numeric Age column for the inspected batch.
tf.keras.layers.DenseFeatures([age_column])(feature_batch).numpy()

gender_column = feature_columns[CATEGORICAL_COLUMNS.index('Gender')]
# A categorical column has to be wrapped in an indicator column before
# DenseFeatures can turn it into a dense tensor.
tf.keras.layers.DenseFeatures([tf.feature_column.indicator_column(gender_column)])(feature_batch).numpy()

# Baseline: linear classifier on the base feature columns.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

# Clear the training log output before printing the evaluation metrics.
clear_output()
print(result)

# Crossed feature combining Age and Gender, hashed into 100 buckets.
# NOTE(review): assumes the Age values have a dtype accepted by
# crossed_column - confirm against the CSV.
age_x_gender = tf.feature_column.crossed_column(['Age', 'Gender'], hash_bucket_size=100)

# Retrain with the base columns plus the crossed column; linear_est and
# result are overwritten by this second model.
derived_feature_columns = [age_x_gender]
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns+derived_feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result)

# Predict on the evaluation set and keep, per example, the probability at index 1.
pred_dicts = list(linear_est.predict(eval_input_fn))
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

# Histogram of the predicted probabilities.
probs.plot(kind='hist', bins=20, title='predicted probabilities')

from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

# ROC curve of the predicted probabilities against the evaluation labels;
# the thresholds returned by roc_curve are not used.
fpr, tpr, _ = roc_curve(y_eval, probs)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
# Start both axes at 0 and leave the upper limits automatic.
plt.xlim(0,)
plt.ylim(0,)
