In [1]:
import os, pandas

import tensorflow_data_validation as tfdv

# Root of the raw-content view of the GoogleCloudPlatform/training-data-analyst repository.
rawurl = 'https://raw.githubusercontent.com/GoogleCloudPlatform/training-data-analyst/refs/heads/master/'

# Repository-relative folder that holds the score_train.csv / score_test.csv files.
giturl = 'courses/machine_learning/deepdive2/production_ml/solutions/data/'

# URL segments are always separated by '/'. os.path.join is meant for filesystem
# paths and follows the platform's separator rules, so the URL is assembled
# explicitly instead. The trailing '/' is kept so file names can be appended.
fileurl = rawurl.rstrip('/') + '/' + giturl.strip('/') + '/'

In [2]:
# Load the training split. The file is comma-separated, so read_csv (whose
# default delimiter is ',') replaces read_table(..., sep=','). The URL is built
# with an explicit '/' because os.path.join is a filesystem-path helper.
train = pandas.read_csv(fileurl.rstrip('/') + '/score_train.csv')

# Preview the first five rows.
train.head()

Unnamed: 0,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
0,No,Healthcare,1.0,4.0,Low
1,Yes,Engineer,,3.0,Average
2,Yes,Engineer,1.0,1.0,Low
3,Yes,Lawyer,0.0,2.0,High
4,Yes,Entertainment,,6.0,High


In [3]:
# Last rows of the training frame (DataFrame.tail with its default row count).
train.tail()

Unnamed: 0,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
3995,No,Homemaker,8.0,,Low
3996,Yes,Artist,1.0,2.0,Low
3997,Yes,Artist,6.0,2.0,Average
3998,No,Lawyer,0.0,2.0,High
3999,Yes,Artist,0.0,,Average


In [4]:
# Missing-cell count for each column of the training frame.
null = train.isna().sum()

# Grand total of missing cells over all columns.
null.sum()

672

In [5]:
# Load the test split. The file is comma-separated, so read_csv (whose default
# delimiter is ',') replaces read_table(..., sep=','). The URL is built with an
# explicit '/' because os.path.join is a filesystem-path helper.
test = pandas.read_csv(fileurl.rstrip('/') + '/score_test.csv')

# Preview the first five rows.
test.head()

Unnamed: 0,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
0,No,Doctor,0.0,5.0,Average
1,Yes,Entertainment,1.0,4.0,Average
2,No,Lawyer,0.0,5.0,Low
3,Yes,Executive,1.0,5.0,High
4,Yes,Artist,1.0,2.0,Average


In [6]:
# Last rows of the test frame (DataFrame.tail with its default row count).
test.tail()

Unnamed: 0,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
4063,No,,0.0,7.0,Low
4064,No,Executive,3.0,4.0,Low
4065,Yes,Healthcare,1.0,1.0,Low
4066,Yes,Healthcare,1.0,4.0,Low
4067,Yes,Executive,0.0,3.0,Average


In [7]:
# Missing-cell count for each column of the test frame.
null = test.isna().sum()

# Grand total of missing cells over all columns.
null.sum()

694

`Generate & Visualize Train & Test Stats`

In [8]:
from tensorflow_data_validation import generate_statistics_from_dataframe

from tensorflow_data_validation import visualize_statistics

# Compute TFDV statistics from the training frame.
train_stats = generate_statistics_from_dataframe(train)

# Visualize the statistics of the training split on its own.
visualize_statistics(lhs_statistics=train_stats)

In [9]:
# Compute TFDV statistics from the test frame.
test_stats = generate_statistics_from_dataframe(test)

# Visualize the statistics of the test split on its own.
visualize_statistics(lhs_statistics=test_stats)

`Comparing Train & Test Stats`

In [10]:
# Keyword arguments for a side-by-side comparison:
# training statistics as the left-hand side, test statistics as the right-hand side.
statlist = {
    'lhs_statistics': train_stats,
    'rhs_statistics': test_stats,
    'lhs_name': 'Train Stats',
    'rhs_name': 'Test Stats',
}

visualize_statistics(**statlist)

`Create Train Schema (Main Schema)`

In [11]:
from tensorflow_data_validation import infer_schema, display_schema

# Infer a schema from the training statistics. This schema is the reference
# that the following cells inspect, modify and validate against.
schema = infer_schema(train_stats)

# Render the inferred schema in the notebook.
display_schema(schema)

#### `Spending_Score & Graduated`

`Spending Score`

`min_fraction Defines the Minimum Fraction of Examples in Which a Column Must Be Present (Non-Null)`

`min_fraction = 1.0 : No Nulls Allowed (the Column Must Be Present in Every Example)`

`min_fraction = 0.5 : Up to 50 % Nulls Are Acceptable`

In [12]:
from tensorflow_data_validation import get_feature

# Look up the schema entry for the 'Spending_Score' column.
SpendingScoreFeature = get_feature(schema, 'Spending_Score')

# Minimum presence fraction recorded in the schema for this column.
SpendingScoreFeature.presence.min_fraction

1.0

`min_count Defines the Minimum Number of Examples in Which a Column Must Be Present (Non-Null)`

`min_count = 1 : The Column Must Have a Value in at Least One Example`

In [13]:
# Minimum presence count recorded in the schema for 'Spending_Score'.
SpendingScoreFeature.presence.min_count

1

`Graduated`

In [14]:
# Look up the schema entry for the 'Graduated' column.
GraduateFeature = get_feature(schema, 'Graduated')

# Minimum presence fraction recorded in the schema for this column.
GraduateFeature.presence.min_fraction

0.0

In [15]:
# Minimum presence count recorded in the schema for 'Graduated'.
GraduateFeature.presence.min_count

1

`Further: Actual Non-Null Ratio of Graduated in Train & Test`

In [16]:
# Share of training rows whose 'Graduated' value is present (non-null).
1.0 - train['Graduated'].isna().sum() / len(train)

# 1.0 - 36 / 4000

0.991

In [17]:
# Share of test rows whose 'Graduated' value is present (non-null).
1.0 - test['Graduated'].isna().sum() / len(test)

# 1.0 - 42 / 4068

0.9896755162241888

In [18]:
from tensorflow_data_validation import validate_statistics

from tensorflow_data_validation import display_anomalies

# Tighten the schema: require 'Graduated' to be present in every example.
GraduateFeature.presence.min_fraction = 1.0

# Validate the training statistics against the modified schema.
anomalies = validate_statistics(train_stats, schema)

display_anomalies(anomalies)

# Result: 'Graduated' is reported as present in fewer examples than expected.

# Required min_fraction = 1.0

# Actual fraction = 0.991 (the non-null ratio of train['Graduated'] computed earlier)

In [19]:
# Restore the originally inferred value (0.0) so 'Graduated' may be missing in some examples.
GraduateFeature.presence.min_fraction = 0.0

# Validate the training statistics against the restored schema.
anomalies = validate_statistics(train_stats, schema)

display_anomalies(anomalies)

# Result: no anomalies are reported.

In [20]:
from tensorflow_data_validation import get_domain

# Fetch the domain attached to the 'Profession' column in the schema.
ProfessionDomain = get_domain(schema, 'Profession')

# Values the domain currently accepts for this column.
AcceptedProfessionValue = ProfessionDomain.value

# First five accepted values.
AcceptedProfessionValue[:5]

['Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive']

In [21]:
# Remaining accepted values (from the sixth onward).
AcceptedProfessionValue[5:]

['Healthcare', 'Homemaker', 'Lawyer', 'Marketing']

In [22]:
# Edit the accepted values of the 'Profession' domain: start accepting
# 'Self-Employed' and stop accepting 'Homemaker'.
# The membership checks keep the cell safe to re-run: without them a second run
# would insert a duplicate 'Self-Employed' and fail when trying to remove the
# already-removed 'Homemaker'.
if 'Self-Employed' not in ProfessionDomain.value:
    ProfessionDomain.value.insert(0, 'Self-Employed')

if 'Homemaker' in ProfessionDomain.value:
    ProfessionDomain.value.remove('Homemaker')

AcceptedProfessionValue = ProfessionDomain.value

# First five accepted values after the edit.
AcceptedProfessionValue[:5]

['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment']

In [23]:
# Remaining accepted values (from the sixth onward) after the domain edit.
AcceptedProfessionValue[5:]

['Executive', 'Healthcare', 'Lawyer', 'Marketing']

#### `Column Type`

In [24]:
# Bytes (1)
# Numeric type code stored in the schema for 'Graduated'; 1 denotes the bytes type.

GraduateFeature.type

1

In [25]:
# Float (3)
# Numeric type code stored in the schema for 'Family_Size'; 3 denotes the float type.

FamilySize = get_feature(schema, 'Family_Size')

FamilySize.type

3

In [26]:
# Integer (2)
# Override the schema type of 'Family_Size' from Float (3) to Integer (2).
# NOTE(review): 2 is a raw type code; presumably it maps to the INT member of
# tensorflow_metadata's FeatureType enum. A named constant would be clearer. TODO confirm.

FamilySize = get_feature(schema, 'Family_Size')

FamilySize.type = 2