/
auto_tokenize_messages.proto
234 lines (187 loc) · 6.88 KB
/
auto_tokenize_messages.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package com.google.cloud.solutions.autotokenize;
import "google/protobuf/timestamp.proto";
import "google/privacy/dlp/v2/dlp.proto";
import "google/cloud/datacatalog/v1/datacatalog.proto";
import "google/cloud/datacatalog/v1/tags.proto";
import "google/cloud/datacatalog/v1/schema.proto";
// Column-type information obtained by running DLP over a sample of records.
message DlpReport {
  // Count of sample records that were sent to DLP for type identification.
  int64 sample_size = 1;

  // The file's schema expressed as an AVRO schema. For a Parquet file this
  // holds the Parquet schema after conversion to AVRO.
  string file_avro_schema = 5;

  // The columns in which an InfoType was flagged.
  repeated ColumnInformation columns_information = 10;
}
// The supported source types. Besides file formats, the values below also
// name BigQuery and JDBC sources.
enum SourceType {
  // Zero value; proto3 reports it when the field has not been set.
  UNKNOWN_FILE_TYPE = 0;

  AVRO = 1;

  PARQUET = 2;

  BIGQUERY_TABLE = 3;

  BIGQUERY_QUERY = 4;

  JDBC_TABLE = 5;

  CSV_FILE = 6;

  JDBC_QUERY = 7;
}
// The type of the encryption key material.
enum KeyMaterialType {
  // Zero value; proto3 reports it when the field has not been set.
  UNKNOWN_KEY_MATERIAL_TYPE = 0;

  TINK_GCP_KEYSET_JSON = 1;

  RAW_BASE64_KEY = 2;

  RAW_UTF8_KEY = 3;

  GCP_KMS_WRAPPED_KEY = 4;

  GCP_SECRET_KEY = 5;
}
// Source configuration for JDBC databases/sources.
message JdbcConfiguration {
  // JDBC connection URL, e.g.
  // "jdbc:mysql://localhost/database?user=[user_name]&password=[password]"
  string connection_url = 1;

  // Class name of the JDBC SQL driver. The class must be available on the
  // classpath.
  string driver_class_name = 2;

  // User name used for connecting to the JDBC server.
  string user_name = 3;

  // The WHERE clause provided for inspection.
  string filter_clause = 5;

  // Password source for the connection. As a oneof, at most one member is
  // set at a time.
  oneof passwords {
    // Password for the JDBC connection, in plain text.
    string password = 6;

    // Fully qualified id of the secret stored in Cloud Secret Manager.
    // Sample format:
    // projects/[project-number]/secrets/[secret-id]/versions/[version-id]
    string password_secrets_key = 7;
  }
}
// InfoTypes associated with one column of the structured input file.
// The column is identified by `column_name`, which is in JSONPath format.
message ColumnInformation {
  // JSONPath of the column within the record (when the record is represented
  // as JSON).
  string column_name = 1;

  // The DLP InfoTypes flagged in this column, with their occurrence counts.
  repeated InfoTypeInformation info_types = 2;
}
// A detected DLP InfoType together with how often it occurred in the sample.
message InfoTypeInformation {
  // Name of the DLP InfoType.
  string info_type = 1;

  // Number of occurrences of this InfoType in the column of the sample.
  int64 count = 2;
}
// The report contents that are written out to the BigQuery table.
message InspectionReport {
  // When the report was generated.
  // The leading dot anchors the type name at the root package, so `google`
  // is not looked up relative to this file's own `com.google...` package.
  .google.protobuf.Timestamp timestamp = 1;

  // Type of the source that was inspected.
  SourceType source_type = 5;

  // Table name or file pattern that was inspected.
  string input_pattern = 6;

  // JDBC configuration, used if the source is a SourceType.JDBC_TABLE.
  // NOTE(review): SourceType also defines JDBC_QUERY; confirm whether this
  // field is populated for that source type as well.
  JdbcConfiguration jdbc_configuration = 7;

  // AVRO schema equivalent to the schema of the source.
  string avro_schema = 10;

  // Per-column inspection results. Columns without any inspection result
  // are not included.
  repeated ColumnInformation column_report = 15;
}
// Flattened representation of an AVRO GenericRecord's JSON.
// The flattening allows a record with nested/repeated fields to be represented
// as a single row with varying number of columns. Each of the repeated field's
// columns is mapped to the field's schema name using the `flat_key_schema` map.
// This simplifies operations on a given schema column even when it is repeated.
// e.g.
// AVRO Record
// {
// "name": "Jane Doe",
// "contacts": [
// {
// "contact": {
// "type": "WORK",
// "number": 2124567890
// }
// },
// {
// "contact": {
// "type": "HOME",
// "number": 5304321234
// }
// }
// ]
// }
//
// Flattened Record:
// "flatKeySchema" : {
// "$.name" : "$.RootRecord.name",
// "$.contacts[0].contact.type": "$.RootRecord.contacts.contact.type",
// "$.contacts[0].contact.number": "$.RootRecord.contacts.contact.number",
// "$.contacts[1].contact.type": "$.RootRecord.contacts.contact.type",
// "$.contacts[1].contact.number": "$.RootRecord.contacts.contact.number"
// },
// "values" :
// {
// "$.name": "Jane Doe",
// "$.contacts[0].contact.type": "WORK",
// "$.contacts[0].contact.number": 2124567890,
// "$.contacts[1].contact.type": "HOME",
// "$.contacts[1].contact.number": 5304321234
// }
message FlatRecord {
  // Unique id of the record; optional.
  string record_id = 1;

  // Maps each flattened record key to the key of the equivalent schema field.
  map<string, string> flat_key_schema = 5;

  // The flattened record; each value is represented as a DLP Value object.
  // Using the Value object simplifies converting values to bytes.
  map<string, .google.privacy.dlp.v2.Value> values = 10;
}
// A list of per-column transforms.
// NOTE(review): presumably the complete DLP encryption (de-identification)
// configuration for one source — confirm against the code that reads it.
message DlpEncryptConfig {
  // The column transforms held by this configuration.
  repeated ColumnTransform transforms = 1;
}
// The DLP transform used to de-identify the information in one column.
message ColumnTransform {
  // JSON-path of the column that is to be de-identified.
  string column_id = 1;

  // When set, the column is handled as free-form text and specific InfoTypes
  // inside that text are tokenized.
  bool free_form_column = 2;

  // InfoTypes to tokenize within free-form text,
  // e.g. PHONE_NUMBER, EMAIL_ADDRESS.
  // Refer: https://cloud.google.com/dlp/docs/infotypes-reference#global
  // Used only when free_form_column=True; keep empty to use all InfoTypes.
  repeated string info_types = 3;

  // The transform used for tokenizing the entire column value.
  .google.privacy.dlp.v2.PrimitiveTransformation transform = 5;
}
// Represents a partial batch of FlatRecords for DLP.
message PartialColumnDlpTable {
  // Name of the record-id column.
  // NOTE(review): presumably the header in `table` that carries
  // FlatRecord.record_id — confirm against the producer.
  string record_id_column_name = 1;

  // The FlatRecords contained in this batch.
  repeated FlatRecord records = 2;

  // DLP Table for this batch.
  // NOTE(review): presumably the tabular form of `records` — confirm.
  .google.privacy.dlp.v2.Table table = 5;

  // DLP de-identification configuration carried with the batch.
  // NOTE(review): presumably the config to apply to `table` — confirm.
  .google.privacy.dlp.v2.DeidentifyConfig deidentify_config = 10;
}
// Holds an AVRO schema string, a Data Catalog schema and a string-to-string
// key mapping.
// NOTE(review): how the three fields relate is not documented in this file;
// the per-field notes below are assumptions to confirm.
message CatalogSchemaWithAvroMapping {
  // The AVRO schema as a string (presumably the schema's JSON text — confirm).
  string avro_schema_string = 1;

  // The Data Catalog schema (presumably the equivalent of the AVRO schema
  // above — confirm).
  .google.cloud.datacatalog.v1.Schema catalog_schema = 5;

  // Mapping between flat schema keys.
  // NOTE(review): which side is the key and which is the value (AVRO flat
  // key vs. catalog column) is not visible here — confirm.
  map<string, string> flat_schema_key_mapping = 10;
}
// The Entry and Tags to be created in Data Catalog.
message UpdatableDataCatalogItems {
  // Type of the source.
  // NOTE(review): presumably the inspected source, as in
  // InspectionReport.source_type — confirm.
  SourceType source_type = 1;

  // Input pattern of the source.
  // NOTE(review): presumably a table name or file pattern, as in
  // InspectionReport.input_pattern — confirm.
  string input_pattern = 2;

  // The Data Catalog Entry.
  .google.cloud.datacatalog.v1.Entry entry = 5;

  // The Data Catalog Tags.
  // NOTE(review): presumably attached to `entry` — confirm.
  repeated .google.cloud.datacatalog.v1.Tag tags = 10;
}