Skip to content

Commit

Permalink
Feat: Onboard IMDb dataset (#406)
Browse files Browse the repository at this point in the history
  • Loading branch information
vijay-google committed Jul 8, 2022
1 parent 51860eb commit 2559838
Show file tree
Hide file tree
Showing 9 changed files with 1,674 additions and 90 deletions.
2 changes: 1 addition & 1 deletion datasets/imdb/infra/imdb_dataset.tf
Expand Up @@ -18,7 +18,7 @@
# BigQuery dataset with id "imdb" in the project given by var.project_id.
# Only one `description` argument is allowed per block; the superseded
# "aclImdb_v1 dataset" value is dropped in favour of the current text.
resource "google_bigquery_dataset" "imdb" {
  dataset_id  = "imdb"
  project     = var.project_id
  description = "It consists of reviews dataset along with all IMDb interfaces(7 - datasets)."
}

output "bigquery_dataset-imdb-dataset_id" {
Expand Down
142 changes: 142 additions & 0 deletions datasets/imdb/infra/interfaces_pipeline.tf
@@ -0,0 +1,142 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery table "name_basics" in the imdb dataset.
# dataset_id references the dataset resource rather than repeating the literal
# "imdb", so Terraform infers the dependency and no depends_on is required.
resource "google_bigquery_table" "imdb_name_basics" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.imdb.dataset_id
  table_id    = "name_basics"
  description = "It consists details about unique identifier of the name/person."
}

# Exports the table_id of the name_basics table.
output "bigquery_table-imdb_name_basics-table_id" {
  description = "table_id attribute of google_bigquery_table.imdb_name_basics."
  value       = google_bigquery_table.imdb_name_basics.table_id
}

# Exports the provider-assigned id of the name_basics table resource.
output "bigquery_table-imdb_name_basics-id" {
  description = "id attribute of google_bigquery_table.imdb_name_basics."
  value       = google_bigquery_table.imdb_name_basics.id
}

# BigQuery table "title_akas" in the imdb dataset.
# dataset_id references the dataset resource rather than repeating the literal
# "imdb", so Terraform infers the dependency and no depends_on is required.
resource "google_bigquery_table" "imdb_title_akas" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.imdb.dataset_id
  table_id    = "title_akas"
  description = "It consists details about unique identifier of the title_id."
}

# Exports the table_id of the title_akas table.
output "bigquery_table-imdb_title_akas-table_id" {
  description = "table_id attribute of google_bigquery_table.imdb_title_akas."
  value       = google_bigquery_table.imdb_title_akas.table_id
}

# Exports the provider-assigned id of the title_akas table resource.
output "bigquery_table-imdb_title_akas-id" {
  description = "id attribute of google_bigquery_table.imdb_title_akas."
  value       = google_bigquery_table.imdb_title_akas.id
}

# BigQuery table "title_basics" in the imdb dataset.
# dataset_id references the dataset resource rather than repeating the literal
# "imdb", so Terraform infers the dependency and no depends_on is required.
resource "google_bigquery_table" "imdb_title_basics" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.imdb.dataset_id
  table_id    = "title_basics"
  description = "It consists additional details about unique identifier of the title_id."
}

# Exports the table_id of the title_basics table.
output "bigquery_table-imdb_title_basics-table_id" {
  description = "table_id attribute of google_bigquery_table.imdb_title_basics."
  value       = google_bigquery_table.imdb_title_basics.table_id
}

# Exports the provider-assigned id of the title_basics table resource.
output "bigquery_table-imdb_title_basics-id" {
  description = "id attribute of google_bigquery_table.imdb_title_basics."
  value       = google_bigquery_table.imdb_title_basics.id
}

# BigQuery table "title_crew" in the imdb dataset.
# dataset_id references the dataset resource rather than repeating the literal
# "imdb", so Terraform infers the dependency and no depends_on is required.
resource "google_bigquery_table" "imdb_title_crew" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.imdb.dataset_id
  table_id    = "title_crew"
  description = "Contains the director and writer information for all the titles in IMDb."
}

# Exports the table_id of the title_crew table.
output "bigquery_table-imdb_title_crew-table_id" {
  description = "table_id attribute of google_bigquery_table.imdb_title_crew."
  value       = google_bigquery_table.imdb_title_crew.table_id
}

# Exports the provider-assigned id of the title_crew table resource.
output "bigquery_table-imdb_title_crew-id" {
  description = "id attribute of google_bigquery_table.imdb_title_crew."
  value       = google_bigquery_table.imdb_title_crew.id
}

# BigQuery table "title_episode" in the imdb dataset.
# dataset_id references the dataset resource rather than repeating the literal
# "imdb", so Terraform infers the dependency and no depends_on is required.
resource "google_bigquery_table" "imdb_title_episode" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.imdb.dataset_id
  table_id    = "title_episode"
  description = "Contains the tv episode information."
}

# Exports the table_id of the title_episode table.
output "bigquery_table-imdb_title_episode-table_id" {
  description = "table_id attribute of google_bigquery_table.imdb_title_episode."
  value       = google_bigquery_table.imdb_title_episode.table_id
}

# Exports the provider-assigned id of the title_episode table resource.
output "bigquery_table-imdb_title_episode-id" {
  description = "id attribute of google_bigquery_table.imdb_title_episode."
  value       = google_bigquery_table.imdb_title_episode.id
}

# BigQuery table "title_principals" in the imdb dataset.
# dataset_id references the dataset resource rather than repeating the literal
# "imdb", so Terraform infers the dependency and no depends_on is required.
resource "google_bigquery_table" "imdb_title_principals" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.imdb.dataset_id
  table_id    = "title_principals"
  description = "Contains the principal cast/crew for titles."
}

# Exports the table_id of the title_principals table.
output "bigquery_table-imdb_title_principals-table_id" {
  description = "table_id attribute of google_bigquery_table.imdb_title_principals."
  value       = google_bigquery_table.imdb_title_principals.table_id
}

# Exports the provider-assigned id of the title_principals table resource.
output "bigquery_table-imdb_title_principals-id" {
  description = "id attribute of google_bigquery_table.imdb_title_principals."
  value       = google_bigquery_table.imdb_title_principals.id
}

# BigQuery table "title_ratings" in the imdb dataset.
# dataset_id references the dataset resource rather than repeating the literal
# "imdb", so Terraform infers the dependency and no depends_on is required.
resource "google_bigquery_table" "imdb_title_ratings" {
  project     = var.project_id
  dataset_id  = google_bigquery_dataset.imdb.dataset_id
  table_id    = "title_ratings"
  description = "Contains the IMDb rating and votes information for titles."
}

# Exports the table_id of the title_ratings table.
output "bigquery_table-imdb_title_ratings-table_id" {
  description = "table_id attribute of google_bigquery_table.imdb_title_ratings."
  value       = google_bigquery_table.imdb_title_ratings.table_id
}

# Exports the provider-assigned id of the title_ratings table resource.
output "bigquery_table-imdb_title_ratings-id" {
  description = "id attribute of google_bigquery_table.imdb_title_ratings."
  value       = google_bigquery_table.imdb_title_ratings.id
}
2 changes: 1 addition & 1 deletion datasets/imdb/infra/reviews_pipeline.tf
Expand Up @@ -19,7 +19,7 @@ resource "google_bigquery_table" "imdb_reviews" {
project = var.project_id
dataset_id = "imdb"
table_id = "reviews"
description = "Reviews table"
description = "Large Movie Review Dataset v1.0\n\nOverview\n\nThis dataset contains movie reviews along with their associated binary\nsentiment polarity labels. It is intended to serve as a benchmark for\nsentiment classification. This document outlines how the dataset was\ngathered, and how to use the files provided.\n\nDataset\n\nThe core dataset contains 50,000 reviews split evenly into 25k train\nand 25k test sets. The overall distribution of labels is balanced (25k\npos and 25k neg). We also include an additional 50,000 unlabeled\ndocuments for unsupervised learning.\n\nIn the entire collection, no more than 30 reviews are allowed for any\ngiven movie because reviews for the same movie tend to have correlated\nratings. Further, the train and test sets contain a disjoint set of\nmovies, so no significant performance is obtained by memorizing\nmovie-unique terms and their associated with observed labels. In the\nlabeled train/test sets, a negative review has a score \u003c= 4 out of 10,\nand a positive review has a score \u003e= 7 out of 10. Thus reviews with\nmore neutral ratings are not included in the train/test sets. In the\nunsupervised set, reviews of any rating are included and there are an\neven number of reviews \u003e 5 and \u003c= 5.\n\nColumns\nsplit - it has test(25K) / train(75K) records.\nlabel - Negative(25K) --\u003e test(12.5K) and train (12.5K)\n Positive(25K) --\u003e test(12.5K) and train (12.5K)\n Unsupervised(50K) --\u003e train(50K)\n\nFor Unsupervised label, reviewer_rating is NaN.\n"
depends_on = [
google_bigquery_dataset.imdb
]
Expand Down

0 comments on commit 2559838

Please sign in to comment.