Feat: Onboard IMDb dataset (#406)

GoogleCloudPlatform · Jul 8, 2022 · 2559838 · 2559838
1 parent 51860eb
commit 2559838
Show file tree

Hide file tree

Showing 9 changed files with 1,674 additions and 90 deletions.
diff --git a/datasets/imdb/infra/imdb_dataset.tf b/datasets/imdb/infra/imdb_dataset.tf
@@ -18,7 +18,7 @@
 resource "google_bigquery_dataset" "imdb" {
   dataset_id  = "imdb"
   project     = var.project_id
-  description = "aclImdb_v1 dataset"
+  description = "It consists of reviews dataset along with all IMDb interfaces(7 - datasets)."
 }
 
 output "bigquery_dataset-imdb-dataset_id" {

diff --git a/datasets/imdb/infra/interfaces_pipeline.tf b/datasets/imdb/infra/interfaces_pipeline.tf
@@ -0,0 +1,142 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+# BigQuery table `name_basics` inside the `imdb` dataset of var.project_id.
+resource "google_bigquery_table" "imdb_name_basics" {
+  project    = var.project_id
+  dataset_id = "imdb"
+  table_id   = "name_basics"
+
+  description = "It consists of details about the unique identifier of the name/person."
+
+  # dataset_id above is a string literal rather than a reference, so the
+  # dependency on google_bigquery_dataset.imdb is declared explicitly here.
+  depends_on = [google_bigquery_dataset.imdb]
+}
+
+# Exports the table_id attribute of the name_basics table.
+output "bigquery_table-imdb_name_basics-table_id" {
+  value = google_bigquery_table.imdb_name_basics.table_id
+}
+
+# Exports the id attribute of the name_basics table.
+output "bigquery_table-imdb_name_basics-id" {
+  value = google_bigquery_table.imdb_name_basics.id
+}
+
+# BigQuery table `title_akas` inside the `imdb` dataset of var.project_id.
+resource "google_bigquery_table" "imdb_title_akas" {
+  project    = var.project_id
+  dataset_id = "imdb"
+  table_id   = "title_akas"
+
+  description = "It consists of details about the unique identifier of the title_id."
+
+  # dataset_id above is a string literal rather than a reference, so the
+  # dependency on google_bigquery_dataset.imdb is declared explicitly here.
+  depends_on = [google_bigquery_dataset.imdb]
+}
+
+# Exports the table_id attribute of the title_akas table.
+output "bigquery_table-imdb_title_akas-table_id" {
+  value = google_bigquery_table.imdb_title_akas.table_id
+}
+
+# Exports the id attribute of the title_akas table.
+output "bigquery_table-imdb_title_akas-id" {
+  value = google_bigquery_table.imdb_title_akas.id
+}
+
+# BigQuery table `title_basics` inside the `imdb` dataset of var.project_id.
+resource "google_bigquery_table" "imdb_title_basics" {
+  project    = var.project_id
+  dataset_id = "imdb"
+  table_id   = "title_basics"
+
+  description = "It consists of additional details about the unique identifier of the title_id."
+
+  # dataset_id above is a string literal rather than a reference, so the
+  # dependency on google_bigquery_dataset.imdb is declared explicitly here.
+  depends_on = [google_bigquery_dataset.imdb]
+}
+
+# Exports the table_id attribute of the title_basics table.
+output "bigquery_table-imdb_title_basics-table_id" {
+  value = google_bigquery_table.imdb_title_basics.table_id
+}
+
+# Exports the id attribute of the title_basics table.
+output "bigquery_table-imdb_title_basics-id" {
+  value = google_bigquery_table.imdb_title_basics.id
+}
+
+# BigQuery table `title_crew` inside the `imdb` dataset of var.project_id.
+resource "google_bigquery_table" "imdb_title_crew" {
+  project    = var.project_id
+  dataset_id = "imdb"
+  table_id   = "title_crew"
+
+  description = "Contains the director and writer information for all the titles in IMDb."
+
+  # dataset_id above is a string literal rather than a reference, so the
+  # dependency on google_bigquery_dataset.imdb is declared explicitly here.
+  depends_on = [google_bigquery_dataset.imdb]
+}
+
+# Exports the table_id attribute of the title_crew table.
+output "bigquery_table-imdb_title_crew-table_id" {
+  value = google_bigquery_table.imdb_title_crew.table_id
+}
+
+# Exports the id attribute of the title_crew table.
+output "bigquery_table-imdb_title_crew-id" {
+  value = google_bigquery_table.imdb_title_crew.id
+}
+
+# BigQuery table `title_episode` inside the `imdb` dataset of var.project_id.
+resource "google_bigquery_table" "imdb_title_episode" {
+  project    = var.project_id
+  dataset_id = "imdb"
+  table_id   = "title_episode"
+
+  description = "Contains the tv episode information."
+
+  # dataset_id above is a string literal rather than a reference, so the
+  # dependency on google_bigquery_dataset.imdb is declared explicitly here.
+  depends_on = [google_bigquery_dataset.imdb]
+}
+
+# Exports the table_id attribute of the title_episode table.
+output "bigquery_table-imdb_title_episode-table_id" {
+  value = google_bigquery_table.imdb_title_episode.table_id
+}
+
+# Exports the id attribute of the title_episode table.
+output "bigquery_table-imdb_title_episode-id" {
+  value = google_bigquery_table.imdb_title_episode.id
+}
+
+# BigQuery table `title_principals` inside the `imdb` dataset of var.project_id.
+resource "google_bigquery_table" "imdb_title_principals" {
+  project    = var.project_id
+  dataset_id = "imdb"
+  table_id   = "title_principals"
+
+  description = "Contains the principal cast/crew for titles."
+
+  # dataset_id above is a string literal rather than a reference, so the
+  # dependency on google_bigquery_dataset.imdb is declared explicitly here.
+  depends_on = [google_bigquery_dataset.imdb]
+}
+
+# Exports the table_id attribute of the title_principals table.
+output "bigquery_table-imdb_title_principals-table_id" {
+  value = google_bigquery_table.imdb_title_principals.table_id
+}
+
+# Exports the id attribute of the title_principals table.
+output "bigquery_table-imdb_title_principals-id" {
+  value = google_bigquery_table.imdb_title_principals.id
+}
+
+# BigQuery table `title_ratings` inside the `imdb` dataset of var.project_id.
+resource "google_bigquery_table" "imdb_title_ratings" {
+  project    = var.project_id
+  dataset_id = "imdb"
+  table_id   = "title_ratings"
+
+  description = "Contains the IMDb rating and votes information for titles."
+
+  # dataset_id above is a string literal rather than a reference, so the
+  # dependency on google_bigquery_dataset.imdb is declared explicitly here.
+  depends_on = [google_bigquery_dataset.imdb]
+}
+
+# Exports the table_id attribute of the title_ratings table.
+output "bigquery_table-imdb_title_ratings-table_id" {
+  value = google_bigquery_table.imdb_title_ratings.table_id
+}
+
+# Exports the id attribute of the title_ratings table.
+output "bigquery_table-imdb_title_ratings-id" {
+  value = google_bigquery_table.imdb_title_ratings.id
+}
diff --git a/datasets/imdb/infra/reviews_pipeline.tf b/datasets/imdb/infra/reviews_pipeline.tf
@@ -19,7 +19,7 @@ resource "google_bigquery_table" "imdb_reviews" {
   project     = var.project_id
   dataset_id  = "imdb"
   table_id    = "reviews"
-  description = "Reviews table"
+  description = "Large Movie Review Dataset v1.0\n\nOverview\n\nThis dataset contains movie reviews along with their associated binary\nsentiment polarity labels. It is intended to serve as a benchmark for\nsentiment classification. This document outlines how the dataset was\ngathered, and how to use the files provided.\n\nDataset\n\nThe core dataset contains 50,000 reviews split evenly into 25k train\nand 25k test sets. The overall distribution of labels is balanced (25k\npos and 25k neg). We also include an additional 50,000 unlabeled\ndocuments for unsupervised learning.\n\nIn the entire collection, no more than 30 reviews are allowed for any\ngiven movie because reviews for the same movie tend to have correlated\nratings. Further, the train and test sets contain a disjoint set of\nmovies, so no significant performance is obtained by memorizing\nmovie-unique terms and their associated with observed labels.  In the\nlabeled train/test sets, a negative review has a score \u003c= 4 out of 10,\nand a positive review has a score \u003e= 7 out of 10. Thus reviews with\nmore neutral ratings are not included in the train/test sets. In the\nunsupervised set, reviews of any rating are included and there are an\neven number of reviews \u003e 5 and \u003c= 5.\n\nColumns\nsplit - it has test(25K) / train(75K) records.\nlabel - Negative(25K)     --\u003e test(12.5K) and train (12.5K)\n        Positive(25K)     --\u003e test(12.5K) and train (12.5K)\n        Unsupervised(50K) --\u003e train(50K)\n\nFor Unsupervised label, reviewer_rating is NaN.\n"
   depends_on = [
     google_bigquery_dataset.imdb
   ]