diff --git a/CHANGELOG b/CHANGELOG index 8a87d011587c..47dcdddaba3a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,23 @@ +======== +4.4.0 +======== +---------------- +New Features +---------------- +* Implement a new Zero-Shot Text Classification annotator for BERT called `BertForZeroShotClassification` +* Implement a new ConvNextForImageClassification annotator +* Introduce the BART Transformer for text-to-text generation tasks like translation and summarization +* Set a custom entity name in Data2Chunk via the `setEntityName` param +* Add a new `nerHasNoSchema` param for NerConverter when labels coming from NerDLModel and NerCrfModel don't have any schema +---------------- +Bug Fixes & Enhancements +---------------- +* Fix a `WordEmbeddingsModel` bug when loading a model from S3 via the `cache_folder` config +* Fix a `WordEmbeddingsModel` bug that caused failures when it is used with `setEnableInMemoryStorage` set to `True` in a LightPipeline +* Remove the deprecated `enablePatternRegex` parameter from EntityRulerApproach & EntityRulerModel +* Deprecate Python 3.6 + + ======== 4.3.2 ======== diff --git a/examples/python/annotation/text/english/zero-shot text classification/Zero_Shot_Text_Classification_by_BERT.ipynb b/examples/python/annotation/text/english/zero-shot text classification/Zero_Shot_Text_Classification_by_BERT.ipynb new file mode 100644 index 000000000000..a4a5feba2151 --- /dev/null +++ b/examples/python/annotation/text/english/zero-shot text classification/Zero_Shot_Text_Classification_by_BERT.ipynb @@ -0,0 +1,445 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Zero-Shot Learning in Modern NLP\n", + "### State-of-the-art NLP models for text classification without annotated data\n", + "\n", + ">Natural language processing is a very exciting field right now. In recent years, the community has begun to figure out some pretty effective methods of learning from the enormous amounts of unlabeled data available on the internet. The success of transfer learning from unsupervised models has allowed us to surpass virtually all existing benchmarks on downstream supervised learning tasks. 
As we continue to develop new model architectures and unsupervised learning objectives, \"state of the art\" continues to be a rapidly moving target for many tasks where large amounts of labeled data are available.\n", + "\n" + ], + "metadata": { + "id": "hQEd60qjN_0t" + } + }, + { + "cell_type": "code", + "source": [ + "# This is only to setup PySpark and Spark NLP on Colab\n", + "!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash" + ], + "metadata": { + "id": "z5EXpkAOFRu9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import sparknlp\n", + "spark = sparknlp.start()" + ], + "metadata": { + "id": "5fyaA76PFZA6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline, PipelineModel\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer().setInputCols(\"document\").setOutputCol(\"token\")\n", + "\n", + "zero_shot_classifier = BertForZeroShotClassification \\\n", + " .pretrained() \\\n", + " .setInputCols([\"document\", \"token\"]) \\\n", + " .setOutputCol(\"class\") \\\n", + " .setCandidateLabels([\"urgent\", \"mobile\", \"travel\", \"movie\", \"music\", \"sport\", \"weather\", \"technology\"])\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " zero_shot_classifier\n", + "])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_N2AoLRkFYdZ", + "outputId": "f1a1e08f-d20d-4efa-e8d4-f38914ab9c69" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "bert_base_cased_zero_shot_classifier_xnli download started this may take some time.\n", + "Approximate size to download 387.7 MB\n", + "[OK!]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "text = [[\"I have a problem with my iphone that needs to be resolved asap!!\"],\n", + " [\"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\"],\n", + " [\"I have a phone and I love it!\"],\n", + " [\"I really want to visit Germany and I am planning to go there next year.\"],\n", + " [\"Let's watch some movies tonight! I am in the mood for a horror movie.\"],\n", + " [\"Have you watched the match yesterday? It was a great game!\"],\n", + " [\"We need to harry up and get to the airport. 
We are going to miss our flight!\"]]\n", + "\n", + "# create a DataFrame in PySpark\n", + "inputDataset = spark.createDataFrame(text, [\"text\"])\n", + "model = pipeline.fit(inputDataset)\n", + "predictionDF = model.transform(inputDataset)" + ], + "metadata": { + "id": "5LKrJRuFNQNs" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "predictionDF.select(\"document.result\", \"class.result\").show(10, False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BCIiOgJtN83v", + "outputId": "319bc31a-384f-4793-80b3-8e8aa08e159e" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+----------------------------------------------------------------------------------------------------------------+--------+\n", + "|result |result |\n", + "+----------------------------------------------------------------------------------------------------------------+--------+\n", + "|[I have a problem with my iphone that needs to be resolved asap!!] |[mobile]|\n", + "|[Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.]|[mobile]|\n", + "|[I have a phone and I love it!] |[mobile]|\n", + "|[I really want to visit Germany and I am planning to go there next year.] |[travel]|\n", + "|[Let's watch some movies tonight! I am in the mood for a horror movie.] |[movie] |\n", + "|[Have you watched the match yesterday? It was a great game!] |[sport] |\n", + "|[We need to harry up and get to the airport. We are going to miss our flight!] |[urgent]|\n", + "+----------------------------------------------------------------------------------------------------------------+--------+\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "sample_text = \"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\"\n", + "\n", + "light_pipeline = LightPipeline(model)\n", + "\n", + "results = light_pipeline.annotate(sample_text)\n", + "results\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MaRW4KPGNkjc", + "outputId": "49c0e99b-da4f-42e2-ba98-960bcdc22cdd" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'document': ['Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.'],\n", + " 'token': ['Last',\n", + " 'week',\n", + " 'I',\n", + " 'upgraded',\n", + " 'my',\n", + " 'iOS',\n", + " 'version',\n", + " 'and',\n", + " 'ever',\n", + " 'since',\n", + " 'then',\n", + " 'my',\n", + " 'phone',\n", + " 'has',\n", + " 'been',\n", + " 'overheating',\n", + " 'whenever',\n", + " 'I',\n", + " 'use',\n", + " 'your',\n", + " 'app',\n", + " '.'],\n", + " 'class': ['mobile']}" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "for tx in text:\n", + " res = light_pipeline.annotate(tx[0])\n", + " print(f\"document: {res['document']} prediction: {res['class']}\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Vbg44saZRtnw", + "outputId": "6d16b175-6014-45a3-b7cb-311b840deb72" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "document: ['I have a problem with my iphone that needs to be resolved asap!!'] prediction: ['mobile']\n", + "document: ['Last week I upgraded my iOS 
version and ever since then my phone has been overheating whenever I use your app.'] prediction: ['mobile']\n", + "document: ['I have a phone and I love it!'] prediction: ['mobile']\n", + "document: ['I really want to visit Germany and I am planning to go there next year.'] prediction: ['travel']\n", + "document: [\"Let's watch some movies tonight! I am in the mood for a horror movie.\"] prediction: ['movie']\n", + "document: ['Have you watched the match yesterday? It was a great game!'] prediction: ['sport']\n", + "document: ['We need to harry up and get to the airport. We are going to miss our flight!'] prediction: ['urgent']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Ss4pE3i4S_SU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Multi-Label vs. Multi-Class\n", + "\n", + "We can use the `activation` parameter to set whether the result should be multi-class (the sum of all probabilities is `1.0`) or multi-label (each label has a probability between `0.0` and `1.0`):\n", + "\n", + "- multi-class: `softmax` (default)\n", + "- multi-label: `sigmoid`" + ], + "metadata": { + "id": "RfQrLMxATYyY" + } + }, + { + "cell_type": "code", + "source": [ + "zero_shot_classifier\\\n", + " .setCandidateLabels([\"urgent\", \"mobile\", \"travel\", \"movie\", \"music\", \"sport\", \"weather\", \"technology\"])\\\n", + " .setActivation(\"sigmoid\")\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " zero_shot_classifier\n", + "])" + ], + "metadata": { + "id": "zHB6w1-dTaQC" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "text = [[\"I have a problem with my iphone that needs to be resolved asap!!\"],\n", + " [\"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\"],\n", + " [\"I have a phone and I love it!\"],\n", + " [\"I really want to visit Germany and I am planning to go there next year.\"],\n", + " [\"Let's watch some movies tonight! I am in the mood for a horror movie.\"],\n", + " [\"Have you watched the match yesterday? It was a great game!\"],\n", + " [\"We need to harry up and get to the airport. We are going to miss our flight!\"]]\n", + "\n", + "# create a DataFrame in PySpark\n", + "inputDataset = spark.createDataFrame(text, [\"text\"])\n", + "model = pipeline.fit(inputDataset)\n", + "predictionDF = model.transform(inputDataset)" + ], + "metadata": { + "id": "F0QgxciuUG4V" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "predictionDF.select(\"document.result\", \"class.result\").show(10, False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VkvAGrbAUJQn", + "outputId": "902a43eb-8c3e-4d63-9273-455460a7e6a4" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+----------------------------------------------------------------------------------------------------------------+-----------------------------------+\n", + "|result                                                                                                          |result                             |\n", + "+----------------------------------------------------------------------------------------------------------------+-----------------------------------+\n", + "|[I have a problem with my iphone that needs to be resolved asap!!]                                              
|[urgent, mobile, movie, technology]|\n", + "|[Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.]|[urgent, technology] |\n", + "|[I have a phone and I love it!] |[mobile] |\n", + "|[I really want to visit Germany and I am planning to go there next year.] |[travel] |\n", + "|[Let's watch some movies tonight! I am in the mood for a horror movie.] |[movie] |\n", + "|[Have you watched the match yesterday? It was a great game!] |[sport] |\n", + "|[We need to harry up and get to the airport. We are going to miss our flight!] |[urgent, travel] |\n", + "+----------------------------------------------------------------------------------------------------------------+-----------------------------------+\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# check the scores\n", + "predictionDF.select(\"class.metadata\").show(10, False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DAcVGaBVUJ4a", + "outputId": "7ffe547b-df85-478f-b214-5e8dde8d5527" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|metadata |\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{urgent -> 0.94203824, music -> 0.273361, movie -> 0.8155173, technology -> 0.9432713, travel -> 0.19155306, sport -> 7.5020565E-4, weather -> 0.012860995, mobile -> 0.81718224, sentence -> 0}, {urgent -> 0.94203824, music -> 0.273361, movie -> 0.8155173, technology -> 0.9432713, travel -> 0.19155306, sport -> 7.5020565E-4, weather -> 0.012860995, mobile -> 0.81718224, sentence -> 0}, {urgent -> 0.94203824, music -> 0.273361, movie -> 0.8155173, technology -> 0.9432713, travel -> 0.19155306, sport -> 7.5020565E-4, weather -> 0.012860995, mobile -> 0.81718224, sentence -> 0}, {urgent -> 0.94203824, music -> 0.273361, movie -> 0.8155173, technology -> 0.9432713, travel -> 0.19155306, sport -> 7.5020565E-4, weather -> 0.012860995, mobile -> 0.81718224, sentence -> 
0}]|\n", + "|[{urgent -> 0.5989145, music -> 0.009265149, movie -> 0.004853843, technology -> 0.83162993, travel -> 0.2232338, sport -> 0.0011306163, weather -> 0.0057875705, mobile -> 0.39459854, sentence -> 0}, {urgent -> 0.5989145, music -> 0.009265149, movie -> 0.004853843, technology -> 0.83162993, travel -> 0.2232338, sport -> 0.0011306163, weather -> 0.0057875705, mobile -> 0.39459854, sentence -> 0}] |\n", + "|[{urgent -> 0.0024619207, music -> 0.030493928, movie -> 0.10384921, technology -> 0.01819679, travel -> 0.025316363, sport -> 5.448147E-4, weather -> 5.9901236E-4, mobile -> 0.822738, sentence -> 0}] |\n", + "|[{urgent -> 0.0074373055, music -> 0.06391685, movie -> 0.009793872, technology -> 0.08052815, travel -> 0.95946944, sport -> 0.0033297893, weather -> 0.11855726, mobile -> 0.011235076, sentence -> 0}] |\n", + "|[{urgent -> 0.0032734834, music -> 0.00897034, movie -> 0.7762647, technology -> 2.2468095E-4, travel -> 0.0016853365, sport -> 5.059199E-4, weather -> 7.164882E-4, mobile -> 0.0029863138, sentence -> 0}] |\n", + "|[{urgent -> 6.5108406E-4, music -> 0.0819624, movie -> 0.07504165, technology -> 0.0013726762, travel -> 0.0011811191, sport -> 0.9228904, weather -> 0.003503337, mobile -> 0.0048036124, sentence -> 0}] |\n", + "|[{urgent -> 0.9887839, music -> 0.13705872, movie -> 0.3061645, technology -> 0.008889678, travel -> 0.8829653, sport -> 0.0101994965, weather -> 0.105074614, mobile -> 0.011163117, sentence -> 0}, {urgent -> 0.9887839, music -> 0.13705872, movie -> 0.3061645, technology -> 0.008889678, travel -> 0.8829653, sport -> 0.0101994965, weather -> 0.105074614, mobile -> 0.011163117, sentence -> 0}] |\n", + "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Zero-Shot Learning (ZSL)\n", + "> Traditionally, zero-shot learning (ZSL) most often referred to a fairly specific type of task: learn a classifier on one set of labels and then evaluate on a different set of labels that the classifier has never seen before. Recently, especially in NLP, it's been used much more broadly to mean get a model to do something that it wasn't explicitly trained to do. 
A well-known example of this is in the [GPT-2 paper](https://pdfs.semanticscholar.org/9405/cc0d6169988371b2755e573cc28650d14dfe.pdf) where the authors evaluate a language model on downstream tasks like machine translation without fine-tuning on these tasks directly.\n", + "\n", + "Let's see how easy it is to use any set of labels our trained model has never seen via the `setCandidateLabels()` param:" + ], + "metadata": { + "id": "ICDQOrwPOeu4" + } + }, + { + "cell_type": "code", + "source": [ + "zero_shot_classifier\\\n", + " .setCandidateLabels([\"space & cosmos\", \"scientific discovery\", \"microbiology\", \"robots\", \"archeology\", \"politics\"])\\\n", + " .setActivation(\"sigmoid\") # multi-label\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " zero_shot_classifier\n", + "])\n", + "\n", + "input_text3 = [\n", + " [\"Learn about the presidential election process, including the Electoral College, caucuses and primaries, and the national conventions.\"],\n", + " [\"\"\"In a new book, Sean Carroll brings together physics and philosophy while advocating for \"poetic naturalism.\" Ramin Skibba, Contributor. Space ...\"\"\"],\n", + " [\"Who are you voting for in 2024?\"]]\n", + "\n", + "# create a DataFrame in PySpark\n", + "inputDataset = spark.createDataFrame(input_text3, [\"text\"])\n", + "model = pipeline.fit(inputDataset)\n", + "predictionDF = model.transform(inputDataset)\n", + "\n", + "predictionDF.select(\"document.result\", \"class.result\").show(3, False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JnVfw4T9PWyb", + "outputId": "ee01e459-2a07-4adb-8cf0-625465d4d4c7" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+---------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+\n", + "|result                                                                                                                                             |result                              |\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+\n", + "|[Learn about the presidential election process, including the Electoral College, caucuses and primaries, and the national conventions.]            |[politics]                            |\n", + "|[In a new book, Sean Carroll brings together physics and philosophy while advocating for \"poetic naturalism.\" Ramin Skibba, Contributor. Space ...]|[space & cosmos, scientific discovery]|\n", + "|[Who are you voting for in 2024?]                                                                                                                   |[politics]                            |\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "pJK49gj_RqOQ" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file