From eecd680af60aa6a1d2b478c92bd1ad925c20bc96 Mon Sep 17 00:00:00 2001
From: Karthik Rangasai <karthikrangasai@gmail.com>
Date: Wed, 15 Dec 2021 17:33:07 +0530
Subject: [PATCH 1/2] Add Flash Question Answering tutorial - Initial Commit.

---
 .../.meta.yml                                 |  18 +++
 .../multilingual_question_answering.py        | 135 ++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 flash_tutorials/dravidian_languages_question_answering/.meta.yml
 create mode 100644 flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py

diff --git a/flash_tutorials/dravidian_languages_question_answering/.meta.yml b/flash_tutorials/dravidian_languages_question_answering/.meta.yml
new file mode 100644
index 000000000..52da0099a
--- /dev/null
+++ b/flash_tutorials/dravidian_languages_question_answering/.meta.yml
@@ -0,0 +1,18 @@
+title: Question Answering for Dravidian Languages
+author: Karthik Rangasai Sivaraman (karthikrangasai@gmail.com)
+created: 2021-12-15
+updated: 2021-12-15
+license: CC BY-SA
+build: 3
+tags:
+  - Text
+  - Question Answering
+description: |
+  This tutorial covers using Lightning Flash and it's integration with Hugging Face Transformers to train a Transformer
+  model (XLM-RoBERTa) on SQuAD type dataset for the dravidian languages. We show how easy it is to use a Hugging Face
+  Transformers model with the all goodness provided by PyTorch Lightning using Flash.
+requirements:
+  - lightning-flash[text]>=0.5.2
+accelerator:
+  - GPU
+  - CPU
diff --git a/flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py b/flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py
new file mode 100644
index 000000000..8d32edff9
--- /dev/null
+++ b/flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py
@@ -0,0 +1,135 @@
+# %% [markdown]
+# In this tutorial we'll look at using [Lightning Flash](https://github.com/PyTorchLightning/lightning-flash) and it's
+# integration with [Hugging Face Transformers](https://github.com/huggingface/transformers) for question answering of
+# dravidian language based corpus using [the XLM-RoBERTa model](https://arxiv.org/pdf/1911.02116.pdf).
+
+# %%
+
+import os
+
+import pandas as pd
+import torch
+from flash import Trainer
+from flash.core.data.utils import download_data
+from flash.text import QuestionAnsweringData, QuestionAnsweringTask
+
+DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
+
+# %% [markdown]
+# ## Loading the data
+#
+# We'll use the Chaii question answering in Hindi and Tamil dataset from Kaggle:
+# https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering
+#
+# First, download the data:
+
+# %%
+download_data("https://pl-flash-data.s3.amazonaws.com/kaggle_chaii.zip", DATASET_PATH)
+
+# %% [markdown]
+# ## Loading the Data and generating splits
+#
+# To load the data, we start by creating a train, validation, and test splits:
+
+# %%
+INPUT_DATA_PATH = os.path.join(DATASET_PATH, "train.csv")
+TRAIN_DATA_PATH = os.path.join(DATASET_PATH, "_train.csv")
+VAL_DATA_PATH = os.path.join(DATASET_PATH, "_val.csv")
+PREDICT_DATA_PATH = os.path.join(DATASET_PATH, "test.csv")
+
+df = pd.read_csv(INPUT_DATA_PATH)
+fraction = 0.9
+
+tamil_examples = df[df["language"] == "tamil"]
+train_split_tamil = tamil_examples.sample(frac=fraction, random_state=200)
+val_split_tamil = tamil_examples.drop(train_split_tamil.index)
+
+hindi_examples = df[df["language"] == "hindi"]
+train_split_hindi = hindi_examples.sample(frac=fraction, random_state=200)
+val_split_hindi = hindi_examples.drop(train_split_hindi.index)
+
+train_split = pd.concat([train_split_tamil, train_split_hindi]).reset_index(drop=True)
+val_split = pd.concat([val_split_tamil, val_split_hindi]).reset_index(drop=True)
+
+train_split.to_csv(TRAIN_DATA_PATH, index=False)
+val_split.to_csv(VAL_DATA_PATH, index=False)
+
+# %% [markdown]
+# ## Creating the Flash DataModule
+#
+# Now, we can create a `QuestionAnsweringData`.
+# Flash supports a wide variety of input formats, each having its method with the naming format as `from_xxxx`.
+# Our datasets are available as CSV files, and it is the same format in which we saved the splits. Hence, we use the
+# `from_csv` method to generate the DataModule. The simplest form of the API only requires the data files, the Hugging
+# Face backbone of your choice, and batch size. Flash takes care of preprocessing the data, i.e., tokenizing using the
+# Hugging Face tokenizer and creating the Datasets.
+#
+# Here's the full preprocessing function:
+
+# %%
+
+datamodule = QuestionAnsweringData.from_csv(
+    train_file=TRAIN_DATA_PATH,
+    val_file=VAL_DATA_PATH,
+    batch_size=4,
+    backbone="xlm-roberta-base",
+)
+
+# %% [markdown]
+# ## Creating the Flash Task
+#
+# The API for building the NLP Task is also simple. For all Flash models, the naming pattern follows `XYZTask`, and
+# thus we will be using the `QuestionAnsweringTask` in this case. The power of Flash's simplicity comes into play here
+# as we pass the required backbone, Optimizer of choice, and the preferable learning rate for the model. Then Flash
+# takes care of the rest, i.e., downloading the model, instantiating the model, configuring the Optimizer, and even
+# logging the losses.
+
+# %%
+model = QuestionAnsweringTask(
+    backbone="xlm-roberta-base",
+    learning_rate=1e-5,
+    optimizer="adamw",
+)
+
+# %% [markdown]
+# ## Setting up the Trainer and Fine-Tuning the model
+#
+# Flash's Trainer is inherited from Lightning's Trainer and provides an additional method `finetune` that takes in an
+# extra argument `strategy` that lets us specify a specific strategy for fine-tuning the backbone. We will be using
+# the `freeze_unfreeze` strategy to fine-tune the model, which freezes the gradients of the backbone transformer
+# containing the pre-trained weights and trains just the new model head for a certain number of epochs and unfreezes
+# the backbone after which the complete model (backbone + head) is trained for the remaining epochs.
+#
+# Check out the documentation to learn about the other strategies provided by Flash, and feel free to reach out and
+# contribute any new fine-tuning methods to the project.
+
+# %%
+trainer = Trainer(
+    max_epochs=5,
+    accumulate_grad_batches=2,
+    gpus=int(torch.cuda.is_available()),
+)
+
+trainer.finetune(model, datamodule, strategy=("freeze_unfreeze", 2))
+
+# %% [markdown]
+# ## Making predictions
+#
+# We convert the prediction file provided to us from a pandas DataFrame to a python dictionary object and pass it to
+# the model as predictions.
+
+# %%
+predict_data = pd.read_csv(PREDICT_DATA_PATH)
+predict_data = predict_data[predict_data.columns[:3]].to_dict(orient="list")
+
+predictions = model.predict(predict_data)
+print(predictions)
+
+# %% [markdown]
+# ## Closing thoughts and next steps!
+#
+# This tutorial has shown how Flash and Hugging Face Transformers can be used to train a state-of-the-art language
+# model (such as XLM-RoBERTa).
+#
+# If you want to be a bit more adventurous, you could look at
+# [some of the other problems that can solved with Lightning Flash](https://lightning-flash.readthedocs.io/en/stable/?badge=stable).

From c95130b9fd4a117b9a897888350e3ed20ad3bf31 Mon Sep 17 00:00:00 2001
From: Karthik Rangasai <karthikrangasai@gmail.com>
Date: Mon, 3 Jan 2022 11:45:27 +0530
Subject: [PATCH 2/2] Update dataset link to get from kaggle in meta file.

---
 .../.meta.yml                                 |  3 +++
 .../multilingual_question_answering.py        | 24 +++++--------------
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/flash_tutorials/dravidian_languages_question_answering/.meta.yml b/flash_tutorials/dravidian_languages_question_answering/.meta.yml
index 52da0099a..b2dbe1dfa 100644
--- a/flash_tutorials/dravidian_languages_question_answering/.meta.yml
+++ b/flash_tutorials/dravidian_languages_question_answering/.meta.yml
@@ -16,3 +16,6 @@ requirements:
 accelerator:
   - GPU
   - CPU
+datasets:
+  kaggle:
+    - chaii-hindi-and-tamil-question-answering
diff --git a/flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py b/flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py
index 8d32edff9..ca8f0b0a9 100644
--- a/flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py
+++ b/flash_tutorials/dravidian_languages_question_answering/multilingual_question_answering.py
@@ -10,32 +10,20 @@
 import pandas as pd
 import torch
 from flash import Trainer
-from flash.core.data.utils import download_data
 from flash.text import QuestionAnsweringData, QuestionAnsweringTask
 
-DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
-
-# %% [markdown]
-# ## Loading the data
-#
-# We'll use the Chaii question answering in Hindi and Tamil dataset from Kaggle:
-# https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering
-#
-# First, download the data:
-
-# %%
-download_data("https://pl-flash-data.s3.amazonaws.com/kaggle_chaii.zip", DATASET_PATH)
-
 # %% [markdown]
 # ## Loading the Data and generating splits
 #
 # To load the data, we start by creating a train, validation, and test splits:
 
 # %%
-INPUT_DATA_PATH = os.path.join(DATASET_PATH, "train.csv")
-TRAIN_DATA_PATH = os.path.join(DATASET_PATH, "_train.csv")
-VAL_DATA_PATH = os.path.join(DATASET_PATH, "_val.csv")
-PREDICT_DATA_PATH = os.path.join(DATASET_PATH, "test.csv")
+DATASET_PATH = os.environ.get("PATH_DATASETS", "_datasets")
+CHAII_DATASET_PATH = os.path.join(DATASET_PATH, "chaii-hindi-and-tamil-question-answering")
+INPUT_DATA_PATH = os.path.join(CHAII_DATASET_PATH, "train.csv")
+TRAIN_DATA_PATH = os.path.join(CHAII_DATASET_PATH, "_train.csv")
+VAL_DATA_PATH = os.path.join(CHAII_DATASET_PATH, "_val.csv")
+PREDICT_DATA_PATH = os.path.join(CHAII_DATASET_PATH, "test.csv")
 
 df = pd.read_csv(INPUT_DATA_PATH)
 fraction = 0.9