In [1]:
! pip install datasets transformers accelerate peft

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [

## Datasets

* Text classification - dair.ai/emotion
* Question and Answer - mlqa.hi.en
* Paraphrasing - paws
* Code to Text Translation - code_x_glue_ct_code_to_text
* Summarization - samsum
* Reasoning - gsm8k

## Text Classification Dataset

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd

In [3]:
text_classification_data = load_dataset("dair-ai/emotion", split="train")
text_classification_data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [4]:
text_classification_df = text_classification_data.to_pandas()
text_classification_df

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [5]:
#mapping here manually from the map given on the Huggingface website

map_emotions = {
    0 : "sadness",
    1 : "joy",
    2 : "love",
    3 : "anger",
    4 : "fear",
    5 : "surprise"
}

text_classification_df["target"]=text_classification_df["label"].apply(lambda x:map_emotions[x])

In [6]:
text_classification_df["source"]=text_classification_df["text"].apply(lambda x:"text-classification-emotion : " + x)
text_classification_df=text_classification_df[["source","target"]]

text_classification_df

Unnamed: 0,source,target
0,text-classification-emotion : i didnt feel hum...,sadness
1,text-classification-emotion : i can go from fe...,sadness
2,text-classification-emotion : im grabbing a mi...,anger
3,text-classification-emotion : i am ever feelin...,love
4,text-classification-emotion : i am feeling gro...,anger
...,...,...
15995,text-classification-emotion : i just had a ver...,sadness
15996,text-classification-emotion : i am now turning...,sadness
15997,text-classification-emotion : i feel strong an...,joy
15998,text-classification-emotion : i feel like this...,anger


In [7]:
text_classification_df = text_classification_df.sample(5000)
#reducing the size a bit, later we can increase if we like

## Question and Answering Dataset

In [8]:
data = load_dataset("mlqa","mlqa.hi.en",split="test")
data

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/34.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4918 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/507 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'question', 'answers', 'id'],
    num_rows: 4918
})

In [9]:
qa_df = data.to_pandas()
qa_df

Unnamed: 0,context,question,answers,id
0,"उसी ""एरिया XX "" नामकरण प्रणाली का प्रयोग नेवाद...",Where does the Groom Lake Road head relative t...,"{'answer_start': [378], 'text': ['उत्तर पूर्व']}",eeb8dbd25efe5221dc6723ddee95daa07d2c8478
1,"में खानों की ओर जाती थीं, लेकिन उनके बंद होने ...",What type of roads lead to the ranches?,"{'answer_start': [308], 'text': ['डर्ट-रोड']}",ba7865d50777f2b90ba88fcb070a672d042b6b69
2,विश्व युद्ध II के दौरान ग्रूम झील का प्रयोग बम...,what made the testing strip for the aircraft?,"{'answer_start': [237], 'text': ['झील की सतह']}",2079cf7ce47961738e4bd0d527d0b1058210f869
3,लॉकहीड ने इस स्थल पर एक अस्थायी अड्डे का निर्म...,who was along for the initial U-2 delivery?,"{'answer_start': [330], 'text': ['लॉकहीड विशेष...",d5377da63e6f64dae5e269290a6334c2a912cb3f
4,"अधिकांश नेल्लिस सीमा के विपरीत, झील के आस-पास ...",What kind of action do military pilots face if...,"{'answer_start': [366], 'text': ['अनुशासनात्मक']}",03df1f92420416844575cfa201ae840319c40650
...,...,...,...,...
4913,"सिडनी शेल्डन (11 फरवरी,1917 - 30 जनवरी 2007) ए...",What was the nationality of Sidney Sheldon?,"{'answer_start': [48], 'text': ['अमेरिकी']}",0c0d1225c0847fafbb5fd691fe4f57a61e3ff515
4914,राज्यों को काउंटियों या काउंटी-समकक्ष में विभा...,"Who, in the past, has had the main jurisdictio...","{'answer_start': [590], 'text': ['राज्य']}",a77d373b680265ba082a4c5d83605756a9984fa6
4915,"89 वें अकादमी पुरस्कार (ऑस्कर 2017) समारोह, मो...",What day did the Awards take place?,"{'answer_start': [141], 'text': ['26 फरवरी, 20...",6bbd0d625f1c8077d2b32e2d23baafa3f54ebb9b
4916,डीज़ल उत्सर्जन तरल (अंग्रेजी:Diesel exhaust fl...,What does DEF stand for?,"{'answer_start': [19], 'text': ['(अंग्रेजी:Die...",63a97bd8e73ad871276245f311d227e717c4c94e


In [12]:
qa_df["source"] = qa_df.apply(lambda x: f"context-question-answering : context: {x['context']} question: {x['question']}", axis=1)
qa_df["target"] = qa_df["answers"].apply(lambda x: x["text"][0])
qa_df = qa_df[["source","target"]]

qa_df

Unnamed: 0,source,target
0,"context-question-answering : context: उसी ""एरि...",उत्तर पूर्व
1,context-question-answering : context: में खानो...,डर्ट-रोड
2,context-question-answering : context: विश्व यु...,झील की सतह
3,context-question-answering : context: लॉकहीड न...,लॉकहीड विशेषज्ञों
4,context-question-answering : context: अधिकांश ...,अनुशासनात्मक
...,...,...
4913,context-question-answering : context: सिडनी शे...,अमेरिकी
4914,context-question-answering : context: राज्यों ...,राज्य
4915,context-question-answering : context: 89 वें अ...,"26 फरवरी, 2017"
4916,context-question-answering : context: डीज़ल उत...,(अंग्रेजी:Diesel exhaust fluid


## Paraphrasing Dataset

In [13]:
data = load_dataset("paws","labeled_final",split="train")
trans_df = data.to_pandas()
trans_df = trans_df.sample(5000)

trans_df

Downloading readme:   0%|          | 0.00/9.79k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Unnamed: 0,id,sentence1,sentence2,label
35376,35377,"Until his death in 1996 , Amos Tversky was mar...",Amos Tversky was married to fellow prominent p...,1
7955,7956,The Frank Herzberg Trio is a contemporary Braz...,The Frank Herzberg Trio is a contemporary Braz...,0
48183,48184,"Besides Lena Olin , several other actresses po...","Besides Irina , several other actresses Lena O...",0
25222,25223,"For two years , Kokomo Jr. was used to perform...","For two years , Kokomo Jr. was used to present...",0
30753,30754,The 1988 -- 89 National Basketball Association...,The 1988 season -- 89 National Basketball Asso...,1
...,...,...,...,...
40315,40316,See ETA ( parallel group ) for more extensive ...,For a more detailed discussion of ETA ( pm ) a...,1
21682,21683,Regressive assimilations are conditioned only ...,Regressive assimilations are caused only by ph...,0
10838,10839,Multilayered dinosaurs - eggs are known in ord...,Multilayered dinosaurs - eggs are known in the...,1
8725,8726,"It was born on 18 April 1976 in Usera , Spain ...","She was born in Usera , Madrid ( Spain ) on Ap...",1


In [14]:
#Since we only want data that has a heading of 1 and not 0, we will select ones alone

trans_df = trans_df[trans_df["label"]==1]
trans_df = trans_df[["sentence1","sentence2"]]
trans_df

Unnamed: 0,sentence1,sentence2
35376,"Until his death in 1996 , Amos Tversky was mar...",Amos Tversky was married to fellow prominent p...
30753,The 1988 -- 89 National Basketball Association...,The 1988 season -- 89 National Basketball Asso...
32488,"The other stations include Peelamedu , Singana...","Other stations include Peelamedu , Singanallur..."
22225,Many agencies of the central government are lo...,Many central government agencies are located i...
14369,She also had a side role in the film `` Bob Ro...,She also had a supporting role in the 1992 fil...
...,...,...
31517,The Mornington House was the Georgian residenc...,Mornington House was the Dublin social season ...
40315,See ETA ( parallel group ) for more extensive ...,For a more detailed discussion of ETA ( pm ) a...
10838,Multilayered dinosaurs - eggs are known in ord...,Multilayered dinosaurs - eggs are known in the...
8725,"It was born on 18 April 1976 in Usera , Spain ...","She was born in Usera , Madrid ( Spain ) on Ap..."


In [15]:
trans_df["source"] = trans_df["sentence1"].apply(lambda x: "paraphrase : "+x)
trans_df["target"] = trans_df["sentence2"]
trans_df = trans_df[["source","target"]]

trans_df

Unnamed: 0,source,target
35376,"paraphrase : Until his death in 1996 , Amos Tv...",Amos Tversky was married to fellow prominent p...
30753,paraphrase : The 1988 -- 89 National Basketbal...,The 1988 season -- 89 National Basketball Asso...
32488,paraphrase : The other stations include Peelam...,"Other stations include Peelamedu , Singanallur..."
22225,paraphrase : Many agencies of the central gove...,Many central government agencies are located i...
14369,paraphrase : She also had a side role in the f...,She also had a supporting role in the 1992 fil...
...,...,...
31517,paraphrase : The Mornington House was the Geor...,Mornington House was the Dublin social season ...
40315,paraphrase : See ETA ( parallel group ) for mo...,For a more detailed discussion of ETA ( pm ) a...
10838,paraphrase : Multilayered dinosaurs - eggs are...,Multilayered dinosaurs - eggs are known in the...
8725,paraphrase : It was born on 18 April 1976 in U...,"She was born in Usera , Madrid ( Spain ) on Ap..."


## Code to Text Translation Dataset

In [16]:
data = load_dataset("code_x_glue_ct_code_to_text","python",split="train")
translation_df = data.to_pandas()

Downloading readme:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/144M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/251820 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13914 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14918 [00:00<?, ? examples/s]

In [17]:
translation_df=translation_df.sample(5000)

translation_df

Unnamed: 0,id,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url
46353,46353,royi1000/py-libhdate,hdate/zmanim.py,Zmanim.get_utc_sun_time_full,"def get_utc_sun_time_full(self):\n """"""R...",python,"def get_utc_sun_time_full(self):\n """"""R...","[def, get_utc_sun_time_full, (, self, ), :, # ...",Return a list of Jewish times for the given lo...,"[Return, a, list, of, Jewish, times, for, the,...",12af759fb69f1d6403abed3762beaf5ace16a34b,https://github.com/royi1000/py-libhdate/blob/1...
76940,76940,coursera-dl/coursera-dl,coursera/utils.py,clean_filename,"def clean_filename(s, minimal_change=False):\n...",python,"def clean_filename(s, minimal_change=False):\n...","[def, clean_filename, (, s, ,, minimal_change,...",Sanitize a string to be used as a filename.\n\...,"[Sanitize, a, string, to, be, used, as, a, fil...",9b434bcf3c4011bf3181429fe674633ae5fb7d4d,https://github.com/coursera-dl/coursera-dl/blo...
20163,20163,robotools/fontParts,Lib/fontParts/base/normalizers.py,normalizeFileFormatVersion,"def normalizeFileFormatVersion(value):\n """"...",python,"def normalizeFileFormatVersion(value):\n """"...","[def, normalizeFileFormatVersion, (, value, ),...",Normalizes a font's file format version.\n\n ...,"[Normalizes, a, font, s, file, format, version...",d2ff106fe95f9d566161d936a645157626568712,https://github.com/robotools/fontParts/blob/d2...
50275,50275,getfleety/coralillo,coralillo/utils.py,snake_case,def snake_case(string):\n ''' Takes a strin...,python,def snake_case(string):\n ''' Takes a strin...,"[def, snake_case, (, string, ), :, s1, =, re, ...",Takes a string that represents for example a c...,"[Takes, a, string, that, represents, for, exam...",9cac101738a0fa7c1106f129604c00ef703370e1,https://github.com/getfleety/coralillo/blob/9c...
244406,244406,jssimporter/python-jss,jss/distribution_point.py,DistributionServer.delete_with_casper_admin_save,"def delete_with_casper_admin_save(self, pkg):\...",python,"def delete_with_casper_admin_save(self, pkg):\...","[def, delete_with_casper_admin_save, (, self, ...",Delete a pkg from the distribution server.\n\n...,"[Delete, a, pkg, from, the, distribution, serv...",b95185d74e0c0531b0b563f280d4129e21d5fe5d,https://github.com/jssimporter/python-jss/blob...
...,...,...,...,...,...,...,...,...,...,...,...,...
149659,149659,broadinstitute/fiss,firecloud/api.py,delete_repository_method,"def delete_repository_method(namespace, name, ...",python,"def delete_repository_method(namespace, name, ...","[def, delete_repository_method, (, namespace, ...",Redacts a method and all of its associated con...,"[Redacts, a, method, and, all, of, its, associ...",dddf91547479506dbbafb69ec84d44dcc4a94ab4,https://github.com/broadinstitute/fiss/blob/dd...
89195,89195,websocket-client/websocket-client,websocket/_core.py,WebSocket.send_close,"def send_close(self, status=STATUS_NORMAL, rea...",python,"def send_close(self, status=STATUS_NORMAL, rea...","[def, send_close, (, self, ,, status, =, STATU...",send close data to the server.\n\n stat...,"[send, close, data, to, the, server, .]",3c25814664fef5b78716ed8841123ed1c0d17824,https://github.com/websocket-client/websocket-...
14394,14394,quintusdias/glymur,glymur/lib/openjp2.py,set_decode_area,"def set_decode_area(codec, image, start_x=0, s...",python,"def set_decode_area(codec, image, start_x=0, s...","[def, set_decode_area, (, codec, ,, image, ,, ...",Wraps openjp2 library function opj_set_decode ...,"[Wraps, openjp2, library, function, opj_set_de...",8b8fb091130fff00f1028dc82219e69e3f9baf6d,https://github.com/quintusdias/glymur/blob/8b8...
38806,38806,twisted/mantissa,xmantissa/scrolltable.py,InequalityModel.rowsAfterValue,"def rowsAfterValue(self, value, count):\n ...",python,"def rowsAfterValue(self, value, count):\n ...","[def, rowsAfterValue, (, self, ,, value, ,, co...",Retrieve some rows at or after a given sort-co...,"[Retrieve, some, rows, at, or, after, a, given...",53e5502aba23ce99be78b27f923a276593033fe8,https://github.com/twisted/mantissa/blob/53e55...


In [18]:
translation_df.iloc[1]["docstring"]

"Sanitize a string to be used as a filename.\n\n    If minimal_change is set to true, then we only strip the bare minimum of\n    characters that are problematic for filesystems (namely, ':', '/' and\n    '\\x00', '\\n')."

In [19]:
translation_df["source"] = translation_df["original_string"].apply(lambda x: "explain what this code does: \n" + x)
translation_df["target"] = translation_df["docstring_tokens"].apply(lambda tokens: " ".join(tokens))
translation_df = translation_df[["source", "target"]]

translation_df

Unnamed: 0,source,target
46353,explain what this code does: \ndef get_utc_sun...,Return a list of Jewish times for the given lo...
76940,explain what this code does: \ndef clean_filen...,Sanitize a string to be used as a filename .
20163,explain what this code does: \ndef normalizeFi...,Normalizes a font s file format version .
50275,explain what this code does: \ndef snake_case(...,Takes a string that represents for example a c...
244406,explain what this code does: \ndef delete_with...,Delete a pkg from the distribution server .
...,...,...
149659,explain what this code does: \ndef delete_repo...,Redacts a method and all of its associated con...
89195,explain what this code does: \ndef send_close(...,send close data to the server .
14394,explain what this code does: \ndef set_decode_...,Wraps openjp2 library function opj_set_decode ...
38806,explain what this code does: \ndef rowsAfterVa...,Retrieve some rows at or after a given sort - ...


In [20]:
translation_df.iloc[5]["target"]

'Returns True if c is an uppercase letter a lowercase letter a digit or an underscore otherwise False .'

In [21]:
translation_df.iloc[15]["target"]

'Evaluate for validation data .'

## Summarization Dataset

In [22]:
! pip install py7zr

Collecting py7zr
  Downloading py7zr-0.21.0-py3-none-any.whl (67 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.8/413.8 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyp

In [23]:
data = load_dataset("samsum", split="test")
summ_df = data.to_pandas()

summ_df.size

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

2457

In [24]:
summ_df["source"] = summ_df["dialogue"].apply(lambda x: "conversation-summarization: " + x)
summ_df["target"] = summ_df["summary"]
summ_df = summ_df[["source", "target"]]

summ_df

Unnamed: 0,source,target
0,"conversation-summarization: Hannah: Hey, do yo...",Hannah needs Betty's number but Amanda doesn't...
1,conversation-summarization: Eric: MACHINE!\r\n...,Eric and Rob are going to watch a stand-up on ...
2,"conversation-summarization: Lenny: Babe, can y...",Lenny can't decide which trousers to buy. Bob ...
3,"conversation-summarization: Will: hey babe, wh...",Emma will be home soon and she will let Will k...
4,"conversation-summarization: Ollie: Hi , are yo...",Jane is in Warsaw. Ollie and Jane has a party....
...,...,...
814,conversation-summarization: Alex: Were you abl...,Benjamin didn't come to see a basketball game ...
815,conversation-summarization: Jamilla: remember ...,The audition starts at 7.30 P.M. in Antena 3.
816,conversation-summarization: Marta: <file_gif>\...,"Marta sent a file accidentally,"
817,conversation-summarization: Cora: Have you hea...,There was a meet-and-greet with James Charles ...


## Reasoning Dataset

In [25]:
data = load_dataset("gsm8k","main",split="train")
reasoning_df = data.to_pandas()
reasoning_df = reasoning_df.sample(5000)

reasoning_df

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Unnamed: 0,question,answer
645,"In the city, there is a block of flats. This b...",Half of the floors have different numbers of a...
6953,Haleigh decides that instead of throwing away ...,She gets 2 ounces of wax from a 20 ounce candl...
4237,Silvio was running a race that had 4 parts. Th...,15.5 + (2 * 21.5) = <<15.5+(2*21.5)=58.5>>58.5...
2267,A row of houses all share a community space wh...,"From the children, there are a total of 11 chi..."
1725,Mike has to get an x-ray and an MRI. The x-ra...,The MRI cost 250*3=$<<250*3=750>>750\nSo betwe...
...,...,...
611,Lard decides to share a pizza with his friend ...,"First, we need to determine how many slices th..."
3922,A car manufacturing company that produces 100 ...,Let Y be the number of cars needed to be added...
7438,"Logan makes $65,000 a year. He spends $20,000 ...",Logan spends 20000+5000+8000 = <<20000+5000+80...
6732,Nancy's ex owes her child support. He's suppos...,First find how much money the ex made during t...


In [26]:
reasoning_df.iloc[0]["answer"]

'Half of the floors have different numbers of apartments. Half the number of floors is 12 floors / 2 = <<12/2=6>>6 floors.\nOne half of the block consists of 6 floors * 6 apartments/floor = <<6*6=36>>36 apartments.\nAnd the other half consists of 6 floors * 5 apartments/floor = <<6*5=30>>30 apartments.\nThere are 30 + 36 = <<30+36=66>>66 apartments on all floors in the block.\nAll the apartments can therefore accommodate 66 apartments * 4 residents/apartment = <<66*4=264>>264 residents.\n#### 264'

In [27]:
reasoning_df["source"] = reasoning_df["question"].apply(lambda x: "Given the following scenario, provide a reasoned explanation or conclusion: " + x)
reasoning_df["target"] = reasoning_df["answer"].apply(lambda x: x.replace('<<', '(').replace('>>', ')').replace('\n', '').replace('#', ''))
reasoning_df = reasoning_df[["source","target"]]

reasoning_df

Unnamed: 0,source,target
645,"Given the following scenario, provide a reason...",Half of the floors have different numbers of a...
6953,"Given the following scenario, provide a reason...",She gets 2 ounces of wax from a 20 ounce candl...
4237,"Given the following scenario, provide a reason...",15.5 + (2 * 21.5) = (15.5+(2*21.5)=58.5)58.5 k...
2267,"Given the following scenario, provide a reason...","From the children, there are a total of 11 chi..."
1725,"Given the following scenario, provide a reason...",The MRI cost 250*3=$(250*3=750)750So between t...
...,...,...
611,"Given the following scenario, provide a reason...","First, we need to determine how many slices th..."
3922,"Given the following scenario, provide a reason...",Let Y be the number of cars needed to be added...
7438,"Given the following scenario, provide a reason...",Logan spends 20000+5000+8000 = (20000+5000+800...
6732,"Given the following scenario, provide a reason...",First find how much money the ex made during t...


In [28]:
reasoning_df.iloc[0]["target"]

'Half of the floors have different numbers of apartments. Half the number of floors is 12 floors / 2 = (12/2=6)6 floors.One half of the block consists of 6 floors * 6 apartments/floor = (6*6=36)36 apartments.And the other half consists of 6 floors * 5 apartments/floor = (6*5=30)30 apartments.There are 30 + 36 = (30+36=66)66 apartments on all floors in the block.All the apartments can therefore accommodate 66 apartments * 4 residents/apartment = (66*4=264)264 residents. 264'

In [29]:
reasoning_df.iloc[50]["target"]

'After the fire outbreak, the price of RAM got increased by 30/100 * 50 = $(30/100*50=15)15.So before stabilization the price was at 50 + 15 = $(50+15=65)65.After the stabilization, the price fell by 20% from $65, so it fell by 20/100 * 65 = $(20/100*65=13)13.That means the RAM is currently at 65 - 13 = $(65-13=52)52. 52'

## Final Data Preparation

In [30]:
list_of_dfs = [
    text_classification_df,
    qa_df,
    trans_df, #paraphrasing
    translation_df, #translation of python code
    summ_df,
    reasoning_df]

final_df = pd.concat(list_of_dfs, axis=0)
final_df

Unnamed: 0,source,target
9994,text-classification-emotion : i feel so reject...,sadness
1654,text-classification-emotion : i feel im pretty...,joy
3233,text-classification-emotion : ive collected as...,joy
8624,text-classification-emotion : i now regret bec...,joy
4973,text-classification-emotion : i feel that i no...,joy
...,...,...
611,"Given the following scenario, provide a reason...","First, we need to determine how many slices th..."
3922,"Given the following scenario, provide a reason...",Let Y be the number of cars needed to be added...
7438,"Given the following scenario, provide a reason...",Logan spends 20000+5000+8000 = (20000+5000+800...
6732,"Given the following scenario, provide a reason...",First find how much money the ex made during t...


## Finetuning

In [31]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split

In [32]:
train_df, test_df = train_test_split(final_df, test_size=0.1, random_state=0, shuffle=True)
train_data = Dataset.from_pandas(train_df)
test_data = Dataset.from_pandas(test_df)

train_data

Dataset({
    features: ['source', 'target', '__index_level_0__'],
    num_rows: 20647
})

## Base Pretrained Model

In [33]:
model_id="google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [34]:
def preprocess_function(sample,padding="max_length"):
    model_inputs = tokenizer(sample["source"], max_length=256, padding=padding, truncation=True)
    labels = tokenizer(sample["target"], max_length=128, padding=padding, truncation=True)
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [35]:
train_tokenized_dataset = train_data.map(preprocess_function, batched=True, remove_columns=train_data.column_names)
test_tokenized_dataset = test_data.map(preprocess_function, batched=True, remove_columns=test_data.column_names)

print(f"Keys of tokenized dataset: {list(train_tokenized_dataset.features)}")

Map:   0%|          | 0/20647 [00:00<?, ? examples/s]

Map:   0%|          | 0/2295 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## LoRA Configuration

In [36]:
lora_config = LoraConfig(
 r=8,
 lora_alpha=16,
 lora_dropout=0.1,
 bias="none",
 task_type="SEQ_2_SEQ_LM",
 target_modules=["q", "v"]
)

In [37]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [38]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 785,509,376 || trainable%: 0.3004


In [39]:
model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
         

## Login  

In [45]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Training Arguements

In [41]:
output_dir="flan-t5-large-multipurpose"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=6,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    report_to="tensorboard",
    push_to_hub = True
)

In [42]:
model.config.use_cache = False

## Data Collator

In [43]:
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

## Trainer

In [46]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_dataset
)

In [47]:
trainer.train()

print("Training done successfully.")

peft_save_model_id="flan-t5-large-multipurpose"
trainer.model.save_pretrained(peft_save_model_id, push_to_hub=True)
tokenizer.save_pretrained(peft_save_model_id, push_to_hub=True)
trainer.model.base_model.save_pretrained(peft_save_model_id, push_to_hub=True)

print("Model pushed to hub")

Step,Training Loss
3442,0.8634




spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.14G [00:00<?, ?B/s]