<a href="https://colab.research.google.com/github/Kira1108/huggingface-examples/blob/main/CustomDatasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import clear_output

!pip install transformers datasets

clear_output()

In [None]:
# Download the airline tweets CSV; -nc (no-clobber) skips the download when the
# file already exists locally, so re-running this cell is cheap.
!wget -nc https://lazyprogrammer.me/course_files/AirlineTweets.csv

--2023-01-09 14:23:25--  https://lazyprogrammer.me/course_files/AirlineTweets.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166, 2606:4700:3030::ac43:d5a6, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3421431 (3.3M) [text/csv]
Saving to: ‘AirlineTweets.csv’


2023-01-09 14:23:26 (5.74 MB/s) - ‘AirlineTweets.csv’ saved [3421431/3421431]



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import torch
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Things we need to do:
# -----------------------------------------------------------------------------
# 1. find 2 columns, 1: text column 2: label column
# 2. encode label column(keep encode map in memory)
# 3. rename columns ['sentence', 'label']
# 4. drop any other columns
# 5. dump encoding map to json
# 6. dump text column name, label column name, 'sentence','label' to json
# 7. write a back-transform function (map from integer category back to the original label)

In [None]:
class CategoricalEncoder:
    """Encode a categorical label column as integer ids and decode them back.

    The encoder also remembers the column names involved, so the same
    preprocessing can be rebuilt later from a JSON file.

    Attributes:
        target_map: dict mapping each original label to its integer id.
        reverse_map: inverse of ``target_map`` (integer id -> original label).
        col: name of the raw label column in the input dataframe.
        target_name: name given to the encoded label column in the output.
        text_col: name of the text column in the input dataframe.
    """

    def __init__(self, target_map, col, target_name, text_col='text'):
        self.target_map = target_map
        self.col = col
        self.target_name = target_name
        self.text_col = text_col
        self.reverse_map = {v: k for k, v in self.target_map.items()}

    @classmethod
    def from_df(cls, df, col, target_name="label", text_col='text'):
        """Build an encoder by numbering the distinct values of ``df[col]``.

        Ids are assigned in the order returned by ``df[col].unique()``.
        """
        target_map = {c: i for i, c in enumerate(df[col].unique())}
        return cls(target_map, col, target_name, text_col)

    @classmethod
    def from_json(cls, path="categorical_encoder.json"):
        """Rebuild an encoder from a JSON file written by :meth:`dump`."""
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(path, 'r') as f:
            return cls(**json.load(f))

    def encode(self, df):
        """Return a new frame with columns ``'sentence'`` and ``self.target_name``.

        The input dataframe is left untouched. Label values that are not keys
        of ``target_map`` are passed through unchanged by ``Series.replace``.

        Args:
            df: dataframe containing at least ``self.col`` and ``self.text_col``.

        Returns:
            A new dataframe holding the text column renamed to ``'sentence'``
            and the integer-encoded labels under ``self.target_name``.
        """
        # step 1: translate the raw labels through the target map
        codes = df[self.col].replace(self.target_map)

        # step 2: keep only the text column, under its fixed output name.
        # Selecting + renaming yields a fresh frame, so `df` is never mutated.
        encoded = df[[self.text_col]].rename(columns={self.text_col: "sentence"})

        # step 3: attach the labels under the configured name (not a hard-coded
        # 'label', which broke any encoder built with a different target_name)
        encoded[self.target_name] = codes
        return encoded

    def decode(self, arr):
        """Map an iterable of integer ids back to labels (``None`` if unknown)."""
        return [self.reverse_map.get(c, None) for c in arr]

    def dict(self):
        """Return the constructor arguments as a plain dict."""
        return {
            "col": self.col,
            "target_name": self.target_name,
            "target_map": self.target_map,
            "text_col": self.text_col
        }

    def json(self):
        """Serialize :meth:`dict` to a JSON string.

        Note: JSON object keys are always strings, so non-string labels in
        ``target_map`` come back as strings after a dump/from_json round trip.
        """
        return json.dumps(self.dict())

    def dump(self, path='categorical_encoder.json'):
        """Write the JSON representation to ``path`` and report where it went."""
        with open(path, "w") as f:
            f.write(self.json())
        print("Dump encoder into", path)

In [None]:
# Load the raw tweets and keep just the sentiment label and the tweet text.
df = pd.read_csv("AirlineTweets.csv")[['airline_sentiment', 'text']].copy()

# Learn the label -> integer mapping from the sentiment column.
encoder = CategoricalEncoder.from_df(df, col='airline_sentiment', target_name='label', text_col='text')

# Apply the mapping to get the ['sentence', 'label'] frame.
encoded_df = encoder.encode(df)

# Persist the encoder settings to the default JSON path.
encoder.dump()

Dump encoder into categorical_encoder.json


In [None]:
# Print the JSON file written by encoder.dump() to inspect what was saved.
!cat categorical_encoder.json

{"col": "airline_sentiment", "target_name": "label", "target_map": {"neutral": 0, "positive": 1, "negative": 2}, "text_col": "text"}

In [None]:
# Rebuild an encoder from the saved JSON file.
newencoder = CategoricalEncoder.from_json("categorical_encoder.json")

In [None]:
# Map integer ids back to the original labels; 5 is not a known id, so it decodes to None.
encoder.decode([0,2,2,1,2,5])

['neutral', 'negative', 'negative', 'positive', 'negative', None]