In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno

import matplotlib.pyplot as plt
import plotly_express as px
plt.style.use("seaborn-whitegrid")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file shipped with the attached Kaggle datasets
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)
        
import time
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures
import re
import string
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MaxAbsScaler

# Word vectors documentation
# https://medium.com/analytics-vidhya/theory-behind-word-embeddings-in-word2vec-858b9350870b
# https://medium.com/analytics-vidhya/glove-theory-and-python-implementation-b706aea28ac1

# How to add GloVe vectors
# https://www.kaggle.com/general/35746

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def clean_text(texto, lemmatize = True):
    """
    Normalises a raw text string
        texto: str, raw text
        lemmatize: bool, when True the cleaned text is lemmatized with the
                   spaCy "en_core_web_lg" model and "-PRON-" lemmas are dropped

    The text is lower-cased, whitespace runs are collapsed to one space and
    every character that is not an ASCII letter or digit becomes a space.
    Returns str
    """
    texto = " ".join(w.lower() for w in texto.split())
    texto = re.sub(r"[^A-Za-z0-9]", " ", texto)
    if lemmatize:
        # Loading the large spaCy model is slow; load it on the first call
        # only and keep it on the function object for every later call.
        nlp = getattr(clean_text, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_lg", disable=['parser', 'ner'])
            clean_text._nlp = nlp
        doc = nlp(texto)
        texto = " ".join([token.lemma_ for token in doc if token.lemma_ != "-PRON-"])

    return texto

In [None]:
def top_bottom_k(frame, feature_name = None, feature_value = None, agg_func = np.mean,  top_k = 25):
    """
    Labels every row of a categorical column as "top_k", "bot_k" or "other"
        frame: pandas dataframe
        feature_name: categorical column whose groups are ranked
        feature_value: numeric column aggregated per group
        agg_func: aggregation applied to feature_value inside each group
        top_k: number of groups kept at each end of the ranking

    Groups among the top_k largest aggregated values are labelled "top_k",
    groups among the top_k smallest are labelled "bot_k" (a group found in
    both lists stays "top_k") and everything else, including rows whose
    feature_name is missing, is labelled "other".
    Returns pd.Series aligned with frame
    """
    column = frame[feature_name]

    # Aggregate once and rank the same result from both ends
    grouped = frame.groupby(feature_name).agg({feature_value: agg_func})
    top_groups = grouped.nlargest(top_k, columns = feature_value).index.tolist()
    bot_groups = grouped.nsmallest(top_k, columns = feature_value).index.tolist()

    # One mapping applied in a single pass, so a category that happens to be
    # called "top_k" or "bot_k" cannot be relabelled a second time.
    labels = dict.fromkeys(column.dropna().unique(), "other")
    labels.update(dict.fromkeys(bot_groups, "bot_k"))
    labels.update(dict.fromkeys(top_groups, "top_k"))  # top wins on overlap

    series = column.map(labels)
    if column.isna().any():
        series = series.where(column.notna(), "other")

    return series
    

In [None]:
def _get_model_name(model):
    """
        Returns a string with the name of a sklearn model
            model: Sklearn estimator instance or Pipeline

        For a Pipeline the name is "Pipeline_" followed by the class name of
        its last step; otherwise it is the class name of the model itself.
    """
    # The class name is used instead of slicing the repr up to "(": slicing
    # silently drops the last character whenever the repr contains no "(".
    if isinstance(model, Pipeline):
        estimator = model.steps[-1][1]
        return "Pipeline_" + type(estimator).__name__
    return type(model).__name__

def plot_cv_score(X, y, models_list, cv = 5, scoring = None, refit = True, verbose = True):
    """
        Cross-validates each model and plots the resulting scores
            X: numpy_array/pandas dataframe n_rows, m_features
            y: numpy_array/pandas dataframe n_rows
            models_list: list of sklearn estimators / pipelines
            cv, scoring: forwarded to cross_val_score
            refit: when True each model is fitted on (X, y) before scoring
            verbose: when True prints timing and mean score per model

        Left panel: min / average / max fold score per model.
        Right panel: box plot of the fold scores per model.
    """
    names = []
    fold_scores = []
    lowest, average, highest = [], [], []

    for idx, model in enumerate(models_list):
        started = time.time()
        names.append(_get_model_name(model))

        if refit:
            model.fit(X, y)

        folds = cross_val_score(model, X, y, cv = cv, scoring = scoring, n_jobs = -1)

        lowest.append(np.min(folds))
        highest.append(np.max(folds))
        average.append(np.mean(folds))
        fold_scores.append(folds)
        finished = time.time()

        if verbose:
            print(f"Iteration: {idx} done in {round((finished-started)/60,2)} minutes")
            print(f"Mean score for model: {names[idx]}: {average[idx]}")

    summary = pd.DataFrame({'Min': lowest, 'Average': average, 'Max': highest}, index = names)
    summary = summary.sort_values(by = 'Average')

    # one column per model, one row per fold
    per_fold = pd.DataFrame(np.vstack(fold_scores).T, columns = names)

    fig, ax = plt.subplots(1, 2, figsize = (15,7))
    summary_ax, box_ax = ax[0], ax[1]

    summary.plot.barh(edgecolor = 'black', ax = summary_ax, cmap = 'RdYlBu')
    summary_ax.legend(loc = 'best')
    summary_ax.set_xlabel("Score")

    per_fold.boxplot(ax = box_ax)
    box_ax.set_title("Model scores distribution")
    box_ax.set_ylabel("Score")
    box_ax.tick_params(labelrotation=90)


In [None]:
def plot_importances(estimator, X, y, scoring = None, n_repeats = 5, n_jobs = -1):
    """
    Computes and plots permutation feature importance for a fitted model
        estimator: fitted sklearn estimator / pipeline
        X: pandas dataframe (its columns label the bars)
        y: target values
        scoring: scorer forwarded to permutation_importance
                 (None uses the estimator's default scorer)
        n_repeats: number of permutations per feature
        n_jobs: number of parallel jobs
    """
    # scoring is forwarded now; it used to be accepted and silently ignored
    pimp = permutation_importance(estimator= estimator, X= X, y = y, scoring = scoring, n_repeats= n_repeats, n_jobs = n_jobs)

    df = pd.DataFrame({"Mean performance decrease":pimp.importances_mean}, index = X.columns).sort_values(by = "Mean performance decrease")

    fig, ax = plt.subplots(figsize = (10,5))

    df.plot.barh(ax = ax, edgecolor = "black", cmap = "RdYlBu")
    ax.set_title("Importances")


### Task 1 Bag of Words and simple Features [50pts]
#### 1.1 Create a baseline model for predicting wine quality using only non-text features.
#### 1.2 Create a simple text-based model using a bag-of-words approach and a linear model.
#### 1.2 Try using n-grams, characters, tf-idf rescaling and possibly other ways to tune the BoW model. Be aware that you might need to adjust the (regularization of the) linear model for different feature sets.
#### 1.3 Combine the non-text features and the text features. How does adding those features improve upon just using bag-of-words?

In [None]:
# Load the reviews and discard the exported row counter column
data = (
    pd.read_csv("/kaggle/input/wine-reviews/winemag-data-130k-v2.csv")
    .drop(columns = "Unnamed: 0")
)

In [None]:
data.info()

In [None]:
data.sample(7)

In [None]:
data["train_test"] = np.random.choice(a = ["train", "test"], p = [.7,.3], size = data.shape[0])

In [None]:
Xy_train = data[data.train_test == "train"]
Xy_test = data[data.train_test == "test"]


# EDA

In [None]:
Xy_train.describe().T

In [None]:
missingno.matrix(Xy_train, figsize = (12,7));

In [None]:
Xy_train.hist(figsize = (12,7), edgecolor = "black", color = "darkred", bins = "auto");

In [None]:
# Left: mean price per country; right: price distribution per country
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (20,7))
mean_ax, box_ax = ax

mean_price_by_country = Xy_train.groupby("country")["price"].mean().sort_values()
mean_price_by_country.plot.barh(ax = mean_ax, color = "orange", edgecolor = "black")
mean_ax.set_title("Average price by country")

Xy_train.boxplot(column = "price", by = "country", rot = 90, ax = box_ax);

In [None]:
# .assign returns a new frame, so this cell can be re-run safely and never
# writes into a slice of `data`.
Xy_train = Xy_train.assign(
    country_winery = Xy_train["country"] + " | " + Xy_train["winery"],
    country_designation = Xy_train["country"] + " | " + Xy_train["designation"],
)

In [None]:
fig, ax = plt.subplots(1,2, figsize = (20,7))
Xy_train.groupby("country_winery").price.mean().nlargest(25).sort_values().plot.barh(color = "darkred", edgecolor = "black", ax = ax[0])
ax[0].set_title("Average price by winery")

Xy_train.groupby("country_winery").price.mean().nsmallest(25).sort_values().plot.barh(color = "darkblue", edgecolor = "black", ax = ax[1])
ax[1].set_title("Average price by winery");
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1,2, figsize = (20,7))
Xy_train.groupby("country_winery").points.mean().nlargest(25).sort_values().plot.barh(color = "darkred", edgecolor = "black", ax = ax[0])
ax[0].set_title("Average points by winery")

Xy_train.groupby("country_winery").points.mean().nsmallest(25).sort_values().plot.barh(color = "darkblue", edgecolor = "black", ax = ax[1])
ax[1].set_title("Average points by winery");
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1,2, figsize = (20,7))
Xy_train.groupby("designation").price.mean().nlargest(25).sort_values().plot.barh(color = "darkred", edgecolor = "black", ax = ax[0])
ax[0].set_title("Average price by winery")

Xy_train.groupby("designation").price.mean().nsmallest(25).sort_values().plot.barh(color = "darkblue", edgecolor = "black", ax = ax[1])
ax[1].set_title("Average price by winery");
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1,2, figsize = (20,7))
Xy_train.groupby("country_designation").price.mean().nlargest(25).sort_values().plot.barh(color = "darkred", edgecolor = "black", ax = ax[0])
ax[0].set_title("Average price by winery")

Xy_train.groupby("country_designation").price.mean().nsmallest(25).sort_values().plot.barh(color = "darkblue", edgecolor = "black", ax = ax[1])
ax[1].set_title("Average price by winery");
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1,2, figsize = (20,7))
Xy_train.groupby("designation").points.mean().nlargest(25).sort_values().plot.barh(color = "darkred", edgecolor = "black", ax = ax[0])
ax[0].set_title("Average points by designation")

Xy_train.groupby("designation").points.mean().nsmallest(25).sort_values().plot.barh(color = "darkblue", edgecolor = "black", ax = ax[1])
ax[1].set_title("Average points by designation");
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1,2, figsize = (20,7))
Xy_train.groupby("variety").points.mean().nlargest(25).sort_values().plot.barh(color = "darkred", edgecolor = "black", ax = ax[0])
ax[0].set_title("Average points by variety")

Xy_train.groupby("variety").points.mean().nsmallest(25).sort_values().plot.barh(color = "darkblue", edgecolor = "black", ax = ax[1])
ax[1].set_title("Average points by variety");
plt.tight_layout()

In [None]:
Xy_train.sample(5)

In [None]:
# Price on x and points on y; the y values used to be price as well, which
# drew a diagonal line under a "Points" label.
plt.scatter(Xy_train.price, Xy_train.points, alpha = .5, facecolors = "none", edgecolor = "darkred")
plt.xlabel("Price")
plt.ylabel("Points");

#### 1.1 Create a baseline model for predicting wine quality using only non-text features.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, StandardScaler
from sklearn.linear_model import RANSACRegressor
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor, XGBRFRegressor
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import SGDRegressor

In [None]:
# Columns kept for modelling: the target (points), the text and the tabular features
selected_columns = [
    "price", "country", "points", "winery", "region_1",
    "designation", "variety", "province", "taster_name", "description",
]
Xy_train = Xy_train[selected_columns]
Xy_test = Xy_test[selected_columns]

# Baseline uses the non-text features only
non_feature_columns = ["points", "description"]

X_train = Xy_train.drop(columns = non_feature_columns)
y_train = Xy_train["points"]

X_test = Xy_test.drop(columns = non_feature_columns)
y_test = Xy_test["points"]

In [None]:
X_train.info()

In [None]:
cont_prepro = Pipeline([("impute",SimpleImputer(strategy = "median")),("scale",StandardScaler()), ("discretizer", KBinsDiscretizer(strategy = "kmeans", n_bins=7))])
cat_prepro = Pipeline([("impute",SimpleImputer(fill_value= "missing", strategy = "constant")),("encoding",OneHotEncoder(handle_unknown = "ignore"))])

preprocessor = make_column_transformer((cat_prepro, make_column_selector(dtype_include = "object")), (cont_prepro, make_column_selector(dtype_exclude = "object")))

pipe_XGB = Pipeline([("preprocessing", preprocessor), ("model", XGBRegressor())]).fit(X_train, y_train)
pipe_RF = Pipeline([("preprocessing", preprocessor), ("model", XGBRFRegressor())]).fit(X_train, y_train)
pipe_Linear = Pipeline([("preprocessing", preprocessor), ("model", SGDRegressor())]).fit(X_train, y_train)

models = [pipe_XGB, pipe_RF, pipe_Linear]

In [None]:
plot_cv_score(X_train, y_train, models)

In [None]:
visualizer = ResidualsPlot(pipe_XGB).fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

In [None]:
plot_importances(pipe_XGB, X_train, y_train, scoring = None, n_repeats = 5, n_jobs = -1)

#### 1.2 Create a simple text-based model using a bag-of-words approach and a linear model.

In [None]:
# Text-only features: the description column is the single predictor
X_train = Xy_train[["description"]]
y_train = Xy_train.points

# The held-out set must come from Xy_test; it used to be rebuilt from
# Xy_train, so every "test" score below was really a training score.
X_test = Xy_test[["description"]]
y_test = Xy_test.points


In [None]:
text_vect = Pipeline([("bow",CountVectorizer(min_df = 2, stop_words = ENGLISH_STOP_WORDS))])

preprocessor = make_column_transformer((text_vect, "description"))

pipe_XGB = Pipeline([("preprocessing", preprocessor), ("model", XGBRegressor())]).fit(X_train, y_train)
pipe_RF = Pipeline([("preprocessing", preprocessor), ("model", XGBRFRegressor())]).fit(X_train, y_train)
pipe_Linear = Pipeline([("preprocessing", preprocessor),("scaler", MaxAbsScaler()), ("model", SGDRegressor())]).fit(X_train, y_train)

models = [pipe_Linear, pipe_XGB, pipe_RF ]

In [None]:
plot_cv_score(X_train, y_train, models)

In [None]:
visualizer = ResidualsPlot(pipe_Linear).fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

In [None]:
fig, ax = plt.subplots(1,2, figsize = (12,7))

# First step of the text pipeline inside the fitted column transformer
vectorizer = pipe_Linear["preprocessing"].transformers_[0][1][0]
# get_feature_names() was removed in newer scikit-learn releases; use the
# replacement when the installed version provides it.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, "get_feature_names_out")
                 else vectorizer.get_feature_names())

# Built once and reused for both panels
coefficients = pd.DataFrame({"Importances":pipe_Linear["model"].coef_}, index = feature_names)

coefficients.nlargest(25, columns = "Importances").plot.barh(edgecolor = "black", color = "darkgreen", ax = ax[0])
ax[0].set_title("top positive feature importances gain Linear transformer")

coefficients.nsmallest(25, columns = "Importances").plot.barh(edgecolor = "black", color = "darkred", ax = ax[1])
ax[1].set_title("top negative feature importances Linear transformer");

#### 1.2 Try using n-grams, characters, tf-idf rescaling and possibly other ways to tune the BoW model. Be aware that you might need to adjust the (regularization of the) linear model for different feature sets.

In [None]:
text_vect = Pipeline([("bow",TfidfVectorizer(min_df = 2,max_features = 500, stop_words = ENGLISH_STOP_WORDS, ngram_range = (1,2)))])

preprocessor = make_column_transformer((text_vect, "description"))

pipe_XGB = Pipeline([("preprocessing", preprocessor), ("model", XGBRegressor())]).fit(X_train, y_train)
pipe_Linear = Pipeline([("preprocessing", preprocessor),("scaler", MaxAbsScaler()), ("model", SGDRegressor(max_iter = 10000, penalty = "l1"))]).fit(X_train, y_train)

models = [pipe_Linear, pipe_XGB]

In [None]:
plot_cv_score(X_train, y_train, models)

In [None]:
visualizer = ResidualsPlot(pipe_XGB).fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

In [None]:
# Using all features: tabular columns plus the description text
all_columns = [
    "price", "country", "points", "winery", "region_1",
    "designation", "variety", "province", "taster_name", "description",
]
Xy_train = Xy_train[all_columns]
Xy_test = Xy_test[all_columns]

X_train = Xy_train.drop(columns = ["points"])
y_train = Xy_train["points"]

X_test = Xy_test.drop(columns = ["points"])
y_test = Xy_test["points"]

In [None]:
cont_feat = ["price"]
cat_feat = ["country", "winery", "region_1", "designation", "variety", "province", "taster_name"]
text_feat = "description"

cont_preporcesor = Pipeline([("imputer",SimpleImputer(strategy = "median")),("discretizer", KBinsDiscretizer(strategy = "kmeans", n_bins=7))])
cat_preprocesor = Pipeline([("imputer",SimpleImputer(strategy = "constant", fill_value = "missing")), ("onehot", OneHotEncoder(handle_unknown = "ignore"))])
text_preprocesor = Pipeline([("vectorizer", CountVectorizer(min_df = 2, stop_words = ENGLISH_STOP_WORDS))])

preprocesor = make_column_transformer((cont_preporcesor, cont_feat), (cat_preprocesor, cat_feat), (text_preprocesor, text_feat))

pipe_XGB = Pipeline([("preprocessing", preprocesor), ("model", XGBRegressor())]).fit(X_train, y_train)
pipe_Linear = Pipeline([("preprocessing", preprocesor), ("model", SGDRegressor(max_iter = 10000, penalty = "l1"))]).fit(X_train, y_train)

models = [pipe_XGB, pipe_Linear]

In [None]:
plot_cv_score(X_train, y_train, models)

In [None]:
visualizer = ResidualsPlot(pipe_Linear).fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

In [None]:
# Hyperparameter optimization

cont_feat = ["price"]
cat_feat = ["country", "winery", "region_1", "designation", "variety", "province", "taster_name"]
text_feat = "description"

cont_preporcesor = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")),
    ("discretizer", KBinsDiscretizer(strategy = "kmeans", n_bins=7)),
])
cat_preprocesor = Pipeline([
    ("imputer", SimpleImputer(strategy = "constant", fill_value = "missing")),
    ("onehot", OneHotEncoder(handle_unknown = "ignore")),
])
text_preprocesor = Pipeline([
    ("vectorizer", CountVectorizer(min_df = 2, stop_words = ENGLISH_STOP_WORDS)),
])

# Branch order matters: the search keys below address the branches by the
# auto-generated names pipeline-1 (numeric) and pipeline-3 (text).
preprocesor = make_column_transformer(
    (cont_preporcesor, cont_feat),
    (cat_preprocesor, cat_feat),
    (text_preprocesor, text_feat),
)

# Left unfitted on purpose: the search below fits clones of it
pipe_Linear = Pipeline([
    ("preprocessing", preprocesor),
    ("model", SGDRegressor(max_iter = 10000, penalty = "l1")),
])


from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "preprocessing__pipeline-3__vectorizer__min_df": [1,2,5,7,10],
    "preprocessing__pipeline-1__discretizer__strategy": ["kmeans", "quantile"],
    "model__penalty": ["l1", "l2"],
}

pipe = RandomizedSearchCV(
    estimator= pipe_Linear,
    param_distributions= param_grid,
    cv= 5,
    scoring= None,
    random_state= 1990,
    n_jobs= -1,
    verbose = True,
).fit(X_train, y_train)

In [None]:
pipe.best_score_

In [None]:
plot_cv_score(X_train, y_train, [pipe.best_estimator_],refit = False)

In [None]:
visualizer = ResidualsPlot(pipe.best_estimator_).fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

In [None]:
from yellowbrick.model_selection import LearningCurve

In [None]:
visualizer =LearningCurve(pipe.best_estimator_).fit(X_train, y_train)
visualizer.show();

### Task 2 Word Vectors [50pts]
Use a pretrained word-embedding (word2vec, glove or fasttext) for featurization instead of the
bag-of-words model. Does this improve classification? How about combining the embedded
words with the BoW model?

In [None]:
def embeddings_dataframe(X_train, X_test,text_column = None, embeddings_prefix = "emb_",
                         embeddings_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'):
    """
    Appends one normalised word-vector sentence embedding per row to both frames
        X_train, X_test: pandas dataframes containing text_column
        text_column: name of the column holding the raw text
        embeddings_prefix: prefix of the added columns (emb_0, emb_1, ...)
        embeddings_path: text file with one "word v1 v2 ..." entry per line

    A sentence vector is the sum of the vectors of its alphabetic,
    non-stop-word tokens divided by its L2 norm; sentences without any known
    token get an all-zero vector.
    Returns (X_train, X_test) with the index reset and the embedding columns added
    """
    # Imported here because the notebook only imports tqdm in a later cell.
    from tqdm.auto import tqdm
    # NOTE(review): word_tokenize is not imported anywhere in the visible
    # notebook - presumably nltk.tokenize.word_tokenize; confirm and import it
    # before calling this function.

    train_text = X_train[text_column]
    test_text = X_test[text_column]

    print("Checkpoint1 - Data Read Complete")

    embeddings_index = {}
    # with-block closes the file even if parsing raises
    with open(embeddings_path, encoding="utf8") as f:
        for line in tqdm(f):
            values = line.split()
            if not values:
                continue
            try:
                embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # entry whose coefficients are not numeric: skip it
                continue
    print('Found %s word vectors.' % len(embeddings_index))

    # Vector size is read from the loaded file (assumes every entry has the
    # same length); 100 is kept as the fallback for an empty file.
    emb_dim = len(next(iter(embeddings_index.values()))) if embeddings_index else 100

    # this function creates a normalized vector for the whole sentence
    def sent2vec(s):
        words = word_tokenize(str(s).lower())
        vectors = [embeddings_index[w] for w in words
                   if w not in ENGLISH_STOP_WORDS and w.isalpha() and w in embeddings_index]
        if not vectors:
            return np.zeros(emb_dim)
        v = np.sum(vectors, axis=0)
        norm = np.sqrt((v ** 2).sum())
        # guard against 0/0 when the word vectors cancel each other out
        return v / norm if norm > 0 else np.zeros(emb_dim)

    def _to_frame(texts):
        # reshape keeps the (n_rows, emb_dim) shape even when texts is empty
        vectors = np.array([sent2vec(x) for x in tqdm(texts)]).reshape(-1, emb_dim)
        return pd.DataFrame(vectors, columns = [embeddings_prefix + str(i) for i in range(emb_dim)])

    # create sentence vectors using the above function for training and validation set
    xtrain_glove = _to_frame(train_text)
    xtest_glove = _to_frame(test_text)

    print('Checkpoint2 -Normalized Vector for Sentences are created')

    # The index is reset so the positional embedding rows line up on concat
    X_train = pd.concat([X_train.reset_index(drop = True), xtrain_glove], axis = 1)
    X_test = pd.concat([X_test.reset_index(drop = True), xtest_glove], axis = 1)

    return (X_train, X_test)
    
    

In [None]:
X_train_emb, X_test_emb = embeddings_dataframe(X_train, X_test, "description")

In [None]:
from sklearn.compose import make_column_selector

In [None]:
cont_feat = ["price"]
# emb_feat = [x for x in X_train_emb.columns if "emb" in x]
cat_feat = ["country", "winery", "region_1", "designation", "variety", "province", "taster_name"]
text_feat = "description"

cont_preporcesor = Pipeline([("imputer",SimpleImputer(strategy = "median")),("discretizer", KBinsDiscretizer(strategy = "kmeans", n_bins=7))])
# NOTE(review): emb_preprocesor is defined but never added to the column
# transformer; the embedding columns reach the model through remainder = "passthrough".
emb_preprocesor = Pipeline([("scaler", MaxAbsScaler())])
cat_preprocesor = Pipeline([("imputer",SimpleImputer(strategy = "constant", fill_value = "missing")), ("onehot", OneHotEncoder(handle_unknown = "ignore"))])
text_preprocesor = Pipeline([("vectorizer", CountVectorizer(min_df = 2, stop_words = ENGLISH_STOP_WORDS))])

preprocesor = make_column_transformer((cont_preporcesor, cont_feat), (cat_preprocesor, cat_feat), (text_preprocesor, text_feat), remainder = "passthrough")

# Fitted once, on the frame that carries the embedding columns; the extra
# fit on X_train was discarded straight away by the second fit.
pipe_XGB = Pipeline([("preprocessing", preprocesor), ("model", XGBRegressor())]).fit(X_train_emb, y_train)
pipe_Linear = Pipeline([("preprocessing", preprocesor), ("model", SGDRegressor(max_iter = 10000, penalty = "l1"))]).fit(X_train_emb, y_train)

models = [pipe_XGB, pipe_Linear]

In [None]:
plot_cv_score(X_train_emb, y_train,models, refit = False)

### Task 3 Transformers (bonus / optional) [50pts]
Fine-tune a BERT model on the text data alone using the transformers library.
How does this model compare to a BoW model, and how does it compare to a model using all
features?

In [None]:
# https://medium.com/swlh/transformer-fine-tuning-for-sentiment-analysis-c000da034bb5
# https://towardsml.com/2019/09/17/bert-explained-a-complete-guide-with-theory-and-tutorial/

In [None]:
# ! pip install torch==1.1.0 pytorch-transformers pytorch-ignite

In [None]:
# Distinct target values, sorted so the label -> integer mapping is stable.
# NOTE(review): the original line read `X_train..tolist()`, a syntax error;
# the labels are presumably the points in y_train - confirm.
labels = sorted(set(y_train.tolist()))

# labels to integers mapping
label2int = {label: i for i, label in enumerate(labels)}

In [None]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
import numpy as np
import warnings
from tqdm import tqdm_notebook as tqdm
from typing import Tuple

NUM_MAX_POSITIONS = 256
BATCH_SIZE = 32

class TextProcessor:
    """
    Converts (label, text) examples into fixed-length id sequences.

    Every sequence has exactly num_max_positions ids: the token ids, then
    the [CLS] id, then [PAD] ids up to the fixed length.
    """
    # special tokens for classification and padding
    CLS = '[CLS]'
    PAD = '[PAD]'

    def __init__(self, tokenizer, label2id: dict, num_max_positions:int=512):
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.num_labels = len(label2id)
        self.num_max_positions = num_max_positions

    def process_example(self, example: Tuple[str, str]):
        "Convert label (example[0]) to integer and text (example[1]) to sequence of IDs"
        assert len(example) == 2
        label = example[0]
        text = example[1]
        assert isinstance(text, str)

        vocab = self.tokenizer.vocab
        tokens = self.tokenizer.tokenize(text)
        # one position is always reserved for the [CLS] id
        room_for_tokens = self.num_max_positions - 1

        if len(tokens) < self.num_max_positions:
            # short text: fill the remaining positions with [PAD]
            padding = [vocab[self.PAD]] * (room_for_tokens - len(tokens))
        else:
            # long text: keep the leading tokens only, no padding needed
            tokens = tokens[:room_for_tokens]
            padding = []

        ids = self.tokenizer.convert_tokens_to_ids(tokens) + [vocab[self.CLS]] + padding

        return ids, self.label2id[label]

# download the 'bert-base-cased' tokenizer
# (do_lower_case=False is passed explicitly to match the cased checkpoint name)
from pytorch_transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

# initialize a TextProcessor that maps labels through label2int and emits
# sequences of NUM_MAX_POSITIONS ids
processor = TextProcessor(tokenizer, label2int, num_max_positions=NUM_MAX_POSITIONS)