In [1]:
import numpy as np
import pandas as pd
from skmultilearn.model_selection import IterativeStratification
from auxiliar_functions import load_dataset, load_embedding
from IPython.display import clear_output
import pickle
import torch 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertConfig, BertTokenizer, BertForPreTraining
from torch.nn import BCEWithLogitsLoss
from torch.utils.tensorboard import SummaryWriter
from AsymmetricLoss import AsymmetricLossOptimized
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, accuracy_score
import gc
import nltk


# Fetch the NLTK stop-word corpus that nltk.corpus.stopwords reads below.
nltk.download('stopwords')
# Spanish stop words plus the literal token "UNK"; this list is later passed
# as `stop_words` to the CountVectorizer of the TF-IDF pipeline.
spanish_stopwords = nltk.corpus.stopwords.words('spanish') + ["UNK"]


# Wipe the cell output (e.g. whatever the download step printed).
clear_output()

In [2]:
# Names of the four labels. Not referenced elsewhere in the visible notebook;
# presumably the column order of the 4-way multi-label targets — TODO confirm.
# The same constant is defined again, identically, in a later cell.
ORDER = ['conflicto', 'economico', 'humanidad', 'moral']

        
class InputFeatures:
    """Plain container for the three parallel sequences of one encoded text."""

    def __init__(self, input_ids, input_mask, segment_ids):
        # The arguments are stored as given: no copy and no validation.
        self.input_ids, self.input_mask, self.segment_ids = (
            input_ids,
            input_mask,
            segment_ids,
        )


def convert_example_to_feature(example_row):
    """Encode one text as fixed-length BERT inputs.

    Args:
        example_row: 5-tuple ``(text, max_seq_length, tokenizer, cls_token,
            sep_token)``. ``tokenizer`` must provide ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        InputFeatures whose ``input_ids``, ``input_mask`` and ``segment_ids``
        are lists of exactly ``max_seq_length`` integers.
    """
    text, max_seq_length, tokenizer, cls_token, sep_token = example_row

    # Two positions are reserved for the [CLS] and [SEP] markers.
    room_for_text = max_seq_length - 2
    word_pieces = tokenizer.tokenize(text)[:room_for_text]
    tokens = [cls_token] + word_pieces + [sep_token]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    real_count = len(input_ids)

    # Right-pad everything with zeros up to the requested length. The mask is
    # 1 on real tokens and 0 on padding; the segment ids are 0 everywhere
    # because there is a single sentence.
    filler = [0] * (max_seq_length - real_count)
    input_mask = [1] * real_count + filler
    segment_ids = [0] * len(tokens) + filler
    input_ids = input_ids + filler

    for sequence in (input_ids, input_mask, segment_ids):
        assert len(sequence) == max_seq_length

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
    )


def convert_examples_to_features(texts, max_seq_length, tokenizer, cls_token, sep_token):
    """Encode every text in ``texts`` with ``convert_example_to_feature``.

    Returns a list with one feature object per input text, in input order.
    """
    features = []
    for text in texts:
        packed_example = (text, max_seq_length, tokenizer, cls_token, sep_token)
        features.append(convert_example_to_feature(packed_example))
    return features


class BetoEmbedding(torch.nn.Module):
    """BERT encoder with a two-layer multi-label classification head.

    The encoder is loaded from the local files ``vocab.txt``,
    ``bert_config.json`` and ``pytorch_model.bin``. Only the parameters whose
    name starts with ``bert.encoder.layer.11`` are trainable; every other BERT
    parameter is frozen. The head is ``Linear(768, hidden_size) -> ReLU ->
    Dropout(0.5) -> Linear(hidden_size, 4)`` and emits raw logits.

    Args:
        hidden_size: Width of the hidden layer of the classification head.
        batch_size: Mini-batch size used by ``fit``.
        take_mean: If True the sentence embedding is the mean of the last
            hidden layer over all positions, otherwise the first ([CLS])
            position is used.
        loss_function: ``"cross-entropy"`` selects ``BCEWithLogitsLoss``; any
            other value selects ``AsymmetricLossOptimized``.
        n_epochs: Number of training epochs run by ``fit``.
        device: Optional device (string or ``torch.device``) used by ``fit``
            and the inference methods. When omitted, the historical default
            ``cuda:1`` is used if at least two GPUs are visible, otherwise
            ``cuda:0`` if there is one GPU, otherwise the CPU.
    """

    MAX_SEQ_LENGTH = 512  # every text is truncated / padded to this many tokens
    N_LABELS = 4          # width of the output layer

    def __init__(self, hidden_size, batch_size=8, take_mean=True,
                 loss_function="cross-entropy", n_epochs=30, device=None):
        super().__init__()
        # Release memory possibly held by a previously built model.
        gc.collect()
        torch.cuda.empty_cache()

        self.tokenizer = BertTokenizer.from_pretrained('vocab.txt', keep_accents=True)
        config = BertConfig.from_json_file('bert_config.json')
        config.output_hidden_states = True
        self.bert = BertForPreTraining.from_pretrained('pytorch_model.bin', config=config)
        # Fine-tune only the parameters of encoder layer 11.
        for name, param in self.bert.named_parameters():
            param.requires_grad = name.startswith("bert.encoder.layer.11")

        self.take_mean = take_mean
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        if loss_function == "cross-entropy":
            self.loss_object = torch.nn.BCEWithLogitsLoss()
        else:
            self.loss_object = AsymmetricLossOptimized()

        self.fc1 = torch.nn.Linear(768, hidden_size)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.5)
        # Not used by forward(); kept so the attribute set stays unchanged.
        self.embedding_dropout = torch.nn.Dropout(0.5)
        self.fc2 = torch.nn.Linear(hidden_size, self.N_LABELS)
        self.sigmoid = torch.nn.Sigmoid()

        self._device = self._resolve_device(device)

    @staticmethod
    def _resolve_device(device=None):
        """Return the requested device, or the best available default."""
        if device is not None:
            return torch.device(device)
        gpu_count = torch.cuda.device_count()
        if gpu_count > 1:
            return torch.device("cuda", 1)  # historical hard-coded choice
        if gpu_count == 1:
            return torch.device("cuda", 0)
        return torch.device("cpu")

    def _encode(self, texts):
        """Tokenise ``texts`` into (input_ids, input_mask, segment_ids) LongTensors."""
        tokenizer = self.tokenizer
        features = convert_examples_to_features(
            texts, self.MAX_SEQ_LENGTH, tokenizer,
            tokenizer.cls_token, tokenizer.sep_token)
        return (
            torch.tensor([f.input_ids for f in features], dtype=torch.long),
            torch.tensor([f.input_mask for f in features], dtype=torch.long),
            torch.tensor([f.segment_ids for f in features], dtype=torch.long),
        )

    def get_embedding(self, input_ids, attention_mask, token_type_ids):
        """Return one sentence vector per row of the batch.

        NOTE(review): with ``take_mean`` the average runs over every position,
        padding included (``attention_mask`` is only given to BERT). This is
        left untouched so embeddings stay consistent with saved checkpoints.
        """
        outputs = self.bert(input_ids, attention_mask, token_type_ids)
        # outputs[-1] holds the hidden states (initial embeddings, layer 1,
        # ..., last layer); take the last layer.
        last_layer = outputs[-1][-1]
        if self.take_mean:
            return torch.mean(last_layer, 1)
        return last_layer[:, 0, :]  # vector at the first ([CLS]) position

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw logits (one column per label) for a batch."""
        output = self.get_embedding(input_ids, attention_mask, token_type_ids)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.dropout(output)
        return self.fc2(output)

    def get_proba(self, input_ids, attention_mask, token_type_ids):
        """Return the sigmoid of the logits, i.e. per-label probabilities."""
        return self.sigmoid(self(input_ids, attention_mask, token_type_ids))

    def fit(self, X, Y, X_val=None, Y_val=None, writer=None, fold_index=None):
        """Train on ``(X, Y)`` and optionally evaluate on ``(X_val, Y_val)`` each epoch.

        Args:
            X: Iterable of raw texts.
            Y: Label matrix accepted by ``torch.Tensor`` (one row per text).
            X_val, Y_val: Optional validation texts / labels.
            writer, fold_index: Forwarded untouched to ``save_data``.

        NOTE(review): ``save_data`` is neither defined nor imported in the
        visible notebook; it has to exist in the kernel before calling ``fit``.
        """
        device = self._device
        model = self.to(device)

        train_dataset = TensorDataset(*self._encode(X), torch.Tensor(Y))
        dataloaders = {
            "train": DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True),
        }
        if X_val is not None:
            val_dataset = TensorDataset(*self._encode(X_val), torch.Tensor(Y_val))
            dataloaders["val"] = DataLoader(val_dataset, batch_size=self.batch_size)

        optimizer = AdamW(model.parameters(), lr=0.001)

        for epoch in range(self.n_epochs):
            for step, loader in dataloaders.items():
                is_train = step == "train"
                model.train(is_train)

                total_muestras = 0.0
                total_correctas = 0.0
                total_loss = 0
                loss_class = [0] * self.N_LABELS
                pred_chunks = []
                target_chunks = []

                for batch in loader:
                    input_ids, attention_mask, token_type_ids, target = [
                        t.to(device) for t in batch]
                    n_rows, n_cols = target.shape[0], target.shape[1]

                    if is_train:
                        optimizer.zero_grad()

                    # Build the autograd graph only when it is needed.
                    with torch.set_grad_enabled(is_train):
                        logits = model(input_ids, attention_mask, token_type_ids)
                        loss = self.loss_object(logits, target)
                    total_loss += (loss.detach() * (n_rows * n_cols)).item()

                    # Per-label losses are only reported, never back-propagated.
                    with torch.no_grad():
                        for i in range(n_cols):
                            loss_c = self.loss_object(logits[:, i], target[:, i])
                            loss_class[i] += (loss_c * n_rows).item()

                    preds = logits >= 0.0  # logit >= 0  <=>  probability >= 0.5
                    pred_chunks.append(preds.detach().cpu().numpy())
                    target_chunks.append(target.detach().cpu().numpy())

                    total_muestras += n_rows * n_cols  # labels seen so far

                    if is_train:
                        loss.backward()
                        optimizer.step()
                        total_correctas += (preds == target).sum().item()
                        accuracy = total_correctas / total_muestras

                        print("\rEpoca {}: Loss: {:.4f} Accuracy: {:.2f}%".format(epoch, loss.item(), 100*accuracy),
                              end="")

                # One concatenation per epoch instead of a quadratic np.append.
                all_logits = np.concatenate(pred_chunks, axis=0) if pred_chunks else None
                all_target = np.concatenate(target_chunks, axis=0) if target_chunks else None
                save_data(writer, all_logits, all_target, total_loss, loss_class,
                          total_muestras, fold_index, epoch, step)

    def _batched_inference(self, X, fn):
        """Run ``fn`` over ``X`` one text at a time and stack the results.

        Returns a NumPy array with one row per text, or None when ``X`` is
        empty.
        """
        device = self._device
        model = self.to(device)
        model.eval()

        loader = DataLoader(TensorDataset(*self._encode(X)), batch_size=1)
        chunks = []
        with torch.no_grad():  # inference only: do not record gradients
            for batch in loader:
                input_ids, attention_mask, token_type_ids = [t.to(device) for t in batch]
                result = fn(input_ids, attention_mask, token_type_ids)
                chunks.append(result.detach().cpu().numpy())
        return np.concatenate(chunks, axis=0) if chunks else None

    def predict_proba(self, X):
        """Return the per-label probabilities for every text in ``X``."""
        return self._batched_inference(X, self.get_proba)

    def get_all_embeddings(self, X):
        """Return the sentence embedding of every text in ``X``."""
        return self._batched_inference(X, self.get_embedding)

In [3]:
# Names of the four labels (same value as the definition in the earlier cell).
ORDER = ['conflicto', 'economico', 'humanidad', 'moral']

# Each line of the vocabulary file has the form "<word>: <rest>"; only the
# word is kept. The context manager closes the file handle, which the
# previous bare open(...).readlines() never did.
with open("vocabulary_corpus_counter.txt", encoding="UTF-8") as vocab_file:
    vocab = [line.strip().split(": ")[0] for line in vocab_file]

# Index 0 is reserved for padding and index 1 for unknown words; corpus words
# follow in file order, so WORDS[i] is the word whose index is i.
vocab_to_idx = {"[PAD]":0, "[UNK]":1}
WORDS = ["[PAD]", "[UNK]"]
for word in vocab:
    vocab_to_idx[word] = len(WORDS)
    WORDS.append(word)


class MyTextDataset(Dataset):
    """Dataset of index-encoded texts paired with their label vectors.

    Every element of ``X`` is converted to a ``torch.LongTensor``. When ``Y``
    is omitted, each text gets a placeholder label vector of four zeros.
    """

    def __init__(self, X, Y=None):
        self.texts = [torch.LongTensor(sequence) for sequence in X]
        self.len = len(X)
        if Y is None:
            # Dummy targets so that inference can reuse the same collate code.
            self.frames = [np.array([0, 0, 0, 0]) for _ in range(self.len)]
        else:
            self.frames = Y

    def __len__(self):
        return self.len

    def __getitem__(self, item):
        return self.texts[item], self.frames[item]
    
    
def my_collate(data_list):
    """Collate ``(sequence, label)`` pairs into one padded batch.

    The pairs are ordered by decreasing sequence length, which is what
    ``pack_padded_sequence`` expects by default, and the sequences are
    right-padded with zeros.

    Args:
        data_list: List of ``(1-D LongTensor, label vector)`` pairs. The list
            itself is left untouched (a sorted copy is used).

    Returns:
        ``((X, lengths), Y)``: ``X`` is the padded ``(batch, max_len)`` tensor,
        ``lengths`` the list of true lengths in batch order and ``Y`` a
        float32 tensor with one label row per sequence.
    """
    # sorted() instead of list.sort(): do not reorder the caller's list.
    ordered = sorted(data_list, key=lambda pair: len(pair[0]), reverse=True)
    X_seq, Y = zip(*ordered)
    lengths = [len(seq) for seq in X_seq]
    X = torch.nn.utils.rnn.pad_sequence(X_seq, batch_first=True)
    # A single ndarray avoids the slow element-by-element tensor construction
    # from a tuple of arrays. Assumes the label vectors are equally long
    # arrays or lists, as produced by MyTextDataset.
    targets = torch.as_tensor(np.asarray(Y), dtype=torch.float32)
    return ((X, lengths), targets)


class BiLSTM(torch.nn.Module):
    """Two-layer bidirectional LSTM text encoder with a multi-label head.

    Token ids are embedded (index 0 is padding), passed through the LSTM and
    pooled into one vector per text; the head is ``Linear -> ReLU ->
    Dropout(0.5) -> Linear(hidden_size, 4)`` and emits raw logits.

    Args:
        embedding_size: Dimension of the learned word embeddings.
        hidden_dim_lstm: Hidden size of each LSTM direction.
        hidden_size: Width of the hidden layer of the classification head.
        n_epochs: Number of training epochs run by ``fit``.
        process_output: Pooling of the LSTM outputs: ``"max"``, ``"mean"``, or
            any other value to use the final hidden state of the last layer.
        batch_size: Mini-batch size used by ``fit``.
        loss_function: ``"cross-entropy"`` selects ``BCEWithLogitsLoss``; any
            other value selects ``AsymmetricLossOptimized``.
        device: Optional device (string or ``torch.device``) used by ``fit``
            and the inference methods. When omitted, the historical default
            ``cuda:1`` is used if at least two GPUs are visible, otherwise
            ``cuda:0`` if there is one GPU, otherwise the CPU.
    """

    N_LABELS = 4  # width of the output layer

    def __init__(self, embedding_size, hidden_dim_lstm=50, hidden_size=50,
                 n_epochs=30, process_output="max", batch_size=1,
                 loss_function="cross-entropy", device=None):
        super().__init__()

        self.n_epochs = n_epochs
        self.hidden_dim_lstm = hidden_dim_lstm
        self.process_output = process_output
        self.batch_size = batch_size
        if loss_function == "cross-entropy":
            self.loss_object = torch.nn.BCEWithLogitsLoss()
        else:
            self.loss_object = AsymmetricLossOptimized()

        self.embedding = torch.nn.Embedding(len(WORDS), embedding_size, padding_idx=0)
        self.lstm = torch.nn.LSTM(embedding_size, hidden_dim_lstm,
                                  num_layers=2, bidirectional=True,
                                  batch_first=True)

        self.fc1 = torch.nn.Linear(hidden_dim_lstm*2, hidden_size)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.5)
        self.fc2 = torch.nn.Linear(hidden_size, self.N_LABELS)
        self.sigmoid = torch.nn.Sigmoid()

        self._device = self._resolve_device(device)

    @staticmethod
    def _resolve_device(device=None):
        """Return the requested device, or the best available default."""
        if device is not None:
            return torch.device(device)
        gpu_count = torch.cuda.device_count()
        if gpu_count > 1:
            return torch.device("cuda", 1)  # historical hard-coded choice
        if gpu_count == 1:
            return torch.device("cuda", 0)
        return torch.device("cpu")

    def forward(self, x):
        """Return the raw logits for ``x = (padded_token_ids, lengths)``."""
        output = self.get_embedding(x)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.dropout(output)
        return self.fc2(output)

    def get_embedding(self, x):
        """Return one pooled vector of size ``2 * hidden_dim_lstm`` per text.

        Args:
            x: Pair ``(token_ids, lengths)``; ``token_ids`` is a padded
                ``(batch, seq_len)`` LongTensor and ``lengths`` holds the true
                lengths in decreasing order (as produced by ``my_collate``).
        """
        token_ids, lengths = x
        embedded = self.embedding(token_ids)

        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
        packed_out, (hidden, _) = self.lstm(packed)
        # lstm_out: (batch, seq_len, hidden_dim*2), zeros at padded positions.
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, padding_value=0.0)

        if self.process_output == "max":
            # NOTE(review): the zero padding takes part in the max for the
            # shorter texts of a batch; left as is to stay compatible with
            # already trained checkpoints.
            output, _ = torch.max(lstm_out, 1)
        elif self.process_output == "mean":
            # Padded positions are exactly zero, so the plain sum equals the
            # sum over real steps; divide by the true lengths rather than by
            # an element-wise "non-zero" count, which miscounts exact zeros
            # and can be zero.
            n_steps = torch.as_tensor(
                lengths, dtype=lstm_out.dtype, device=lstm_out.device).unsqueeze(1)
            output = lstm_out.sum(dim=1) / n_steps
        else:
            # Final hidden state: (layers, directions, batch, hidden_dim);
            # keep the last layer and concatenate its two directions.
            last_layer = hidden.view(
                self.lstm.num_layers, 2, len(lengths), self.hidden_dim_lstm)[-1]
            output = torch.cat([last_layer[0], last_layer[1]], 1)

        return output

    def get_proba(self, x):
        """Return the sigmoid of the logits, i.e. per-label probabilities."""
        return self.sigmoid(self(x))

    def fit(self, X, Y, X_val=None, Y_val=None, writer=None, fold_index=None):
        """Train on ``(X, Y)`` and optionally evaluate on ``(X_val, Y_val)`` each epoch.

        Args:
            X: Sequence of index-encoded texts.
            Y: Sequence of label vectors, aligned with ``X``.
            X_val, Y_val: Optional validation texts / labels.
            writer, fold_index: Forwarded untouched to ``save_data``.

        NOTE(review): ``save_data`` is neither defined nor imported in the
        visible notebook; it has to exist in the kernel before calling ``fit``.
        """
        device = self._device
        model = self.to(device)

        dataloaders = {
            "train": DataLoader(MyTextDataset(X, Y), batch_size=self.batch_size,
                                shuffle=True, collate_fn=my_collate),
        }
        if X_val is not None:
            dataloaders["val"] = DataLoader(MyTextDataset(X_val, Y_val),
                                            batch_size=self.batch_size,
                                            collate_fn=my_collate)

        loss_object = self.loss_object
        optimizer = AdamW(model.parameters(), lr=0.001)

        for epoch in range(self.n_epochs):
            for step, loader in dataloaders.items():
                is_train = step == "train"
                model.train(is_train)

                total_muestras = 0.0
                total_correctas = 0.0
                total_loss = 0
                loss_class = [0] * self.N_LABELS
                pred_chunks = []
                target_chunks = []

                for (tokens, length), target in loader:
                    tokens = tokens.to(device)
                    target = target.to(device)
                    n_rows, n_cols = target.shape[0], target.shape[1]

                    if is_train:
                        optimizer.zero_grad()

                    # Build the autograd graph only when it is needed.
                    with torch.set_grad_enabled(is_train):
                        logits = model((tokens, length))
                        loss = loss_object(logits, target)
                    # .item(): accumulate plain floats, not graph-attached
                    # tensors that keep every batch alive.
                    total_loss += (loss.detach() * (n_rows * n_cols)).item()

                    # Per-label losses are only reported, never back-propagated.
                    with torch.no_grad():
                        for i in range(n_cols):
                            loss_c = loss_object(logits[:, i], target[:, i])
                            loss_class[i] += (loss_c * n_rows).item()

                    preds = logits >= 0.0  # logit >= 0  <=>  probability >= 0.5
                    pred_chunks.append(preds.detach().cpu().numpy())
                    target_chunks.append(target.detach().cpu().numpy())

                    total_muestras += n_rows * n_cols

                    if is_train:
                        loss.backward()
                        optimizer.step()
                        total_correctas += (preds == target).sum().item()
                        accuracy = total_correctas / total_muestras

                        print("\rEpoca {}: Loss: {:.4f} Accuracy: {:.2f}%".format(epoch, loss.item(), 100*accuracy),
                              end="")

                # One concatenation per epoch instead of a quadratic np.append.
                all_logits = np.concatenate(pred_chunks, axis=0) if pred_chunks else None
                all_target = np.concatenate(target_chunks, axis=0) if target_chunks else None
                save_data(writer, all_logits, all_target, total_loss, loss_class,
                          total_muestras, fold_index, epoch, step)

    def _batched_inference(self, X, fn):
        """Run ``fn`` over ``X`` one text at a time and stack the results.

        Returns a NumPy array with one row per text, or None when ``X`` is
        empty.
        """
        device = self._device
        model = self.to(device)
        model.eval()

        loader = DataLoader(MyTextDataset(X), batch_size=1, collate_fn=my_collate)
        chunks = []
        with torch.no_grad():  # inference only: do not record gradients
            for (tokens, length), _ in loader:
                result = fn((tokens.to(device), length))
                chunks.append(result.detach().cpu().numpy())
        return np.concatenate(chunks, axis=0) if chunks else None

    def predict_proba(self, X):
        """Return the per-label probabilities for every encoded text in ``X``."""
        return self._batched_inference(X, self.get_proba)

    def get_all_embeddings(self, X):
        """Return the pooled LSTM embedding of every encoded text in ``X``."""
        return self._batched_inference(X, self.get_embedding)

In [4]:
# Load the preprocessed dataset; the result is used below as a DataFrame
# (attribute access to columns, .head(), .loc).
df = load_dataset("preprocess_dataset.npy")
# Turn the per-row label vectors stored in `df.frames` into one NumPy array.
Y_true = np.array([np.array(x) for x in df.frames])

# Preview the first rows.
df.head()

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral
0,Japón registró un nuevo déficit comercial réco...,japón registró un nuevo déficit comercial réco...,"[8759, 8914, 9989, 9898, 6584, 8773, 8428, 999...","[0, 1, 0, 0]",0,1,0,0
1,"UDI acusa ""mala memoria"" de la Nueva Mayoría f...",udi acusa mala memoria de la nueva mayoría fre...,"[9610, 8486, 8448, 7205, 10001, 9999, 9927, 97...","[1, 0, 0, 1]",1,0,0,1
2,La misteriosa oferta por Esteban Paredes que i...,la misteriosa oferta por esteban paredes que [...,"[9999, 1121, 8346, 9990, 8487, 8596, 9996, 1, ...","[1, 0, 0, 0]",1,0,0,0
3,La familia maratón que causó revuelo en Holand...,la familia maratón que causó revuelo en holand...,"[9999, 9668, 5417, 9996, 7388, 2016, 9997, 887...","[0, 0, 1, 0]",0,0,1,0
4,Crean sitio web que recopila mangas descontin...,crean sitio web que [UNK] [UNK] [UNK] para [UN...,"[2420, 9319, 9360, 9996, 1, 1, 1, 9985, 1, 998...","[0, 1, 0, 0]",0,1,0,0


In [None]:
# Build, for each of the 10 stratified folds, a train and a test table that
# hold every text representation (pre-computed embeddings, embeddings from
# the fine-tuned end-to-end models, TF-IDF) and save both as .npy files.
np.random.seed(4444)
k_fold = IterativeStratification(n_splits=10, order=1)
encoded = np.array([np.array(x) for x in df.encoded.values], dtype=object)

# Embeddings computed beforehand for the whole dataset, keyed by name.
PRECOMPUTED_MODES = ["fasttext", "elmo", "beto_embedding_mean", "beto_embedding_cls"]
others_embeddings = {}
for mode in PRECOMPUTED_MODES:
    others_embeddings[mode] = load_embedding(mode, "dataset_1")

# (checkpoint name, model class, constructor arguments) of each trained model.
models = [
    ["Bi-LSTM", BiLSTM, {'embedding_size': 200, 'hidden_size': 200, 'hidden_dim_lstm': 300, 'n_epochs': 6, 'process_output': 'max', "batch_size": 32, "loss_function": "cross-entropy"}],
    ["Bi-LSTM_AsymetricLoss", BiLSTM, {'embedding_size': 200, 'hidden_size': 200, 'hidden_dim_lstm': 300, 'n_epochs': 6, 'process_output': 'max', "batch_size": 32, "loss_function": "AsymetricLoss"}],
    ["Beto-finetunning_cross_entropy", BetoEmbedding, {'hidden_size': 150, "batch_size": 32, 'take_mean': True, 'n_epochs': 4, 'loss_function': "cross-entropy"}],
    ["Beto-finetunning_asymetric", BetoEmbedding, {'hidden_size': 150, "batch_size": 32, 'take_mean': True, 'n_epochs': 4, 'loss_function': "asymetric"}]
]

for fold_index, (train, test) in enumerate(k_fold.split(df.preprocess_text.values, Y_true)):
    # `train` / `test` are positional indices (they also index the NumPy
    # arrays below), hence iloc. The explicit copies make the column
    # assignments below act on independent frames instead of slices of `df`.
    X_train_fold, X_test_fold = df.iloc[train].copy(), df.iloc[test].copy()
    encoded_train_fold, encoded_test_fold = encoded[train], encoded[test]

    # Pre-computed embeddings
    for mode in PRECOMPUTED_MODES:
        X_train_fold[mode] = list(others_embeddings[mode][train])
        X_test_fold[mode] = list(others_embeddings[mode][test])

    # End-to-end embeddings
    for name, model_object, args in models:
        print(name, fold_index)
        model = model_object(**args)
        # Load the weights on the CPU first so the checkpoint does not need
        # the GPU it was saved from; the model moves itself to its device
        # inside get_all_embeddings.
        state_dict = torch.load(f'Models/Fold_{fold_index+1}_{name}.model', map_location="cpu")
        model.load_state_dict(state_dict)
        if name.startswith("Beto"):
            # The BERT-based models tokenise the raw text themselves.
            all_embeddings_train = model.get_all_embeddings(X_train_fold.preprocess_text.values)
            all_embeddings_test = model.get_all_embeddings(X_test_fold.preprocess_text.values)
        else:
            # The Bi-LSTM models consume the index-encoded texts.
            all_embeddings_train = model.get_all_embeddings(encoded_train_fold)
            all_embeddings_test = model.get_all_embeddings(encoded_test_fold)
        X_train_fold[name] = list(all_embeddings_train)
        X_test_fold[name] = list(all_embeddings_test)

    # TF - IDF, fitted on the training part of the fold only
    tf_idf_pipeline = Pipeline([
          ('vect', CountVectorizer(lowercase=False, stop_words = spanish_stopwords)),
          ('tfidf', TfidfTransformer(use_idf = True))])
    tf_idf_pipeline.fit(X_train_fold.preprocess_text.values)
    X_train_fold["tf-idf"] = list(tf_idf_pipeline.transform(X_train_fold.preprocess_text.values).toarray())
    X_test_fold["tf-idf"] = list(tf_idf_pipeline.transform(X_test_fold.preprocess_text.values).toarray())

    np.save(f"datasets/fold_{fold_index+1}_train.npy", X_train_fold.values)
    np.save(f"datasets/fold_{fold_index+1}_test.npy", X_test_fold.values)

In [6]:
# Preview two rows of the training table left over from the last fold of the
# loop above, now carrying one column per text representation.
X_train_fold.head(2)

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral,fasttext,elmo,beto_embedding_mean,beto_embedding_cls,Bi-LSTM,Bi-LSTM_AsymetricLoss,Beto-finetunning_cross_entropy,Beto-finetunning_asymetric,tf-idf
0,Japón registró un nuevo déficit comercial réco...,japón registró un nuevo déficit comercial réco...,"[8759, 8914, 9989, 9898, 6584, 8773, 8428, 999...","[0, 1, 0, 0]",0,1,0,0,"[-0.10552764, -0.27450845, -0.04605328, -0.244...","[0.0596153, -0.49882528, -0.41697934, 0.367517...","[-0.23091996, -0.10888031, -0.41678736, 0.6554...","[0.36867273, -0.11450332, -0.70039237, 1.38560...","[0.9971235, 0.55105656, 0.03968364, 0.75402415...","[0.33348396, 0.020951403, 0.13927792, -0.02246...","[0.4382636, 0.2400094, 0.67646587, -0.12168871...","[0.2481719, -0.07553123, 0.15339072, -0.693724...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,La misteriosa oferta por Esteban Paredes que i...,la misteriosa oferta por esteban paredes que [...,"[9999, 1121, 8346, 9990, 8487, 8596, 9996, 1, ...","[1, 0, 0, 0]",1,0,0,0,"[-0.11113898, -0.27680284, -0.06768549, -0.185...","[0.11967598, -0.7192024, -0.3501902, 0.4666095...","[-0.19575264, -0.063836716, -0.21124944, 0.567...","[0.78442883, -0.46931738, 0.7389578, 1.1557943...","[0.1252701, 0.48678556, -0.0014662797, 0.44017...","[0.08652939, 0.13094953, 0.26094568, 0.0071285...","[0.29874167, -0.14668378, -0.5163954, 0.054965...","[0.17326279, -0.07325241, 0.17947382, 0.368167...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
import zipfile
import os

def zipdir(path, ziph):
    """Add every file found under ``path`` to the open archive ``ziph``.

    Args:
        path: Directory that is walked recursively.
        ziph: Writable ``zipfile.ZipFile`` handle. Each file is stored under
            its path relative to ``path``.
    """
    for folder, _subfolders, filenames in os.walk(path):
        for filename in filenames:
            source = os.path.join(folder, filename)
            ziph.write(source, os.path.relpath(source, path))
            
# Compress the whole "Models" directory into "Models.zip".
path = "Models"
# The context manager finalises and closes the archive even if zipdir raises.
with zipfile.ZipFile(f"{path}.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
    zipdir(path, zipf)

In [11]:
# Compress the whole "datasets" directory into "datasets.zip".
path = "datasets"
# The context manager finalises and closes the archive even if zipdir raises.
with zipfile.ZipFile(f"{path}.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
    zipdir(path, zipf)