# Sentiment140

In this notebook, we try to predict the polarity of a tweet.

https://medium.com/@alyafey22/sentiment-classification-from-keras-to-the-browser-7eda0d87cdc6


Here, we use the Sentiment140 dataset with heavier preprocessing to show how much freedom is available to the end user.

In [1]:
# imports
import matplotlib.pyplot as plt
from loguru import logger
import tensorflow as tf
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

from pathlib import Path
import re


from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, GRU, Embedding
from keras.utils import np_utils

from pkg import main, scenario
from pkg.dataset import Dataset

Using TensorFlow backend.


# Download and unzip data if needed

In [None]:
!curl https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip --output trainingandtestdata.zip
!unzip trainingandtestdata.zip

# Define preprocessing functions

In [4]:
def process(txt):
    """Turn a raw text into a list of lower-cased alphanumeric tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed (not replaced by a space); the remainder is split on whitespace
    and each token is lower-cased.
    """
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
    return [token.lower() for token in stripped.split()]

def getMax(data):
    """Return the largest whitespace-separated token count among the texts in ``data``.

    The texts are split as-is (no cleaning). Returns 0 when ``data`` is empty.
    """
    return max((len(txt.split()) for txt in data), default=0)


def tokenize(thresh = 5, data = None):
    """Build a word -> integer index vocabulary from a collection of texts.

    Args:
        thresh: minimum number of occurrences a word needs in order to be kept.
        data: iterable of raw texts. When omitted, the notebook-level ``x`` is
            used, which is what the original zero-argument call relied on.

    Returns:
        dict mapping each kept word to an index starting at 1, in order of
        first appearance. Index 0 is never assigned.
    """
    if data is None:
        data = x  # backward-compatible fallback to the notebook global

    counts = {}
    for txt in data:
        for word in process(txt):
            counts[word] = counts.get(word, 0) + 1

    # dicts preserve insertion order, so indices follow first appearance.
    frequent = (word for word, occurrences in counts.items() if occurrences >= thresh)
    return {word: idx for idx, word in enumerate(frequent, start=1)}


def create_sequences(data):
    """Encode texts as fixed-length, left-padded sequences of word indices.

    Each text is tokenised with ``process``. Every token is replaced by its
    index in the notebook-level ``word_index`` (0 when the word is unknown)
    and the tokens are right-aligned in a row of ``max_tokens`` entries, the
    unused leading positions being 0. Texts with more than ``max_tokens``
    tokens keep only their last ``max_tokens`` tokens.

    Args:
        data: iterable of raw texts.

    Returns:
        np.ndarray with one row of ``max_tokens`` integers per text.
    """
    sequences = []
    for txt in data:
        words = process(txt)
        # Truncate from the front. Without this the offset below goes negative:
        # leading words are written through negative indices into the tail of
        # the row (or raise IndexError for very long texts).
        if len(words) > max_tokens:
            words = words[len(words) - max_tokens:]

        offset = max_tokens - len(words)
        seq = [0] * max_tokens
        for pos, word in enumerate(words, start=offset):
            # `process` already lower-cases; unknown words stay at 0.
            seq[pos] = word_index.get(word, 0)
        sequences.append(seq)
    return np.array(sequences)

def preprocess_dataset_labels(y):
    """Rescale raw polarity labels by dividing each one by 4.

    Labels 0 and 4 therefore become 0.0 and 1.0. The result is returned as a
    new NumPy array; ``y`` itself is left untouched.
    """
    return np.array([label / 4 for label in y])

# Let's see how our preprocessing works

In [6]:
# Punctuation is dropped (not replaced by a space) and tokens are lower-cased.
process("Th**is. is -a- #test ")

['this', 'is', 'a', 'test']

In [7]:
# NOTE: create_sequences reads `max_tokens` and `word_index`, which are only
# defined in the "Create dataset" cells further down - run those first.
print(create_sequences(["Th**is. is -a- #test "]))

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0  336   21    8 1673]]


# Create dataset

In [23]:
# The CSV files have no header row; name the columns at load time.
COLUMN_NAMES = ["polarity", "id", "date", "query", "user", "text"]

df_train = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding = "raw_unicode_escape",
    header = None,
    names = COLUMN_NAMES,
)
# NOTE(review): df_test is loaded but the cells shown below build the test split
# from the training sample instead - confirm whether it is still needed.
df_test = pd.read_csv(
    "testdata.manual.2009.06.14.csv",
    encoding = "raw_unicode_escape",
    header = None,
    names = COLUMN_NAMES,
)

# We keep only a fraction of the whole dataset.
# A fixed random_state makes the sample - and therefore the vocabulary, the
# sequence length and the training results - reproducible across runs.
df_train = df_train.sample(frac = 0.1, random_state = 42)

x = np.array(df_train["text"])
y = np.array(df_train["polarity"])

In [24]:
# Padded sequence length: the longest sampled text, counted in
# whitespace-separated tokens.
max_tokens = getMax(x)
input_shape = max_tokens

# Vocabulary of the sampled texts (word -> index) and its size.
word_index = tokenize()
num_words = len(word_index)

# Number of distinct polarity labels present in the sample.
num_classes = len(np.unique(y))

print('length of the dictionary ', num_words)
print('max token ', max_tokens)
print('num classes', num_classes)

length of the dictionary  15149
max token  38
num classes 2


In [None]:
# Split features and labels in ONE call so both are always cut at the same rows.
# Two separate calls only stay aligned as long as shuffling is disabled.
# shuffle=False keeps the sample order, so the test set is the tail of the sample.
X_train, X_test, y_train, y_test = train_test_split(
    create_sequences(x), y, shuffle = False
)

# Create custom scenario

In [25]:
# Only the parameters listed here are overridden.
# Every other parameter will be set to its default value.
scenario_params = {
    'partners_count': 3,
    'amounts_per_partner': [0.2, 0.5, 0.3],
    'epoch_count': 10,
    'minibatch_count': 3,
}

# Store experiment outputs relative to the current working directory rather than
# under a machine-specific absolute Windows path, so the notebook runs elsewhere.
experiment_path = Path.cwd() / "experiments" / "nlp"

current_scenario = scenario.Scenario(
        scenario_params,
        experiment_path
    )

2020-08-12 12:06:57.664 | DEBUG    | pkg.scenario:__init__:52 - Dataset selected: mnist
2020-08-12 12:06:57.665 | DEBUG    | pkg.scenario:__init__:87 - Computation use the full dataset for scenario #1
2020-08-12 12:06:57.739 | INFO     | pkg.scenario:__init__:279 - ### Description of data scenario configured:
2020-08-12 12:06:57.740 | INFO     | pkg.scenario:__init__:280 -    Number of partners defined: 3
2020-08-12 12:06:57.741 | INFO     | pkg.scenario:__init__:281 -    Data distribution scenario chosen: random
2020-08-12 12:06:57.741 | INFO     | pkg.scenario:__init__:282 -    Multi-partner learning approach: fedavg
2020-08-12 12:06:57.742 | INFO     | pkg.scenario:__init__:283 -    Weighting option: uniform
2020-08-12 12:06:57.742 | INFO     | pkg.scenario:__init__:284 -    Iterations parameters: 10 epochs > 3 mini-batches > 8 gradient updates per pass
2020-08-12 12:06:57.743 | INFO     | pkg.scenario:__init__:290 - ### Data loaded: mnist
2020-08-12 12:06:57.743 | INFO     | pkg.sc

# Create Model

In [28]:
def generate_new_model_for_dataset():
    """Build and compile the GRU-based sentiment classifier.

    Architecture: embedding (8 dimensions) -> GRU(16) -> GRU(8) -> GRU(4) ->
    one sigmoid unit, compiled with binary cross-entropy, Adam and accuracy.
    Reads the notebook-level ``num_words`` and ``max_tokens``.

    Returns:
        A new compiled ``Sequential`` model.
    """
    model = Sequential()
    embedding_size = 8
    # Word indices run from 1 to num_words and 0 is used for padding/unknown
    # words, so the embedding table needs num_words + 1 rows. With
    # input_dim=num_words the highest word index is out of range.
    model.add(Embedding(input_dim=num_words + 1,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

    # Stacked GRUs: the first two return full sequences to feed the next layer.
    model.add(GRU(units=16, name = "gru_1",return_sequences=True))
    model.add(GRU(units=8, name = "gru_2" ,return_sequences=True))
    model.add(GRU(units=4, name= "gru_3"))
    model.add(Dense(1, activation='sigmoid',name="dense_1"))
    model.compile(loss='binary_crossentropy',
              optimizer="Adam",
              metrics=['accuracy'])
    return model

# Assign dataset to scenario

In [29]:
# Replace the scenario's dataset (the creation log above reports "mnist") with
# the Sentiment140 data prepared in this notebook.
# Arguments are positional: a name, the train/test features and labels, the
# input shape, the class count, and the two callables defined above (label
# preprocessing and model factory).
# NOTE(review): argument order is assumed to match Dataset's signature - confirm.
current_scenario.dataset = Dataset(
    "my_dataset",
    X_train,
    X_test,
    y_train,
    y_test,
    input_shape,
    num_classes,
    preprocess_dataset_labels,
    generate_new_model_for_dataset
)

# Split train and validation sets

In [31]:
# Split the dataset's training data into train and validation sets
# (done by the Dataset object itself).
current_scenario.dataset.train_val_split()

# Legacy 

In [32]:
# Start from an empty partner list before running the scenario.
# NOTE(review): the "Legacy" heading suggests this is a workaround for the
# current pkg API - confirm whether it is still required.
current_scenario.partners_list = []

# Run scenario

In [33]:
# Run the configured scenario. Per the log below, this splits the data among
# the partners and trains with the fedavg approach, evaluating after each epoch.
main.run_scenario(current_scenario)

2020-08-12 12:07:16.225 | INFO     | pkg.scenario:split_data:534 - ### Splitting data among partners:
2020-08-12 12:07:16.226 | INFO     | pkg.scenario:split_data:535 -    Simple split performed.
2020-08-12 12:07:16.226 | INFO     | pkg.scenario:split_data:536 -    Nb of samples split amongst partners: 77760
2020-08-12 12:07:16.227 | INFO     | pkg.scenario:split_data:538 -    Partner #0: 15552 samples with labels [0, 4]
2020-08-12 12:07:16.227 | INFO     | pkg.scenario:split_data:538 -    Partner #1: 38880 samples with labels [0, 4]
2020-08-12 12:07:16.228 | INFO     | pkg.scenario:split_data:538 -    Partner #2: 23328 samples with labels [0, 4]
2020-08-12 12:07:16.348 | DEBUG    | pkg.scenario:compute_batch_sizes:582 -    Compute batch sizes, partner #0: 648
2020-08-12 12:07:16.349 | DEBUG    | pkg.scenario:compute_batch_sizes:582 -    Compute batch sizes, partner #1: 1620
2020-08-12 12:07:16.350 | DEBUG    | pkg.scenario:compute_batch_sizes:582 -    Compute batch sizes, partner #2: 

2020-08-12 12:09:02.092 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #0 (0/2) > val_acc: 0.65
2020-08-12 12:09:07.267 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #1 (1/2) > val_acc: 0.65
2020-08-12 12:09:12.646 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #2 (2/2) > val_acc: 0.65
2020-08-12 12:09:12.650 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-12 12:09:14.746 | INFO     | pkg.multi_partner_learning:compute_test_score:184 -    Model evaluation at the end of the epoch: ['0.678', '0.654']
2020-08-12 12:09:14.747 | DEBUG    | pkg.multi_partner_learning:compute_test_score:187 -       Checking if early stopping criteria are met:
2020-08-12 12:09:14.748 | DEBUG    |

2020-08-12 12:11:12.971 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:259 - Start new fedavg collaborative round ...
2020-08-12 12:11:12.972 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:271 - (fedavg) Minibatch n°0 of epoch n°4, init aggregated model for each partner with models from previous round
2020-08-12 12:11:21.724 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #0 (0/2) > val_acc: 0.7
2020-08-12 12:11:27.001 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #1 (1/2) > val_acc: 0.7
2020-08-12 12:11:32.446 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #2 (2/2) > val_acc: 0.7
2020-08-12 12:11:32.450 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of f

2020-08-12 12:13:29.288 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:271 - (fedavg) Minibatch n°1 of epoch n°6, init aggregated model for each partner with models from previous round
2020-08-12 12:13:38.378 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #0 (0/2) > val_acc: 0.75
2020-08-12 12:13:43.805 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #1 (1/2) > val_acc: 0.75
2020-08-12 12:13:49.202 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #2 (2/2) > val_acc: 0.75
2020-08-12 12:13:49.216 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-12 12:13:49.217 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:259 - Start new f

2020-08-12 12:15:57.018 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #0 (0/2) > val_acc: 0.77
2020-08-12 12:16:02.327 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #1 (1/2) > val_acc: 0.77
2020-08-12 12:16:07.686 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #2 (2/2) > val_acc: 0.77
2020-08-12 12:16:07.704 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-12 12:16:09.796 | INFO     | pkg.multi_partner_learning:compute_test_score:184 -    Model evaluation at the end of the epoch: ['0.497', '0.769']
2020-08-12 12:16:09.796 | DEBUG    | pkg.multi_partner_learning:compute_test_score:187 -       Checking if early stopping criteria are met:
2020-08-12 12:16:09.796 | DEBUG    |

0

# Results

In [18]:
# Collect the scenario results into a DataFrame and show the multi-partner
# learning test score.
# NOTE(review): the saved output below lists the DataFrame's column index
# rather than this column's values - it looks stale; re-run the cell.
df_results = current_scenario.to_dataframe()
print(df_results["mpl_test_score"])

Index(['aggregation_weighting', 'dataset_fraction_per_partner', 'dataset_name',
       'epoch_count', 'final_relative_nb_samples',
       'gradient_updates_per_pass_count', 'is_early_stopping',
       'learning_computation_time_sec', 'minibatch_count',
       'mpl_nb_epochs_done', 'mpl_test_score',
       'multi_partner_learning_approach', 'nb_samples_used', 'partners_count',
       'samples_split_description', 'scenario_name', 'short_scenario_name',
       'test_data_samples_count', 'train_data_samples_count', 'random_state',
       'scenario_id'],
      dtype='object')
