# Sentiment140

In this notebook, we try to predict the polarity (negative or positive sentiment) of a tweet.

https://medium.com/@alyafey22/sentiment-classification-from-keras-to-the-browser-7eda0d87cdc6


Here, we use the Sentiment140 dataset with heavier preprocessing to show how much freedom is available to the end user.

In [1]:
# imports
import matplotlib.pyplot as plt
from loguru import logger
import tensorflow as tf
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

from pathlib import Path
import re


from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, GRU, Embedding
from keras.utils import np_utils

from subtest.dataset import Dataset
from subtest.scenario import Scenario, run_scenario

Using TensorFlow backend.


# Download and unzip data if needed

In [None]:
!curl https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip --output trainingandtestdata.zip
!unzip trainingandtestdata.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 77.5M    0 16384    0     0  17192      0  1:18:52 --:--:--  1:18:52 17192
  1 77.5M    1 1568k    0     0   804k      0  0:01:38  0:00:01  0:01:37  804k
 10 77.5M   10 8544k    0     0  2936k      0  0:00:27  0:00:02  0:00:25 2936k
 19 77.5M   19 14.9M    0     0  3916k      0  0:00:20  0:00:03  0:00:17 3916k
 29 77.5M   29 22.9M    0     0  4782k      0  0:00:16  0:00:04  0:00:12 4782k
 40 77.5M   40 31.5M    0     0  5468k      0  0:00:14  0:00:05  0:00:09 6517k
 52 77.5M   52 40.7M    0     0  6012k      0  0:00:13  0:00:06  0:00:07 8051k
 66 77.5M   66 51.9M    0     0  6725k      0  0:00:11  0:00:07  0:00:04 8930k
 80 77.5M   80 62.2M    0     0  7055k      0  0:00:11  0:00:09  0:00:02 9440k
 91 77.5M   91 71.1M    0     0  7359k      0  0:00

# Define preprocessing functions

In [2]:
def process(txt):
    """Turn a raw text into a list of lowercase alphanumeric tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; the remaining text is lowercased and split on whitespace.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
    return cleaned.lower().split()

def getMax(data):
    """Return the largest whitespace-separated token count among the texts in `data`.

    The raw texts are split as-is (no cleaning is applied here).
    Returns 0 when `data` is empty.
    """
    return max((len(txt.split()) for txt in data), default=0)


def tokenize(thresh = 5, data = None):
    """Build a word -> integer index vocabulary from a corpus.

    Parameters
    ----------
    thresh : int
        Minimum number of occurrences (after `process` cleaning) required
        for a word to be kept in the vocabulary.
    data : iterable of str, optional
        Texts to build the vocabulary from. When omitted, the notebook-level
        variable `x` is used, which is the original behaviour.

    Returns
    -------
    dict
        Maps each retained word to an index. Indices start at 1 and follow
        the order in which words first appear; 0 is never assigned.
    """
    texts = x if data is None else data

    # Count occurrences of every cleaned token across the corpus.
    count = dict()
    for txt in texts:
        for word in process(txt):
            count[word] = count.get(word, 0) + 1

    # dicts preserve insertion order, so indices follow first appearance.
    frequent_words = (word for word in count if count[word] >= thresh)
    return {word: idx for idx, word in enumerate(frequent_words, start=1)}


def create_sequences(data):
    """Convert texts into fixed-length, left-padded sequences of word indices.

    Each text is cleaned with `process`, then every token is replaced by its
    index in the notebook-level `word_index`; tokens missing from the
    vocabulary are encoded as 0. Sequences are `max_tokens` long
    (notebook-level variable) and padded with 0 on the left.

    Texts with more than `max_tokens` tokens keep only their last
    `max_tokens` tokens. Previously such texts produced a negative offset
    and were written at wrapped-around positions without any error.

    Parameters
    ----------
    data : iterable of str
        Raw texts to encode.

    Returns
    -------
    numpy.ndarray
        One row per text.
    """
    sequences = []
    for txt in data:
        words = process(txt)
        if len(words) > max_tokens:
            # Truncate from the front so the offset below can never be negative.
            words = words[len(words) - max_tokens:]
        seq = [0] * max_tokens
        # Right-align the tokens: leading positions stay 0 (padding).
        offset = max_tokens - len(words)
        for position, word in enumerate(words, start=offset):
            seq[position] = word_index.get(word, 0)
        sequences.append(seq)
    return np.array(sequences)

def preprocess_dataset_labels(y):
    """Rescale polarity labels by dividing each one by 4 (so 0 -> 0.0 and 4 -> 1.0).

    Returns the rescaled labels as a NumPy array.
    """
    return np.asarray(y) / 4

# Create dataset

In [3]:
# Load both Sentiment140 CSV files; they have no header row, so column names are set by hand.
df_train = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = "raw_unicode_escape", header=None)
# NOTE(review): df_test is loaded but not used by any other cell visible in this notebook.
df_test = pd.read_csv("testdata.manual.2009.06.14.csv", encoding = "raw_unicode_escape",  header=None)

df_train.columns = ["polarity", "id", "date", "query", "user", "text"]
df_test.columns = ["polarity", "id", "date", "query", "user", "text"]

# We keep only a fraction of the whole dataset.
# The sample is seeded so that the vocabulary, the train/test split and the
# reported scores are reproducible across kernel restarts.
df_train = df_train.sample(frac = 0.1, random_state = 42)

# Raw tweet texts and their polarity labels, as NumPy arrays.
x = np.array(df_train["text"])
y = np.array(df_train["polarity"])

In [4]:
# Vocabulary built from the sampled texts (tokenize() reads the notebook-level `x`).
word_index = tokenize()
num_words = len(word_index)

# Longest text, in whitespace-separated tokens; reused as the model input length.
max_tokens = getMax(x)
input_shape = max_tokens

# Number of distinct polarity values present in the labels.
num_classes = len(np.unique(y))


print('length of the dictionary ',len(word_index))
print('max token ', max_tokens) 
print('num classes', num_classes)

length of the dictionary  15224
max token  38
num classes 2


In [5]:
# Split features and labels in a single call so both are cut at the same row.
# shuffle=False keeps the rows in their current order.
X_train, X_test, y_train, y_test = train_test_split(create_sequences(x), y, shuffle = False)

# Create custom scenario

In [6]:
# Only the parameters listed here are overridden;
# every other parameter will be set to its default value.
scenario_params = {
    'partners_count': 3,
    'amounts_per_partner': [0.2, 0.5, 0.3],
    'epoch_count': 10,
    'minibatch_count': 3,
}

# NOTE(review): absolute, machine-specific Windows path; adjust it before running elsewhere.
experiment_path = Path(r"C:\GitHub\distributed-learning-contributivity\experiments\nlp")

current_scenario = Scenario(scenario_params, experiment_path)

2020-08-19 13:34:02.069 | DEBUG    | subtest.scenario:__init__:54 - Dataset selected: mnist
2020-08-19 13:34:02.070 | DEBUG    | subtest.scenario:__init__:89 - Computation use the full dataset for scenario #1
2020-08-19 13:34:02.167 | INFO     | subtest.scenario:__init__:281 - ### Description of data scenario configured:
2020-08-19 13:34:02.168 | INFO     | subtest.scenario:__init__:282 -    Number of partners defined: 3
2020-08-19 13:34:02.169 | INFO     | subtest.scenario:__init__:283 -    Data distribution scenario chosen: random
2020-08-19 13:34:02.169 | INFO     | subtest.scenario:__init__:284 -    Multi-partner learning approach: fedavg
2020-08-19 13:34:02.170 | INFO     | subtest.scenario:__init__:285 -    Weighting option: uniform
2020-08-19 13:34:02.170 | INFO     | subtest.scenario:__init__:286 -    Iterations parameters: 10 epochs > 3 mini-batches > 8 gradient updates per pass
2020-08-19 13:34:02.171 | INFO     | subtest.scenario:__init__:292 - ### Data loaded: mnist
2020-08

# Create Model

In [7]:
def generate_new_model_for_dataset():
    """Build and compile a fresh Keras model for binary sentiment classification.

    Architecture: an embedding layer followed by three stacked GRU layers
    (16, 8 and 4 units) and a single sigmoid output unit. The model is
    compiled with binary cross-entropy, the Adam optimizer and accuracy as
    metric.

    Reads the notebook-level variables `num_words` (vocabulary size) and
    `max_tokens` (sequence length).

    Returns
    -------
    keras.models.Sequential
        A newly created, compiled model.
    """
    model = Sequential()
    embedding_size = 8
    # Token ids range from 0 (padding / out-of-vocabulary) up to num_words
    # inclusive, because the vocabulary is indexed from 1. Keras expects
    # input_dim to be the largest index + 1, hence num_words + 1.
    model.add(Embedding(input_dim=num_words + 1,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

    # The first two GRU layers return full sequences so the next GRU can consume them.
    model.add(GRU(units=16, name = "gru_1",return_sequences=True))
    model.add(GRU(units=8, name = "gru_2" ,return_sequences=True))
    model.add(GRU(units=4, name= "gru_3"))
    model.add(Dense(1, activation='sigmoid',name="dense_1"))
    model.compile(loss='binary_crossentropy',
              optimizer="Adam",
              metrics=['accuracy'])
    return model

# Assign dataset to scenario

In [8]:
# Replace the scenario's dataset (the creation log above reports "mnist")
# with the Sentiment140 data prepared in this notebook.
# The last two arguments are passed as functions, not called here:
# the label preprocessing function and the model factory.
current_scenario.dataset = Dataset(
    "my_dataset",
    X_train,
    X_test,
    y_train,
    y_test,
    input_shape,
    num_classes,
    preprocess_dataset_labels,
    generate_new_model_for_dataset
)

# Split train and validation sets

In [9]:
# Let the dataset object perform its own train/validation split
# (project API; the split ratio is not visible in this notebook).
current_scenario.dataset.train_val_split()

# Run scenario

In [11]:
# Run the configured scenario; progress is reported through the log output.
run_scenario(current_scenario)

2020-08-19 13:34:02.488 | INFO     | subtest.scenario:split_data:536 - ### Splitting data among partners:
2020-08-19 13:34:02.488 | INFO     | subtest.scenario:split_data:537 -    Simple split performed.
2020-08-19 13:34:02.488 | INFO     | subtest.scenario:split_data:538 -    Nb of samples split amongst partners: 77760
2020-08-19 13:34:02.488 | INFO     | subtest.scenario:split_data:540 -    Partner #0: 15552 samples with labels [0, 4]
2020-08-19 13:34:02.494 | INFO     | subtest.scenario:split_data:540 -    Partner #1: 38880 samples with labels [0, 4]
2020-08-19 13:34:02.494 | INFO     | subtest.scenario:split_data:540 -    Partner #2: 23328 samples with labels [0, 4]
2020-08-19 13:34:02.810 | DEBUG    | subtest.scenario:compute_batch_sizes:584 -    Compute batch sizes, partner #0: 648
2020-08-19 13:34:02.811 | DEBUG    | subtest.scenario:compute_batch_sizes:584 -    Compute batch sizes, partner #1: 1620
2020-08-19 13:34:02.811 | DEBUG    | subtest.scenario:compute_batch_sizes:584 - 

2020-08-19 13:35:57.097 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #0 (0/2) > val_acc: 0.62
2020-08-19 13:36:02.204 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #1 (1/2) > val_acc: 0.63
2020-08-19 13:36:07.704 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #2 (2/2) > val_acc: 0.63
2020-08-19 13:36:07.707 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-19 13:36:09.896 | INFO     | subtest.multi_partner_learning:compute_test_score:184 -    Model evaluation at the end of the epoch: ['0.685', '0.631']
2020-08-19 13:36:09.897 | DEBUG    | subtest.multi_partner_learning:compute_test_score:187 -       Checking if early stopping criteria are met:
2020-08-19 1

2020-08-19 13:38:11.051 | DEBUG    | subtest.multi_partner_learning:compute_test_score:197 -          -> Early stopping criteria are not met, continuing with training.
2020-08-19 13:38:11.062 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:259 - Start new fedavg collaborative round ...
2020-08-19 13:38:11.063 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:271 - (fedavg) Minibatch n°0 of epoch n°4, init aggregated model for each partner with models from previous round
2020-08-19 13:38:19.324 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #0 (0/2) > val_acc: 0.67
2020-08-19 13:38:24.926 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #1 (1/2) > val_acc: 0.67
2020-08-19 13:38:31.121 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_res

2020-08-19 13:40:32.509 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-19 13:40:32.510 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:259 - Start new fedavg collaborative round ...
2020-08-19 13:40:32.510 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:271 - (fedavg) Minibatch n°1 of epoch n°6, init aggregated model for each partner with models from previous round
2020-08-19 13:40:41.343 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #0 (0/2) > val_acc: 0.74
2020-08-19 13:40:46.754 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #1 (1/2) > val_acc: 0.74
2020-08-19 13:40:52.246 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > 

2020-08-19 13:42:54.966 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-19 13:42:54.967 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:259 - Start new fedavg collaborative round ...
2020-08-19 13:42:54.967 | DEBUG    | subtest.multi_partner_learning:compute_collaborative_round_fedavg:271 - (fedavg) Minibatch n°2 of epoch n°8, init aggregated model for each partner with models from previous round
2020-08-19 13:43:02.974 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #0 (0/2) > val_acc: 0.75
2020-08-19 13:43:08.427 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #1 (1/2) > val_acc: 0.76
2020-08-19 13:43:14.035 | DEBUG    | subtest.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > 

0

# Results

In [12]:
# Export the scenario results to a DataFrame and show the test score column.
df_results = current_scenario.to_dataframe()
print(df_results["mpl_test_score"])

0    0.76815
Name: mpl_test_score, dtype: float64
