# Sentiment140

In this notebook, we try to predict the polarity (sentiment) of a tweet.

https://medium.com/@alyafey22/sentiment-classification-from-keras-to-the-browser-7eda0d87cdc6

In [2]:
# imports
import matplotlib.pyplot as plt
from loguru import logger
import tensorflow as tf
import pandas as pd 
from sklearn.model_selection import train_test_split

from pathlib import Path

import numpy as np
 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, GRU, Embedding
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.datasets import mnist
import re


In [3]:
from pkg import constants
from pkg import main
from pkg import multi_partner_learning
from pkg import scenario
from pkg import utils
from pkg.dataset import Dataset

# Download and unzip data if needed

In [None]:
# Fetch the Sentiment140 archive and unpack it into the current working directory.
# NOTE(review): both commands run unconditionally each time this cell is executed;
# skip the cell when the CSV files are already present.
!curl https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip --output trainingandtestdata.zip
!unzip trainingandtestdata.zip

# Load it

In [4]:
# The CSV files are read without a header row, so column names are supplied here.
COLUMN_NAMES = ["polarity", "id", "date", "query", "user", "text"]
SAMPLE_FRACTION = 0.1  # share of the training rows kept
RANDOM_STATE = 42      # fixed seed so the subsample is the same on every run

df_train = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = "raw_unicode_escape", header=None)
df_test = pd.read_csv("testdata.manual.2009.06.14.csv", encoding = "raw_unicode_escape",  header=None)

df_train.columns = COLUMN_NAMES
df_test.columns = COLUMN_NAMES

# We keep only a fraction of the whole dataset

# Seeding the sampling makes the vocabulary, the sequence length and every
# downstream result reproducible after a kernel restart.
df_train = df_train.sample(frac = SAMPLE_FRACTION, random_state = RANDOM_STATE)

x = df_train["text"]
y = df_train["polarity"]

# Preprocessing functions

In [5]:
def process(txt):
    """Turn a raw text into a list of lower-case word tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; what remains is lower-cased and split on whitespace.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
    return cleaned.lower().split()

def getMax(data):
    """Return the largest whitespace-separated token count among the texts in ``data``.

    Texts are split as they are (no cleaning is applied).
    Returns 0 when ``data`` is empty.
    """
    return max((len(txt.split()) for txt in data), default=0)


def tokenize(thresh = 5):
    """Build the vocabulary from the module-level training texts ``x``.

    Words (as produced by ``process``) that occur at least ``thresh`` times
    receive consecutive integer ids starting at 1, in order of first
    appearance. Id 0 is never assigned.

    Returns:
        dict mapping each retained word to its integer id.
    """
    frequencies = {}
    for txt in x:
        for token in process(txt):
            frequencies[token] = frequencies.get(token, 0) + 1

    frequent_words = (
        token for token, occurrences in frequencies.items() if occurrences >= thresh
    )
    return {token: position for position, token in enumerate(frequent_words, start=1)}


def create_sequences(data):
    """Encode texts as fixed-length, left-padded sequences of vocabulary ids.

    Each text is tokenised with ``process``; every token is replaced by its id
    in the module-level ``word_index`` (0 for words not in the vocabulary) and
    the ids are right-aligned in a row of ``max_tokens`` entries, padded with 0
    on the left. Texts with more than ``max_tokens`` tokens keep only their
    last ``max_tokens`` tokens.

    Args:
        data: iterable of raw text strings.

    Returns:
        numpy array with one row of ``max_tokens`` ids per text.
    """
    tokens = []
    for txt in data:
        words = process(txt)
        # Truncate over-long texts. Previously the offset below became negative,
        # so writes wrapped around the row (leaving stale ids wherever an unknown
        # word occurred) or raised IndexError for texts longer than 2 * max_tokens.
        if len(words) > max_tokens:
            words = words[len(words) - max_tokens:]
        start = max_tokens - len(words)
        seq = [0] * max_tokens
        for i, word in enumerate(words):
            # Unknown words keep the padding id 0
            seq[start + i] = word_index.get(word, 0)
        tokens.append(seq)
    return np.array(tokens)

def preprocess_dataset_labels(y):
    """Rescale the labels by dividing each of them by 4.

    With the {0, 4} polarity values seen in this notebook the result is {0, 1}.

    Returns:
        numpy array holding the rescaled labels.
    """
    scaled = []
    for label in y:
        scaled.append(label / 4)
    return np.array(scaled)

In [6]:
# Vocabulary built from the training texts, and its size
word_index = tokenize()
num_words = len(word_index)

# Every encoded tweet is padded to the longest tweet's token count
max_tokens = getMax(x)
input_shape = max_tokens

# Number of distinct polarity values present in the sampled labels
num_classes = len(np.unique(y))

print('length of the dictionary ', num_words)
print('max token ', max_tokens)
print('num classes', num_classes)

length of the dictionary  15136
max token  39
num classes 2


# Let's see how our preprocessing works

In [7]:
# Punctuation and symbols are stripped, and the remaining words are lower-cased
process("Th**is. is -a- #test ")

['this', 'is', 'a', 'test']

In [8]:
# The row is left-padded with zeros up to max_tokens; the trailing entries are
# the vocabulary ids of the cleaned words
print(create_sequences(["Th**is. is -a- #test "]))

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0  134  153  110 1552]]


# Create a custom scenario from the mandatory parameters

In [9]:
# Mandatory settings: how many partners take part, and the share of the
# training data given to each of them
scenario_params = dict(
    partners_count=3,
    amounts_per_partner=[0.2, 0.5, 0.3],
)

# Set values for optional parameters

In [10]:
# Optional training-schedule settings added to the same parameter dictionary
scenario_params.update(epoch_count=10, minibatch_count=3)

#### Every other parameter will be set to its default value

In [11]:
# NOTE(review): absolute path on the original author's Windows machine; adjust it
# (ideally to a location relative to the repository) before running elsewhere.
experiment_path = Path(r"C:\GitHub\distributed-learning-contributivity\experiments\nlp")

In [12]:
# Build the scenario from the parameters above. As the log output shows, it is
# initialised with the mnist dataset; that dataset is replaced further down,
# when `current_scenario.dataset` is assigned.
current_scenario = scenario.Scenario(
        scenario_params,
        experiment_path
    )

2020-08-10 16:26:29.628 | DEBUG    | pkg.scenario:__init__:52 - Dataset selected: mnist
2020-08-10 16:26:29.629 | DEBUG    | pkg.scenario:__init__:87 - Computation use the full dataset for scenario #1
2020-08-10 16:26:29.701 | INFO     | pkg.scenario:__init__:279 - ### Description of data scenario configured:
2020-08-10 16:26:29.702 | INFO     | pkg.scenario:__init__:280 -    Number of partners defined: 3
2020-08-10 16:26:29.703 | INFO     | pkg.scenario:__init__:281 -    Data distribution scenario chosen: random
2020-08-10 16:26:29.703 | INFO     | pkg.scenario:__init__:282 -    Multi-partner learning approach: fedavg
2020-08-10 16:26:29.704 | INFO     | pkg.scenario:__init__:283 -    Weighting option: uniform
2020-08-10 16:26:29.704 | INFO     | pkg.scenario:__init__:284 -    Iterations parameters: 10 epochs > 3 mini-batches > 8 gradient updates per pass
2020-08-10 16:26:29.705 | INFO     | pkg.scenario:__init__:290 - ### Data loaded: mnist
2020-08-10 16:26:29.705 | INFO     | pkg.sc

# Create Data Set

In [13]:
# Split features and labels in a single call; shuffle=False keeps the row order,
# so both are cut at the same position and stay aligned
X_train, X_test, y_train, y_test = train_test_split(create_sequences(x), y, shuffle = False)



In [14]:
# Convert every split to a plain numpy array. y_train / y_test are slices of the
# pandas "polarity" column at this point, so this also makes positional indexing
# such as y_train[0] work.
X_train, X_test, y_train, y_test = (
    np.array(part) for part in (X_train, X_test, y_train, y_test)
)

In [15]:
# Inspect the first encoded tweet and its label. The label is still the raw
# polarity value here; it is only rescaled by preprocess_dataset_labels.
print(X_train[0])
print(y_train[0])

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  1  2  3  4  5  6  5  7  8  9 10 11]
4


# Create Model

In [16]:
def generate_new_model_for_dataset():
    """Build and compile a fresh, untrained sentiment classifier.

    Architecture: an embedding layer over the vocabulary, three stacked GRU
    layers (16, 8 and 4 units) and a single sigmoid output unit. The model is
    compiled with binary cross-entropy loss, the Adam optimizer and the
    accuracy metric.

    Relies on the module-level ``num_words`` and ``max_tokens``.

    Returns:
        The compiled keras ``Sequential`` model.
    """
    model = Sequential()
    embedding_size = 8
    # Word ids run from 1 to num_words and 0 is used for padding, so the
    # embedding table needs num_words + 1 rows (input_dim = largest id + 1).
    # With input_dim=num_words the word holding the last id was out of range.
    model.add(Embedding(input_dim=num_words + 1,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

    # The first two GRU layers return full sequences so that the next GRU
    # receives one vector per time step
    model.add(GRU(units=16, name = "gru_1",return_sequences=True))
    model.add(GRU(units=8, name = "gru_2" ,return_sequences=True))
    model.add(GRU(units=4, name= "gru_3"))
    model.add(Dense(1, activation='sigmoid',name="dense_1"))
    model.compile(loss='binary_crossentropy',
              optimizer="Adam",
              metrics=['accuracy'])
    return model

# Assign the dataset to the scenario

In [17]:
# Replace the scenario's dataset with the data prepared in this notebook.
# The label-preprocessing function and the model factory are passed as
# callables (they are not called here).
current_scenario.dataset = Dataset(
    "my_dataset",
    X_train,
    X_test,
    y_train,
    y_test,
    input_shape,
    num_classes,
    preprocess_dataset_labels,
    generate_new_model_for_dataset
)

In [18]:
# Check the name of the dataset now attached to the scenario
print(current_scenario.dataset.name)

my_dataset


# Split train and validation sets

In [19]:
# NOTE(review): presumably carves a validation set out of the training data — confirm in pkg.dataset
current_scenario.dataset.train_val_split()

# Legacy 

In [20]:
# NOTE(review): marked "Legacy"; presumably clears any partners created when the
# scenario was built so that they are recreated for the new dataset — TODO confirm
current_scenario.partners_list = []

# Run scenario

In [21]:
# Run the configured scenario; the log reports the data split among partners and
# the per-partner validation accuracy for each epoch and minibatch
main.run_scenario(current_scenario)

2020-08-10 16:26:32.071 | INFO     | pkg.scenario:split_data:534 - ### Splitting data among partners:
2020-08-10 16:26:32.072 | INFO     | pkg.scenario:split_data:535 -    Simple split performed.
2020-08-10 16:26:32.072 | INFO     | pkg.scenario:split_data:536 -    Nb of samples split amongst partners: 77760
2020-08-10 16:26:32.073 | INFO     | pkg.scenario:split_data:538 -    Partner #0: 15552 samples with labels [0, 4]
2020-08-10 16:26:32.073 | INFO     | pkg.scenario:split_data:538 -    Partner #1: 38880 samples with labels [0, 4]
2020-08-10 16:26:32.074 | INFO     | pkg.scenario:split_data:538 -    Partner #2: 23328 samples with labels [0, 4]
2020-08-10 16:26:32.212 | DEBUG    | pkg.scenario:compute_batch_sizes:582 -    Compute batch sizes, partner #0: 648
2020-08-10 16:26:32.213 | DEBUG    | pkg.scenario:compute_batch_sizes:582 -    Compute batch sizes, partner #1: 1620
2020-08-10 16:26:32.213 | DEBUG    | pkg.scenario:compute_batch_sizes:582 -    Compute batch sizes, partner #2: 

2020-08-10 16:28:26.332 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #0 (0/2) > val_acc: 0.63
2020-08-10 16:28:32.062 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #1 (1/2) > val_acc: 0.62
2020-08-10 16:28:37.799 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 01/09 > Minibatch 02/02 > Partner id #2 (2/2) > val_acc: 0.64
2020-08-10 16:28:37.803 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-10 16:28:40.037 | INFO     | pkg.multi_partner_learning:compute_test_score:184 -    Model evaluation at the end of the epoch: ['0.681', '0.630']
2020-08-10 16:28:40.038 | DEBUG    | pkg.multi_partner_learning:compute_test_score:187 -       Checking if early stopping criteria are met:
2020-08-10 16:28:40.039 | DEBUG    |

2020-08-10 16:30:48.891 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:259 - Start new fedavg collaborative round ...
2020-08-10 16:30:48.892 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:271 - (fedavg) Minibatch n°0 of epoch n°4, init aggregated model for each partner with models from previous round
2020-08-10 16:30:57.213 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #0 (0/2) > val_acc: 0.68
2020-08-10 16:31:02.758 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #1 (1/2) > val_acc: 0.69
2020-08-10 16:31:09.391 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 04/09 > Minibatch 00/02 > Partner id #2 (2/2) > val_acc: 0.68
2020-08-10 16:31:09.396 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End o

2020-08-10 16:33:18.710 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:271 - (fedavg) Minibatch n°1 of epoch n°6, init aggregated model for each partner with models from previous round
2020-08-10 16:33:28.017 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #0 (0/2) > val_acc: 0.74
2020-08-10 16:33:33.636 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #1 (1/2) > val_acc: 0.74
2020-08-10 16:33:39.338 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 06/09 > Minibatch 01/02 > Partner id #2 (2/2) > val_acc: 0.74
2020-08-10 16:33:39.343 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-10 16:33:39.343 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:259 - Start new f

2020-08-10 16:36:04.665 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #0 (0/2) > val_acc: 0.76
2020-08-10 16:36:10.661 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #1 (1/2) > val_acc: 0.76
2020-08-10 16:36:17.390 | DEBUG    | pkg.multi_partner_learning:log_collaborative_round_partner_result:513 - Epoch 08/09 > Minibatch 02/02 > Partner id #2 (2/2) > val_acc: 0.76
2020-08-10 16:36:17.395 | DEBUG    | pkg.multi_partner_learning:compute_collaborative_round_fedavg:303 - End of fedavg collaborative round.
2020-08-10 16:36:19.762 | INFO     | pkg.multi_partner_learning:compute_test_score:184 -    Model evaluation at the end of the epoch: ['0.513', '0.764']
2020-08-10 16:36:19.763 | DEBUG    | pkg.multi_partner_learning:compute_test_score:187 -       Checking if early stopping criteria are met:
2020-08-10 16:36:19.763 | DEBUG    |

0

# Results

In [22]:
# Export the scenario as a DataFrame and tag it with constant identifiers.
# NOTE: "random_state" and "scenario_id" are only label columns added here;
# this cell does not seed any random generator.
df_results = current_scenario.to_dataframe()
df_results["random_state"] = 1
df_results["scenario_id"] = 1
print(df_results.columns)

Index(['aggregation_weighting', 'dataset_fraction_per_partner', 'dataset_name',
       'epoch_count', 'final_relative_nb_samples',
       'gradient_updates_per_pass_count', 'is_early_stopping',
       'learning_computation_time_sec', 'minibatch_count',
       'mpl_nb_epochs_done', 'mpl_test_score',
       'multi_partner_learning_approach', 'nb_samples_used', 'partners_count',
       'samples_split_description', 'scenario_name', 'short_scenario_name',
       'test_data_samples_count', 'train_data_samples_count', 'random_state',
       'scenario_id'],
      dtype='object')


In [23]:
# Test score recorded for the multi-partner learning run
print(df_results.mpl_test_score)

0    0.7683
Name: mpl_test_score, dtype: float64


In [24]:
# Prints only the default object repr of the multi-partner learning object
print(current_scenario.mpl)

<pkg.multi_partner_learning.MultiPartnerLearning object at 0x000002B08508DC08>


# Extract model 

In [25]:
# Retrieve the model from the multi-partner learning object for direct use
model = current_scenario.mpl.get_model()

In [26]:
# Evaluate on the held-out test split; the labels are rescaled with
# preprocess_dataset_labels first. The output is presumably [loss, accuracy],
# matching the loss and metric set in generate_new_model_for_dataset.
model.evaluate(X_test, preprocess_dataset_labels(y_test))



[0.5072355603456498, 0.7682999968528748]

A test accuracy of about 0.77 seems like a good result for a federated learning setup.