# Sentiment140

In this notebook, we try to predict the polarity (negative or positive sentiment) of a tweet.

https://medium.com/@alyafey22/sentiment-classification-from-keras-to-the-browser-7eda0d87cdc6


In this notebook, we use the [sentiment140](http://help.sentiment140.com/for-students) dataset with heavier, custom preprocessing, to show how much freedom is available to the end user.

In [1]:
!wget https://raw.githubusercontent.com/SubstraFoundation/distributed-learning-contributivity/Moving-functions/requirements.txt
!pip install -r requirements.txt
!pip install -i https://test.pypi.org/simple/ subtest==0.0.0.8

--2020-08-26 11:21:16--  https://raw.githubusercontent.com/SubstraFoundation/distributed-learning-contributivity/Moving-functions/requirements.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 176 [text/plain]
Saving to: ‘requirements.txt’


2020-08-26 11:21:17 (4.24 MB/s) - ‘requirements.txt’ saved [176/176]

Collecting Keras==2.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377kB)
[K     |████████████████████████████████| 378kB 2.8MB/s 
[?25hCollecting matplotlib==3.1.3
[?25l  Downloading https://files.pythonhosted.org/packages/7e/07/4b361d6d0f4e08942575f83a11d33f36897e1aae4279046606dd1808778a/matplotlib-3.1.3-cp36-cp36m-manylinux1_x86_64.w

Looking in indexes: https://test.pypi.org/simple/
Collecting subtest==0.0.0.8
[?25l  Downloading https://test-files.pythonhosted.org/packages/9f/62/c90051a4e9247eb15f8ac5af1dc92ff7cfbd6ccfbbc763499398c99af44a/subtest-0.0.0.8-py3-none-any.whl (43kB)
[K     |████████████████████████████████| 51kB 638kB/s 
Installing collected packages: subtest
Successfully installed subtest-0.0.0.8


In [2]:
# imports
import matplotlib.pyplot as plt
from loguru import logger
import tensorflow as tf
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

from pathlib import Path
import re


from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, GRU, Embedding
from keras.utils import np_utils

from subtest.dataset import Dataset
from subtest.scenario import Scenario, run_scenario

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


# Download and unzip data if needed

In [3]:
!curl https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip --output trainingandtestdata.zip
!unzip trainingandtestdata.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 77.5M  100 77.5M    0     0  15.0M      0  0:00:05  0:00:05 --:--:-- 15.9M
Archive:  trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


# Define preprocessing functions

In [4]:
def process(txt):
    """Normalise a raw text into a list of lower-case word tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, then the remaining text is lower-cased and split on whitespace.

    Args:
        txt: the raw text (str).

    Returns:
        A list of lower-case tokens; empty if nothing survives the cleaning.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
    # Only ASCII letters/digits and whitespace remain, so lower-casing the
    # whole string is equivalent to lower-casing each token.
    return cleaned.lower().split()

def getMax(data):
    """Return the largest number of whitespace-separated tokens found in any text.

    Note that tokens are counted on the raw text (plain `str.split()`), not on
    the cleaned output of `process`.

    Args:
        data: an iterable of strings.

    Returns:
        The maximum token count, or 0 when `data` is empty.
    """
    return max((len(txt.split()) for txt in data), default=0)


def tokenize(thresh = 5, data = None):
    """Build the vocabulary: a mapping from frequent words to integer ids.

    Each text is cleaned with `process`, word occurrences are counted over the
    whole corpus, and every word seen at least `thresh` times receives an id.
    Ids start at 1 and follow the order in which words are first encountered;
    0 is never assigned.

    Args:
        thresh: minimum number of occurrences for a word to be kept.
        data: iterable of raw texts. When omitted, the notebook-level
            variable `x` is used (the original behaviour).

    Returns:
        A dict {word: id}.
    """
    if data is None:
        # Backward-compatible fallback on the notebook-level corpus.
        data = x

    count = dict()
    for txt in data:
        for word in process(txt):
            count[word] = count.get(word, 0) + 1

    # dicts preserve insertion order, so ids follow first-occurrence order.
    frequent_words = (word for word, occurrences in count.items() if occurrences >= thresh)
    return {word: idx for idx, word in enumerate(frequent_words, start=1)}


def create_sequences(data):
    """Encode texts as fixed-length, left-padded sequences of word ids.

    Each text is cleaned with `process`; every word found in the notebook-level
    `word_index` is replaced by its id and out-of-vocabulary words become 0.
    Sequences are right-aligned in a buffer of `max_tokens` (notebook-level)
    zeros. Texts longer than `max_tokens` keep only their last `max_tokens`
    words instead of wrapping around through negative indices.

    Args:
        data: an iterable of raw texts.

    Returns:
        A numpy array with one row of `max_tokens` ids per text.
    """
    tokens = []
    for txt in data:
        words = process(txt)
        if len(words) > max_tokens:
            # Truncate from the front so that `start` below never goes negative
            # (a negative start silently overwrote earlier positions or raised
            # IndexError for very long texts).
            words = words[len(words) - max_tokens:]

        seq = [0] * max_tokens
        start = max_tokens - len(words)  # left padding width
        for position, word in enumerate(words, start=start):
            if word in word_index:
                seq[position] = word_index[word]
        tokens.append(seq)
    return np.array(tokens)

def preprocess_dataset_labels(y):
    """Rescale labels by dividing each of them by 4.

    Args:
        y: array-like of numeric labels.

    Returns:
        A numpy array holding every label divided by 4 (so 0 -> 0.0, 4 -> 1.0).
    """
    return np.asarray(y) / 4

# Create dataset

In [5]:
# Load the raw Sentiment140 CSV files (they ship without a header row).
df_train = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = "raw_unicode_escape", header=None)
df_test = pd.read_csv("testdata.manual.2009.06.14.csv", encoding = "raw_unicode_escape",  header=None)

df_train.columns = ["polarity", "id", "date", "query", "user", "text"]
df_test.columns = ["polarity", "id", "date", "query", "user", "text"]

# We keep only a fraction of the whole dataset.
# The seed is fixed so that the sample (and therefore the vocabulary, the
# sequence length and the final score) is the same on every run.
df_train = df_train.sample(frac = 0.1, random_state = 42)

# Raw texts and their polarity labels, as numpy arrays.
x = np.array(df_train["text"])
y = np.array(df_train["polarity"])

In [6]:
# Length of the longest text (in whitespace-separated tokens); used as the
# fixed length of every encoded sequence.
max_tokens = getMax(x)

# Vocabulary: word -> integer id. Ids start at 1, so num_words counts real
# words only (0 is what create_sequences uses for padding/unknown words).
word_index = tokenize()
num_words = len(word_index)

# Metadata handed to the Dataset further down.
num_classes = len(np.unique(y))
input_shape = max_tokens


print('length of the dictionary ',num_words)
print('max token ', max_tokens) 
print('num classes', num_classes)

length of the dictionary  15174
max token  39
num classes 2


In [7]:
# Split encoded texts and labels in a single call so both are cut at the same
# row. No shuffling is applied: the first rows become the training set and the
# remaining rows (scikit-learn's default test fraction) the test set.
X_train, X_test, y_train, y_test = train_test_split(create_sequences(x), y, shuffle = False)

# Create custom scenario

In [8]:
# Scenario configuration: three partners holding 20% / 50% / 30% of the data,
# trained for 10 epochs with 3 mini-batches each.
scenario_params = {
    'partners_count': 3,
    'amounts_per_partner': [0.2, 0.5, 0.3],
    'epoch_count': 10,
    'minibatch_count': 3,
}

# Every other parameter will be set to its default value

current_scenario = Scenario(scenario_params)

2020-08-26 11:23:59.182 | DEBUG    | subtest.scenario:__init__:58 - Dataset selected: mnist
2020-08-26 11:23:59.186 | DEBUG    | subtest.scenario:__init__:93 - Computation use the full dataset for scenario #1
2020-08-26 11:23:59.332 | INFO     | subtest.scenario:__init__:282 - ### Description of data scenario configured:
2020-08-26 11:23:59.333 | INFO     | subtest.scenario:__init__:283 -    Number of partners defined: 3
2020-08-26 11:23:59.334 | INFO     | subtest.scenario:__init__:284 -    Data distribution scenario chosen: random
2020-08-26 11:23:59.337 | INFO     | subtest.scenario:__init__:285 -    Multi-partner learning approach: fedavg
2020-08-26 11:23:59.339 | INFO     | subtest.scenario:__init__:286 -    Weighting option: uniform
2020-08-26 11:23:59.341 | INFO     | subtest.scenario:__init__:287 -    Iterations parameters: 10 epochs > 3 mini-batches > 8 gradient updates per pass
2020-08-26 11:23:59.343 | INFO     | subtest.scenario:__init__:293 - ### Data loaded: mnist
2020-08

# Create Model

In [9]:
def generate_new_model_for_dataset():
    """Build and compile a fresh binary sentiment classifier.

    Architecture: an embedding layer followed by three stacked GRU layers
    (16, 8 and 4 units) and a single sigmoid output unit. The model is
    compiled with binary cross-entropy, the Adam optimizer and the accuracy
    metric. It relies on the notebook-level `num_words` and `max_tokens`.

    Returns:
        A compiled Keras `Sequential` model.
    """
    embedding_size = 8
    # Word ids run from 1 to num_words and 0 is used for padding, so the
    # embedding table needs num_words + 1 rows (input_dim = largest id + 1).
    # With input_dim=num_words the last word's id was out of range.
    vocabulary_size = num_words + 1

    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

    # Intermediate GRUs return the full sequence so the next GRU can consume it;
    # the last one returns only its final state.
    model.add(GRU(units=16, name = "gru_1",return_sequences=True))
    model.add(GRU(units=8, name = "gru_2" ,return_sequences=True))
    model.add(GRU(units=4, name= "gru_3"))
    model.add(Dense(1, activation='sigmoid',name="dense_1"))
    model.compile(loss='binary_crossentropy',
              optimizer="Adam",
              metrics=['accuracy'])
    return model

# Assign dataset to scenario

In [10]:
# Replace the scenario's dataset (the Scenario constructor logged "Dataset selected: mnist")
# with our own Sentiment140 data.
# All arguments are positional, so their order must match the Dataset constructor of
# subtest 0.0.0.8 — NOTE(review): signature not visible here, confirm before reordering.
current_scenario.dataset = Dataset(
    "my_dataset",
    X_train,
    X_test,
    y_train,
    y_test,
    input_shape,
    num_classes,
    preprocess_dataset_labels,  # label preprocessing function (divides labels by 4)
    generate_new_model_for_dataset  # factory returning a freshly compiled model
)

# Split train and validation sets

In [11]:
# Ask the dataset to perform its train/validation split.
# NOTE(review): split ratio and strategy are defined inside subtest, not visible here.
current_scenario.dataset.train_val_split()

# Run scenario

In [12]:
# Run the configured scenario; progress is reported through the log lines below.
# NOTE(review): the cell displays the call's return value (0) — presumably a status code, confirm.
run_scenario(current_scenario)

2020-08-26 11:23:59.507 | INFO     | subtest.scenario:split_data:537 - ### Splitting data among partners:
2020-08-26 11:23:59.611 | INFO     | subtest.scenario:split_data:538 -    Simple split performed.
2020-08-26 11:23:59.614 | INFO     | subtest.scenario:split_data:539 -    Nb of samples split amongst partners: 77760
2020-08-26 11:23:59.615 | INFO     | subtest.scenario:split_data:541 -    Partner #0: 15552 samples with labels [0, 4]
2020-08-26 11:23:59.617 | INFO     | subtest.scenario:split_data:541 -    Partner #1: 38880 samples with labels [0, 4]
2020-08-26 11:23:59.618 | INFO     | subtest.scenario:split_data:541 -    Partner #2: 23328 samples with labels [0, 4]
2020-08-26 11:23:59.900 | DEBUG    | subtest.scenario:compute_batch_sizes:585 -    Compute batch sizes, partner #0: 648
2020-08-26 11:23:59.901 | DEBUG    | subtest.scenario:compute_batch_sizes:585 -    Compute batch sizes, partner #1: 1620
2020-08-26 11:23:59.901 | DEBUG    | subtest.scenario:compute_batch_sizes:585 - 

0

# Results

In [13]:
# Collect the scenario's results as a DataFrame and show the test score column.
df_results = current_scenario.to_dataframe()
print(df_results["mpl_test_score"])

0    0.7673
Name: mpl_test_score, dtype: float64
