Upload the dataset

In [None]:
# Colab-only helper used to bring a local file into the notebook session.
from google.colab import files
# `uploaded` is later indexed by file name and its value wrapped in io.BytesIO,
# i.e. it is used as a mapping of file name -> raw file bytes.
uploaded = files.upload()

Saving training.1600000.processed.noemoticon.csv to training.1600000.processed.noemoticon (1).csv


Install the required packages

In [None]:
!sudo apt -y install libportaudio2
!pip install -q tflite-model-maker-nightly

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libportaudio2 is already the newest version (19.6.0-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Import the required packages

In [None]:
import numpy as np
import pandas as pd
import os
import io
import re

from tflite_model_maker import model_spec
from tflite_model_maker import text_classifier
from tflite_model_maker.config import ExportFormat
from tflite_model_maker.text_classifier import AverageWordVecSpec
from tflite_model_maker.text_classifier import DataLoader

import tensorflow as tf
# Fail fast unless the installed TensorFlow reports a 2.x version string.
assert tf.__version__.startswith('2')
# Only let TensorFlow log messages of level ERROR and above through.
tf.get_logger().setLevel('ERROR')

# Keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Scikit-learn
from sklearn.model_selection import train_test_split

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

Settings

In [None]:
# DATASET
# Column names, in file order; passed as `names=` when the CSV is read.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
# Fraction of rows kept for training; the remainder becomes the test split.
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Matches @mentions, http(s) links and any run of non-alphanumeric characters.
# Written as a raw string so `\S` reaches the regex engine as-is instead of
# being parsed as an (invalid) string escape, which triggers a warning on
# current Python versions. The pattern itself is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

Load dataset

In [None]:
# Wrap the uploaded bytes in an in-memory buffer and parse them as CSV,
# supplying the column names explicitly.
df = pd.read_csv(
    io.BytesIO(uploaded['training.1600000.processed.noemoticon.csv']),
    names=DATASET_COLUMNS,
)

Map the numeric target label to a string
* **0** -> **NEGATIVE**
* **2** -> **NEUTRAL**
* **4** -> **POSITIVE**

In [None]:
# Lookup table from numeric sentiment code to its readable label.
decode_map = {
    0: "NEGATIVE",
    2: "NEUTRAL",
    4: "POSITIVE",
}


def decode_sentiment(label):
    """Return the text label for a numeric sentiment code.

    `label` is converted with int() before the lookup, so numeric strings and
    floats are accepted. A code that is not a key of `decode_map` raises
    KeyError.
    """
    code = int(label)
    return decode_map[code]

In [None]:
%%time
df.target = df.target.apply(lambda x: decode_sentiment(x))

CPU times: user 505 ms, sys: 8 ms, total: 513 ms
Wall time: 511 ms


In [None]:
# Preview the first rows to check that the labels were decoded.
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,NEGATIVE,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,NEGATIVE,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,NEGATIVE,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,NEGATIVE,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,NEGATIVE,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Pre-Process dataset

In [None]:
# Fetch the NLTK stop-word corpus that the stop-word list is built from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Stored as a set so that each `token not in stop_words` check made while
# preprocessing is a hash lookup rather than a linear scan of a list; this
# matters because the check runs once per token of every tweet.
stop_words = set(stopwords.words("english"))
# English stemmer, used by preprocess() only when stem=True.
stemmer = SnowballStemmer("english")

In [None]:
def preprocess(text, stem=False):
    """Clean one piece of text and return it as space-separated tokens.

    The input is converted to str, lower-cased, and every match of
    TEXT_CLEANING_RE is replaced by a space. Tokens found in `stop_words`
    are dropped. When `stem` is true, each remaining token is passed
    through `stemmer.stem`.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [None]:
%%time
df.text = df.text.apply(lambda x: preprocess(x))

CPU times: user 1min 4s, sys: 482 ms, total: 1min 5s
Wall time: 1min 6s


Trim unnecessary columns

In [None]:
# Remove the metadata columns by name.
df = df.drop(columns=['ids', 'date', 'flag', 'user'])

In [None]:
# Preview the first rows after cleaning the text and dropping columns.
df.head()

Unnamed: 0,target,text
0,NEGATIVE,awww bummer shoulda got david carr third day
1,NEGATIVE,upset update facebook texting might cry result...
2,NEGATIVE,dived many times ball managed save 50 rest go ...
3,NEGATIVE,whole body feels itchy like fire
4,NEGATIVE,behaving mad see


Write processed data to file

In [None]:
# Save the processed frame as a UTF-8 CSV at the filesystem root. No `index`
# argument is given, so the row index is written as well.
df.to_csv('/training.1600000.processed.noemoticon.processed.csv', encoding='utf-8')

Split train and test

In [None]:
# Split with a fixed seed so the same rows land in each part on every run;
# the test share is whatever TRAIN_SIZE leaves over.
df_train, df_test = train_test_split(
    df,
    test_size=1 - TRAIN_SIZE,
    random_state=42,
)
print(f"TRAIN size: {len(df_train)}")
print(f"TEST size: {len(df_test)}")

TRAIN size: 1280000
TEST size: 320000


In [None]:
# Write each split to its own UTF-8 CSV; these are the paths the data loaders
# read from when building the training and test sets.
df_train.to_csv('/training.1600000.processed.noemoticon.processed.train.csv', encoding='utf-8')
df_test.to_csv('/training.1600000.processed.noemoticon.processed.test.csv', encoding='utf-8')

Train model

In [None]:
# Look up the 'average_word_vec' model spec by name; the same spec object is
# passed to both data loaders and to the classifier.
awv_spec = model_spec.get('average_word_vec')

In [None]:
# Build the training set from the train CSV written earlier.
train_data = DataLoader.from_csv(
    model_spec=awv_spec,
    is_training=True,
    filename='/training.1600000.processed.noemoticon.processed.train.csv',
    text_column='text',
    label_column='target',
)

# Build the evaluation set from the test CSV, with is_training switched off.
test_data = DataLoader.from_csv(
    model_spec=awv_spec,
    is_training=False,
    filename='/training.1600000.processed.noemoticon.processed.test.csv',
    text_column='text',
    label_column='target',
)

In [None]:
# Train a text classifier on the training set for 10 epochs using the
# average-word-vector spec.
model = text_classifier.create(
    train_data,
    model_spec=awv_spec,
    epochs=10,
)

Epoch 2/2
Epoch 3/3
Epoch 4/4
Epoch 5/5
Epoch 6/6
Epoch 7/7
Epoch 8/8
Epoch 9/9
Epoch 10/10


Evaluate model with test data

In [None]:
# evaluate() yields two values for the test set, unpacked as loss and accuracy.
loss, acc = model.evaluate(test_data)
# Print them explicitly so the final metrics are recorded in the cell output
# instead of only being stored in variables.
print(f"Test loss: {loss:.4f} - test accuracy: {acc:.4f}")



In [None]:
# Call the method: without the parentheses the cell only displayed the
# bound-method repr instead of the model summary.
model.summary()

<bound method CustomModel.summary of <tensorflow_examples.lite.model_maker.core.task.text_classifier.TextClassifier object at 0x7eff61eb1750>>

Export .tflite file

In [None]:
# Export the trained model into the filesystem root directory. No export
# format is passed, so the library default is used (presumably TFLite, per
# the section heading - confirm).
model.export(export_dir='/')

**References**
* Text classification with TensorFlow Lite Model Maker https://www.tensorflow.org/lite/models/modify/model_maker/text_classification
* Twitter Sentiment Analysis https://www.kaggle.com/code/paoloripamonti/twitter-sentiment-analysis/notebook