# Pretrained NNLM

Adapted from https://www.tensorflow.org/hub/tutorials/tf2_text_classification

In [None]:
# Imports 
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt

In [None]:
# Required Installations
# NOTE(review): this cell comes after the import cell, so on a fresh kernel the
# imports fail first when these packages are missing. Consider moving it to the
# top of the notebook and pinning versions.
!pip install tensorflow_hub
!pip install tensorflow_datasets

In [None]:
import pandas as pd

# Directory holding the raw training / test files (relative to the notebook)
data_path = '../data/'
# Load training set

# Positive tweets: read into a one-column frame ('tweet') and labelled +1.
# NOTE(review): sep='.\n' is interpreted as a regex by the python engine; presumably
# it was chosen so that no separator matches inside a line and every line stays a
# single field — confirm.
pos = pd.read_table(data_path + "train_pos_full.txt", sep='.\n', names=['tweet'], engine='python')
pos['label']=1
# Sanity check: prints True only when exactly 1,250,000 rows were read
print(f"Loaded POS data, correctly interpreted 1-tweet-per-line fashion : {pos.shape[0]==1_250_000}")
# Negative tweets: same layout, labelled -1
neg = pd.read_table(data_path + "train_neg_full.txt", sep='.\n', names=['tweet'], engine='python')
neg['label']=-1
print(f"Loaded NEG data, correctly interpreted 1-tweet-per-line fashion : {neg.shape[0]==1_250_000}")
print(f"Data sizes : (POS) {pos.shape[0]} (NEG) {neg.shape[0]}\n")

In [None]:
# Stack both classes and shuffle once with a fixed seed.
# An outer merge on the shared columns is a join (pandas sorts its keys), so a
# validation slice taken from the head of the frame would not be a random sample.
tweets = (
    pd.concat([pos, neg], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Show a preview only; the full frame has millions of rows
tweets.head()

In [None]:
# Raw tweet strings; the hub embedding layer consumes tf.string inputs directly
tweets_tensor = tf.constant(tweets['tweet'].values)
# The frame stores labels as -1 / +1, but BinaryCrossentropy and BinaryAccuracy
# compare against targets in {0, 1}: with -1 targets the loss is ill-defined and a
# negative example can never be counted as correct. Encode -1 -> 0 and +1 -> 1.
labels_tensor = tf.constant((tweets['label'].values > 0).astype('int64'))
dataset = (tweets_tensor, labels_tensor)
dataset

In [None]:
# The whole labelled set is used as training data (a validation slice is cut later)
train_data=dataset
# Convert the (tweets, labels) tensor pair into NumPy arrays
train_examples, train_labels = tfds.as_numpy(train_data)

In [None]:
## Build the model
# Alternative embedding (kept for reference):
#model = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
# Largest model
model = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"

# Embedding layer loaded from TF Hub: takes scalar strings, emits 128-d vectors,
# and is left trainable so it is fine-tuned together with the classifier head.
hub_layer = hub.KerasLayer(
    model,
    dtype=tf.string,
    input_shape=[],
    output_shape=[128],
    trainable=True,
)
 

In [None]:
# Classifier: hub embedding -> 32-unit ReLU layer -> single output logit
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1),
])

model.summary()

# The last layer has no activation, so the loss works on logits and the
# accuracy metric thresholds the raw output at 0.0.
model.compile(
    optimizer='adam',
    loss=tf.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)


In [None]:
# Hold out the first 10,000 examples for validation and train on the remainder
x_val = train_examples[:10000]
partial_x_train = train_examples[10000:]

y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]

# Train for 4 epochs, evaluating on the held-out slice after each one.
# NOTE(review): batch_size=500000 is unusually large — confirm it fits in memory
# and is the intended setting.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=4,
                    batch_size=500000,
                    validation_data=(x_val, y_val),
                    verbose=1)

In [None]:
# Per-epoch values recorded by `model.fit`; the keys list what was tracked
history_dict = history.history
history_dict.keys()

## Test the model

# To format the testing data
def extract_tweet(tweet):
    """Return everything after the first comma of `tweet`.

    Only the first comma is used as the split point, so commas inside the
    remaining text are preserved. Raises IndexError when there is no comma.
    """
    pieces = tweet.split(",", 1)
    return pieces[1]

# Load the testing data
# Read the file line by line. `pd.read_fwf` is a fixed-width parser that infers
# column boundaries from the first rows only, so later, longer lines can be
# truncated or split into several columns.
with open(data_path + "test_data.txt", encoding="utf-8") as f:
    test_lines = [line.rstrip("\r\n") for line in f if line.strip()]

# Keep only the text after the leading "<prefix>," of each line
test = pd.DataFrame({'tweet': [extract_tweet(line) for line in test_lines]})
# Format asked by AI Crowd: ids start at 1; sized from the data, not hard-coded
test.index = pd.RangeIndex(start=1, stop=len(test) + 1, step=1)
test

# Raw tweet strings, fed to the model as-is
test_tensor = test['tweet'].values

def log_odd_convert(x):
    """Turn a model output into a class label: -1 when negative, 1 otherwise."""
    if x < 0:
        return -1
    return 1

# One label per prediction row returned by the model
results = [log_odd_convert(logit) for logit in model.predict(test_tensor)]
test['label'] = results

## Export the result

# Build the submission frame without rebinding `test`, so this cell can be
# re-run safely (dropping 'tweet' from `test` itself fails on a second run).
submission = (
    test.drop(columns='tweet')
    .rename(columns={'label': 'Prediction'})
    .rename_axis('Id')
)
submission

# Give pandas the path so it opens the file itself with correct newline handling;
# a text handle opened without newline='' can produce blank rows on Windows.
submission.to_csv(data_path + "submission.csv")

# BERT 

In [None]:
# ktrain provides the `text` helpers and learner/predictor wrappers used in the BERT section below
!pip install ktrain

In [None]:
# Imports
import ktrain
from ktrain import text

## Data Preprocessing

In [None]:
## Data Preprocessing

In [None]:
# Lookup table: whitespace-delimited token -> plain-English expansion.
# Keys are lower-case; lookups are exact matches on a whole token.
abbr = {
    # SMS abbreviations
    'omg': 'oh my god',
    'afk': 'away from keyboard',
    'bf': 'boyfriend',
    'bff': 'best friend forever',
    'lol': 'laughing out loud',
    'irl': 'in real life',  # was wrongly expanded to 'in my opinion'
    'gf': 'girlfriend',
    'idk': "i don't know",
    'fyi': 'for your information',
    'asap': 'as soon as possible',
    # the table used to list 'yolo' twice; this is the value that took effect
    'yolo': 'you only live once',
    'smh': 'shaking my head',
    'btw': 'by the way',
    'otw': 'on the way',
    'msg': 'message',
    'ppl': 'people',
    'np': 'no problem',
    'imy': 'i miss you',
    'jk': 'just kidding',
    'idc': "i don't care",
    'gg': 'good game',
    'thx': 'thanks',
    'lmao': 'laughing my ass off',
    'ily': 'i love you',
    'rofl': 'rolling on floor laughing',
    'stfu': 'shut the fuck up',
    'y': 'you',
    'wtf': 'what the fuck',
    'wth': 'what the hell',
    # smileys
    ':|': "i'm indecisive",
    ':[': "i'm sad",
    ':@': "i'm angry",
    ':{': "i'm sad",
    'xd': "i'm laughing",
    ':/': "i'm skeptical",
    ':p': "i'm cheeky",
    ':d': "i'm smiling",
    ':$': "i'm embarrassed",
    ":')": "i'm joyful",
    '=)': "i'm smiling",
    # NOTE(review): 'd:' is often read as a dismayed face — confirm this expansion
    'd:': "i'm smiling",
    'xx': 'two kisses',
    'xxx': 'hugs and kisses',
    'xoxo': 'hugs and kisses',
    ':o': "i'm surprised",
    '<3': 'love',
}

def formalize(tweet):
    """Expand known abbreviations and smileys in `tweet`.

    The text is split on whitespace, every token found in `abbr` is replaced by
    its expansion, and the tokens are re-joined with single spaces (so runs of
    whitespace collapse and leading/trailing whitespace is dropped).
    """
    return ' '.join(abbr.get(token, token) for token in tweet.split())

In [None]:
from sklearn.utils import resample

# Define Preprocessors

def clean_HTML_tags(series) :
    """Strip tag-like tokens such as ``<user>`` or ``</b>`` from a string Series.

    A token is removed when it is ``<``, any number of ``/``, one or more ASCII
    letters, then ``>``. Surrounding whitespace is left untouched.
    """
    # Raw string: '\/' in a normal string literal is an invalid escape sequence
    return series.str.replace(r'</*[a-zA-Z]+>', '', regex=True)

def pretrain_process(pos_, neg_, random_state=None):
    """Deduplicate, balance and clean both classes, then stack them in one frame.

    Parameters
    ----------
    pos_, neg_ : DataFrames with a 'tweet' column; neither input is modified.
    random_state : optional seed forwarded to `resample` so the down-sampling
        can be reproduced. The default (None) keeps it unseeded.

    Returns
    -------
    DataFrame with the processed rows of `pos_` followed by those of `neg_`,
    on a fresh 0..n-1 index.
    """
    # Drop duplicates
    pos = pos_.drop_duplicates()
    neg = neg_.drop_duplicates()
    # Balance classes after having deleted duplicates
    min_size = min(pos.shape[0], neg.shape[0])

    processed = []
    for frame in (pos, neg):
        frame = resample(frame, n_samples=min_size, replace=False,
                         random_state=random_state)
        # Strip tag-like tokens first, then expand abbreviations / smileys
        frame = frame.assign(tweet=clean_HTML_tags(frame['tweet']).map(formalize))
        processed.append(frame)

    # Plain concatenation: an outer merge on the shared columns is a join that
    # matches and sorts the string keys only to produce the same set of rows.
    return pd.concat(processed, ignore_index=True)

def preprocessing_test(test_):
    """Return a cleaned copy of `test_`.

    The 'tweet' column of the copy has tag-like tokens removed and known
    abbreviations expanded; the input frame itself is not modified.
    """
    cleaned = test_.copy()
    without_tags = clean_HTML_tags(cleaned['tweet']).values
    cleaned['tweet'] = [formalize(tweet) for tweet in without_tags]
    return cleaned

# Build the cleaned, class-balanced training frame from the raw pos / neg frames
tweets = pretrain_process(pos, neg)

## Tuning learning rates

In [None]:
# Work on a random 10,000-tweet subsample, drawn without replacement
tweets_ = resample(tweets, n_samples=10_000, replace=False)

# Convert the frame into BERT-ready train / validation arrays plus the fitted preprocessor.
# NOTE(review): no val_df is passed here, so the validation split is presumably
# made by ktrain itself — confirm.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(tweets_,
                                                                   text_column='tweet',
                                                                   label_columns=['label'],
                                                                   preprocess_mode='bert', 
                                                                   lang='en',
                                                                   verbose=True
                                                                   )

# BERT classifier and a learner wrapping it (batches of 12 samples)
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=12)

# Uncomment to tune the learning rates
# learner.lr_find(max_epochs=5, show_plot=True)

## Training

####  Training report of the best prediction
1st cycle : 8e-4, 10,000 samples, accuracy of  0.5542  
2nd cycle : 2e-5, 30,000 samples, accuracy of  0.8147  
3rd cycle : 2e-5, 30,000 samples, accuracy of  0.8512  
4th cycle : 2e-5, 30,000 samples, accuracy of  0.8554    
5th cycle : 1e-5, 30,000 samples, accuracy of  0.8588     
6th cycle : 1e-5, 30,000 samples, accuracy of  0.8702    
7th cycle : 1e-5, 30,000 samples, accuracy of  0.8713     
8th cycle : 5e-6, 30,000 samples, accuracy of  0.8695    
9th cycle : 5e-6, 30,000 samples, accuracy of  0.8707    
10th cycle: 5e-7, 30,000 samples, accuracy of  0.8735    

In [None]:
# Initialize the model 
initial_n_samples = 10_000

# Random subsample, then a 99% / 1% train / validation split
tweets_ = resample(tweets, n_samples=initial_n_samples, replace=False)
train_n_samples = int(initial_n_samples*(1-0.01))
train_df = tweets_[:train_n_samples]
test_df = tweets_[train_n_samples:]

# Pass `train_df` (not the full sample) as training data: `tweets_` also contains
# the rows of `test_df`, so using it would leak the validation set into training.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(train_df,
                                                                   text_column='tweet',
                                                                   label_columns=['label'],
                                                                   val_df=test_df,
                                                                   preprocess_mode='bert',
                                                                   lang='en',
                                                                   verbose=True
                                                                   )
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

In [None]:
def run_one_cycle(model, tweets, n_samples, lr, batch_size, n_epochs=1):
    '''
    Fits the model with the given parameters using a cyclical learning rate

    model      : classifier to keep training (it is updated in place by the fit)
    tweets     : DataFrame with 'tweet' and 'label' columns to sample from
    n_samples  : number of rows drawn (without replacement) for this cycle;
                 99% are used for training and the remaining 1% for validation
    lr         : learning rate passed to fit_onecycle
    batch_size : batch size given to the learner
    n_epochs   : number of epochs for this cycle

    Returns the ktrain learner used for the fit.
    '''
    tweets_ = resample(tweets, n_samples=n_samples, replace=False)

    train_n_samples = int(n_samples*(1-0.01)) # nb of training samples
    train_df = tweets_[:train_n_samples]
    test_df = tweets_[train_n_samples:] # Validation set

    # Train on `train_df` only: `tweets_` also contains the validation rows, so
    # passing it as training data would leak the validation set.
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(train_df,
                                                                   text_column='tweet',
                                                                   label_columns=['label'],
                                                                   val_df=test_df,
                                                                   preprocess_mode='bert',
                                                                   lang='en',
                                                                   verbose=True
                                                                   )
    # Give the learner the validation arrays so each epoch reports held-out metrics
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batch_size)
    learner.fit_onecycle(lr, n_epochs)
    return learner

In [None]:
# Running this cell multiple times will train the model 
# Arguments: model, data, n_samples=10, lr=2e-5, batch_size=6.
# NOTE(review): n_samples is 10 here while the training report above lists
# 10,000-30,000 samples per cycle — confirm the intended value.

learner = run_one_cycle(model, tweets, 10, 2e-5, 6) #default lr is 2e-5

## Make the predictions

In [None]:
# To format the testing data
def extract_tweet(tweet):
    """Return everything after the first comma of `tweet`."""
    return tweet.split(",", 1)[1]

# Load the testing data
# Read the file line by line. `pd.read_fwf` is a fixed-width parser that infers
# column boundaries from the first rows only, so later, longer lines can be
# truncated or split into several columns.
with open(data_path + "test_data.txt", encoding="utf-8") as f:
    test_lines = [line.rstrip("\r\n") for line in f if line.strip()]
test = pd.DataFrame({'tweet': [extract_tweet(line) for line in test_lines]})
# Formatting for AI Crowd: ids start at 1; sized from the data, not hard-coded
test.index = pd.RangeIndex(start=1, stop=len(test) + 1, step=1)
# Apply the same cleaning as for the training tweets
test = preprocessing_test(test)
test_values = test['tweet'].values
# Make the predictions
# `p` was never defined anywhere in the notebook: build the predictor from the
# trained learner and the preprocessor fitted when the model was initialized.
p = ktrain.get_predictor(learner.model, preproc)
test['label'] = p.predict(test_values)
test

In [None]:
## Export the result
# Build the submission frame without rebinding `test`, so this cell can be
# re-run safely (dropping 'tweet' from `test` itself fails on a second run).
submission = (
    test.drop(columns='tweet')
    .rename(columns={'label': 'Prediction'})
    .rename_axis('Id')
)

# Give pandas the path so it opens the file itself with correct newline handling;
# a text handle opened without newline='' can produce blank rows on Windows.
submission.to_csv(data_path + "submission.csv")

# Best accuracy

In [None]:
'''
Running this cell will reproduce the submission file with the best predictions


Note : batch_size may need to be decreased according to the setup
Computation may take many hours
'''

# Imports
from sklearn.utils import resample
import pandas as pd
import ktrain
from ktrain import text

# Directory holding the raw training / test files (relative to the notebook)
data_path = '../data/'

# Load training set
# NOTE(review): sep='.\n' is interpreted as a regex by the python engine; presumably
# it was chosen so that every line stays a single 'tweet' field — confirm.
pos = pd.read_table(data_path + "train_pos_full.txt", sep='.\n', names=['tweet'], engine='python')
pos['label']=1
# Sanity check: prints True only when exactly 1,250,000 rows were read
print(f"Loaded POS data, correctly interpreted 1-tweet-per-line fashion : {pos.shape[0]==1_250_000}")
neg = pd.read_table(data_path + "train_neg_full.txt", sep='.\n', names=['tweet'], engine='python')
neg['label']=-1
# Combined raw frame; it is replaced further down by the output of pretrain_process
tweets = pos.merge(neg, how='outer')


# Helper functions
# Lookup table: whitespace-delimited token -> plain-English expansion.
# Keys are lower-case; lookups are exact matches on a whole token.
abbr = {
    # SMS abbreviations
    'omg': 'oh my god',
    'afk': 'away from keyboard',
    'bf': 'boyfriend',
    'bff': 'best friend forever',
    'lol': 'laughing out loud',
    'irl': 'in real life',  # was wrongly expanded to 'in my opinion'
    'gf': 'girlfriend',
    'idk': "i don't know",
    'fyi': 'for your information',
    'asap': 'as soon as possible',
    # the table used to list 'yolo' twice; this is the value that took effect
    'yolo': 'you only live once',
    'smh': 'shaking my head',
    'btw': 'by the way',
    'otw': 'on the way',
    'msg': 'message',
    'ppl': 'people',
    'np': 'no problem',
    'imy': 'i miss you',
    'jk': 'just kidding',
    'idc': "i don't care",
    'gg': 'good game',
    'thx': 'thanks',
    'lmao': 'laughing my ass off',
    'ily': 'i love you',
    'rofl': 'rolling on floor laughing',
    'stfu': 'shut the fuck up',
    'y': 'you',
    'wtf': 'what the fuck',
    'wth': 'what the hell',
    # smileys
    ':|': "i'm indecisive",
    ':[': "i'm sad",
    ':@': "i'm angry",
    ':{': "i'm sad",
    'xd': "i'm laughing",
    ':/': "i'm skeptical",
    ':p': "i'm cheeky",
    ':d': "i'm smiling",
    ':$': "i'm embarrassed",
    ":')": "i'm joyful",
    '=)': "i'm smiling",
    # NOTE(review): 'd:' is often read as a dismayed face — confirm this expansion
    'd:': "i'm smiling",
    'xx': 'two kisses',
    'xxx': 'hugs and kisses',
    'xoxo': 'hugs and kisses',
    ':o': "i'm surprised",
    '<3': 'love',
}

def formalize(tweet):
    """Expand known abbreviations and smileys in `tweet`.

    The text is split on whitespace, every token found in `abbr` is replaced by
    its expansion, and the tokens are re-joined with single spaces (so runs of
    whitespace collapse and leading/trailing whitespace is dropped).
    """
    return ' '.join(abbr.get(token, token) for token in tweet.split())

# Define Preprocessors

def clean_HTML_tags(series) :
    """Strip tag-like tokens such as ``<user>`` or ``</b>`` from a string Series.

    A token is removed when it is ``<``, any number of ``/``, one or more ASCII
    letters, then ``>``. Surrounding whitespace is left untouched.
    """
    # Raw string: '\/' in a normal string literal is an invalid escape sequence
    return series.str.replace(r'</*[a-zA-Z]+>', '', regex=True)

def pretrain_process(pos_, neg_, random_state=None):
    """Deduplicate, balance and clean both classes, then stack them in one frame.

    Parameters
    ----------
    pos_, neg_ : DataFrames with a 'tweet' column; neither input is modified.
    random_state : optional seed forwarded to `resample` so the down-sampling
        can be reproduced. The default (None) keeps it unseeded.

    Returns
    -------
    DataFrame with the processed rows of `pos_` followed by those of `neg_`,
    on a fresh 0..n-1 index.
    """
    # Drop duplicates
    pos = pos_.drop_duplicates()
    neg = neg_.drop_duplicates()
    # Balance classes after having deleted duplicates
    min_size = min(pos.shape[0], neg.shape[0])

    processed = []
    for frame in (pos, neg):
        frame = resample(frame, n_samples=min_size, replace=False,
                         random_state=random_state)
        # Strip tag-like tokens first, then expand abbreviations / smileys
        frame = frame.assign(tweet=clean_HTML_tags(frame['tweet']).map(formalize))
        processed.append(frame)

    # Plain concatenation: an outer merge on the shared columns is a join that
    # matches and sorts the string keys only to produce the same set of rows.
    return pd.concat(processed, ignore_index=True)

def preprocessing_test(test_):
    """Return a cleaned copy of `test_`.

    The 'tweet' column of the copy has tag-like tokens removed and known
    abbreviations expanded; the input frame itself is not modified.
    """
    cleaned = test_.copy()
    without_tags = clean_HTML_tags(cleaned['tweet']).values
    cleaned['tweet'] = [formalize(tweet) for tweet in without_tags]
    return cleaned

def run_one_cycle(model, tweets, n_samples, lr, batch_size, n_epochs=1):
    '''
    Fits the model with the given parameters using a cyclical learning rate

    model      : classifier to keep training (it is updated in place by the fit)
    tweets     : DataFrame with 'tweet' and 'label' columns to sample from
    n_samples  : number of rows drawn (without replacement) for this cycle;
                 99% are used for training and the remaining 1% for validation
    lr         : learning rate passed to fit_onecycle
    batch_size : batch size given to the learner
    n_epochs   : number of epochs for this cycle

    Returns the ktrain learner used for the fit.
    '''
    tweets_ = resample(tweets, n_samples=n_samples, replace=False)

    train_n_samples = int(n_samples*(1-0.01)) # nb of training samples
    train_df = tweets_[:train_n_samples]
    test_df = tweets_[train_n_samples:] # Validation set

    # Train on `train_df` only: `tweets_` also contains the validation rows, so
    # passing it as training data would leak the validation set.
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(train_df,
                                                                   text_column='tweet',
                                                                   label_columns=['label'],
                                                                   val_df=test_df,
                                                                   preprocess_mode='bert',
                                                                   lang='en',
                                                                   verbose=True
                                                                   )
    # Give the learner the validation arrays so each epoch reports held-out metrics
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batch_size)
    learner.fit_onecycle(lr, n_epochs)
    return learner

# Cleaned, class-balanced training frame
tweets = pretrain_process(pos, neg)

# Parameters for the best accuracy
# One entry per training cycle; both lists must have the same length
learning_rates=[8e-4,2e-5,2e-5,2e-5,1e-5,1e-5,1e-5, 5e-6, 5e-6, 5e-7]
nb_samples=[10_000,30_000,30_000,30_000,30_000,30_000,30_000,30_000,30_000,30_000]


# Initialize the model 
initial_n_samples = 10_000
batch_size=6

# Random subsample, then a 99% / 1% train / validation split
tweets_ = resample(tweets, n_samples=initial_n_samples, replace=False)
train_n_samples = int(initial_n_samples*(1-0.01))
train_df = tweets_[:train_n_samples]
test_df = tweets_[train_n_samples:]

# Pass `train_df` (not the full sample) as training data: `tweets_` also contains
# the rows of `test_df`, so using it would leak the validation set into training.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(train_df,
                                                                   text_column='tweet',
                                                                   label_columns=['label'],
                                                                   val_df=test_df,
                                                                   preprocess_mode='bert',
                                                                   lang='en',
                                                                   verbose=True
                                                                   )
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

# Run every cycle of the schedule; iterating the lists directly keeps the loop
# in step with them instead of relying on a hard-coded count of 10.
for cycle_n_samples, cycle_lr in zip(nb_samples, learning_rates):
    learner = run_one_cycle(model, tweets, cycle_n_samples, cycle_lr, batch_size)
    
# To format the testing data
def extract_tweet(tweet):
    """Return everything after the first comma of `tweet`."""
    return tweet.split(",", 1)[1]

# Load the testing data
# Read the file line by line. `pd.read_fwf` is a fixed-width parser that infers
# column boundaries from the first rows only, so later, longer lines can be
# truncated or split into several columns.
with open(data_path + "test_data.txt", encoding="utf-8") as f:
    test_lines = [line.rstrip("\r\n") for line in f if line.strip()]
test = pd.DataFrame({'tweet': [extract_tweet(line) for line in test_lines]})

# Formatting for AI Crowd: ids start at 1; sized from the data, not hard-coded
test.index = pd.RangeIndex(start=1, stop=len(test) + 1, step=1)
# Apply the same cleaning as for the training tweets
test = preprocessing_test(test)
test_values = test['tweet'].values
# Make the predictions
# `p` was never defined anywhere in this script: build the predictor from the
# trained learner and the preprocessor fitted when the model was initialized.
p = ktrain.get_predictor(learner.model, preproc)
test['label'] = p.predict(test_values)
## Export the result
submission = (
    test.drop(columns='tweet')
    .rename(columns={'label': 'Prediction'})
    .rename_axis('Id')
)

# Give pandas the path so it opens the file itself with correct newline handling;
# a text handle opened without newline='' can produce blank rows on Windows.
submission.to_csv(data_path + "submission.csv")