In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures, pipeline
from transformers import TFDistilBertModel, DistilBertTokenizer, TFAutoModel, AutoTokenizer, RobertaTokenizer, TFRobertaModel, DistilBertTokenizerFast
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from tqdm import tqdm
from helpers import get_sent_embe
from helpers import loadData

### Load datasets
First, we use the loadData function to load the training observations into pandas DataFrames

In [2]:
POS_TRAINING_PATH = 'twitter-datasets/train_pos.txt'
NEG_TRAINING_PATH = 'twitter-datasets/train_neg.txt'
POS_TRAINING_PATH_FULL = 'twitter-datasets/train_pos_full.txt'
NEG_TRAINING_PATH_FULL = 'twitter-datasets/train_neg_full.txt'

TEST_PATH = 'twitter-datasets/test_data.txt'
load_features_distilBERT = False
use_distilBERT = True
load_features_BERTweet = False
use_BERTweet = False
load_features_RoBERTa = False
use_RoBERTa = False

# Load both small and large training sets
# df_train_small = loadData(POS_TRAINING_PATH, NEG_TRAINING_PATH)
df_full = loadData(POS_TRAINING_PATH_FULL, NEG_TRAINING_PATH_FULL)
# df_full.head()

Since for the test data we need to save the indexing separately and we shoud not dropping possible duplicates, we do not use the loadData function.

In [3]:
# Load the test set.
test_sent = []
idx = []
with open(TEST_PATH) as test:
    for line in test:
        split = line.split(",", 1)
        idx.append(int(split[0]))
        test_sent.append(split[1])

data = {'index':idx,'tweet':test_sent}
df_test = pd.DataFrame(data)
df_test.head()

Unnamed: 0,index,tweet
0,1,sea doo pro sea scooter ( sports with the port...
1,2,<user> shucks well i work all week so now i ca...
2,3,i cant stay away from bug thats my baby\n
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...
4,5,"whenever i fall asleep watching the tv , i alw..."


Check if GPU is available, in which case we will use this to improve the runnning time

In [4]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [1]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

print(get_available_gpus())

['/device:GPU:0']


In [4]:
tf.test.is_built_with_cuda()

True

### DistilBERT

In [4]:
if use_distilBERT==True:
    if load_features_distilBERT==True:
        # initialize DistilBERT tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

        # obtain sentence representations
        features_np_full = get_sent_embe(df_full, tokenizer, model)
        features_test_np = get_sent_embe(df_test, tokenizer, model)

        # transform features into Pandas DataFrame
        distilBERT_pd_full = pd.DataFrame(data=features_np_full)
        distilBERT_pd_test = pd.DataFrame(data=features_test_np)

        # Save the features as a pickle file to prevent running the feature extraction every time
        distilBERT_pd_test.to_pickle("./DistilBERT_features_test.pkl", compression='zip')
        distilBERT_pd_full.to_pickle("./DistilBERT_features_full.pkl", compression='zip')
    else:
        # Load the pickle file with the sentence representations
        features_np_full = pd.read_pickle("./DistilBERT_features_full.pkl", compression='zip').to_numpy()
        # features_np_small = pd.read_pickle("./DistilBERT_features.pkl", compression='zip').to_numpy()
        # features_test_np = pd.read_pickle("./DistilBERT_features_test.pkl", compression='zip').to_numpy()

### RoBERTa pretrained features

In [5]:
if use_RoBERTa==True:
    if load_features_RoBERTa==True:
        # initialize DistilBERT tokenizer and model
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaModel.from_pretrained('roberta-base')

        # obtain sentence representations
        features_np_full = get_sent_embe(df_full, tokenizer, model)
        features_test_np = get_sent_embe(df_test, tokenizer, model)

        # transform features into Pandas DataFrame
        RoBERTa_pd_full = pd.DataFrame(data=features_np_full)
        RoBERTa_pd_test = pd.DataFrame(data=features_test_np)

        # Save the features as a pickle file to prevent running the feature extraction every time
        RoBERTa_pd_test.to_pickle("./RoBERTa_features_test.pkl", compression='zip')
        RoBERTa_pd_full.to_pickle("./RoBERTa_features_full.pkl", compression='zip')
    else:
        # Load the pickle file with the sentence representations
        features_np_full = pd.read_pickle("./RoBERTa_features_full.pkl", compression='zip').to_numpy()
        # features_np_small = pd.read_pickle("./DistilBERT_features.pkl", compression='zip').to_numpy()
        features_test_np = pd.read_pickle("./RoBERTa_features_test.pkl", compression='zip').to_numpy()

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


(2270482, 137)
iteration:  0


ResourceExhaustedError: Exception encountered when calling layer "intermediate" (type TFRobertaIntermediate).

failed to allocate memory [Op:RealDiv]

Call arguments received:
  • hidden_states=tf.Tensor(shape=(200, 137, 768), dtype=float32)

In [24]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
features_test_np = get_sent_embe(df_test, tokenizer, model)

# Save the features as a pickle file to prevent running the feature extraction every time
distillBERT_test_pd = pd.DataFrame(data=features_test_np)
distillBERT_test_pd.to_pickle("./DistilBERT_test_features.pkl", compression='zip')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


(10000, 65)
iteration:  0
iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
iteration:  6
iteration:  7
iteration:  8
iteration:  9
iteration:  10
iteration:  11
iteration:  12
iteration:  13
iteration:  14
iteration:  15
iteration:  16
iteration:  17
iteration:  18
iteration:  19
iteration:  20
iteration:  21
iteration:  22
iteration:  23
iteration:  24
iteration:  25
iteration:  26
iteration:  27
iteration:  28
iteration:  29
iteration:  30
iteration:  31
iteration:  32
iteration:  33
iteration:  34
iteration:  35
iteration:  36
iteration:  37
iteration:  38
iteration:  39
iteration:  40
iteration:  41
iteration:  42
iteration:  43
iteration:  44
iteration:  45
iteration:  46
iteration:  47
iteration:  48
iteration:  49
iteration:  50
iteration:  51
iteration:  52
iteration:  53
iteration:  54
iteration:  55
iteration:  56
iteration:  57
iteration:  58
iteration:  59
iteration:  60
iteration:  61
iteration:  62
iteration:  63
iteration:  64
iteration:  65
iteratio

In [63]:
labels = df_full['sentiment']

In [73]:
features_np_full.shape

(2270482, 768)

In [7]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features_np_small, df_train_small['sentiment'], train_size=0.8, random_state=123)

In [8]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(train_features, train_labels)
clf.score(test_features, test_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8097063284158279

In [65]:
y_pred = clf.predict(features_test_np)

In [77]:
idx = np.asarray(df_test.index.values.tolist()) +1
idx

array([    1,     2,     3, ...,  9998,  9999, 10000])

In [18]:
import csv
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in .csv format for submission to Kaggle or AIcrowd
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [96]:
y_pred[y_pred==0] = -1

In [97]:
y_pred

array([-1, -1, -1, ..., -1,  1, -1], dtype=int64)

In [98]:
OUTPUT_PATH = 'LogisitcRegression_DistilBERTemb_full_MLP_3'
create_csv_submission(idx, y_pred, OUTPUT_PATH)

In [74]:
from sklearn.ensemble import HistGradientBoostingClassifier
gb_clf = HistGradientBoostingClassifier(scoring='accuracy', max_depth=50, loss='binary_crossentropy', learning_rate=1, max_iter=100, l2_regularization=0.1).fit(train_features, train_labels)
gb_clf.score(test_features, test_labels)

0.8023990468996712

In [25]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, learning_rate=0.05, n_estimators=200)
xgb_model.fit(train_features, train_labels)

y_pred = xgb_model.predict(test_features)

print(accuracy_score(test_labels, y_pred))



0.7975458430994071


In [9]:
from sklearn.neural_network import MLPClassifier
clf_mlp = MLPClassifier(random_state=123, max_iter=300, solver='adam', alpha=0.0001, learning_rate='adaptive', early_stopping=True).fit(train_features, train_labels)
clf_mlp.score(test_features, test_labels)

0.8236867503102164

In [26]:
from sklearn.neural_network import MLPClassifier
mlp_gs = MLPClassifier(max_iter=500)
parameter_space = {
    'hidden_layer_sizes': [(256,128,32), (256,32), (128, 32), (32,)],
    'activation': ['logistic', 'relu'],
    #'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.05, 0.01],
    'learning_rate': ['constant','adaptive'],
    'early_stopping': [True, False],
}
from sklearn.model_selection import RandomizedSearchCV
clf = RandomizedSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5, scoring='f1')
clf.fit(features_np_small, df_train_small['sentiment'])

30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\lucaz\pycharmprojects\pythonproject1\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\lucaz\pycharmprojects\pythonproject1\venv\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 752, in fit
    return self._fit(X, y, incremental=False)
  File "c:\users\lucaz\pycharmprojects\pythonproject1\venv\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 384, in _fit
    self._validate_hyperparameters()
  File "c:\users

RandomizedSearchCV(cv=5, estimator=MLPClassifier(max_iter=500), n_jobs=-1,
                   param_distributions={'activation': ['sigmoid', 'relu'],
                                        'alpha': [0.0001, 0.001, 0.05, 0.01],
                                        'early_stopping': [True, False],
                                        'hidden_layer_sizes': [(256, 128, 32),
                                                               (256, 32),
                                                               (128, 32),
                                                               (32,)],
                                        'learning_rate': ['constant',
                                                          'adaptive']},
                   scoring='f1')

In [95]:
y_pred = clf_mlp.predict(features_test_np)

In [15]:
clf_mlp = MLPClassifier(activation='relu', hidden_layer_sizes= (64), random_state=123, max_iter=300, solver='adam', alpha=0.001, learning_rate_init= 0.005, beta_1= 0.9, beta_2= 0.999, early_stopping=True).fit(features_np_full, df_full['sentiment'])

y_pred = clf_mlp.predict(features_test_np)

In [17]:
y_pred[y_pred==0] = -1
y_pred

array([-1, -1, -1, ..., -1,  1, -1], dtype=int64)

In [19]:
idx = df_test['index']

In [20]:
OUTPUT_PATH = 'MLP_DistilBERTemb_full'
create_csv_submission(idx, y_pred, OUTPUT_PATH)

In [88]:
import lightgbm as lgbm
from sklearn.metrics import accuracy_score
lgbm_model = lgbm.LGBMClassifier(objective="binary", random_state=42, learning_rate=0.05, n_estimators=200)
lgbm_model.fit(train_features, train_labels)

y_pred = lgbm_model.predict(test_features)

print(accuracy_score(test_labels, y_pred))

0.7916722638555198
