## 1. Set-up

In [None]:
!pip install -U simpletransformers sentence-transformers transformers -q
!pip install emoji unidecode pattern -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m245.8/250.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.2 MB/s[

Copy datasets from Google Drive To Local VM to Avoid Disconnection

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!rm -r /content/data
!cp -r /content/drive/MyDrive/nlp/zindi/data /content

rm: cannot remove '/content/data': No such file or directory


Load Python Libraries

In [None]:
# data manipulation
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import *
import sklearn

# natural language processing - simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# natrual language processing - transformers
import transformers
from transformers import AutoTokenizer, AutoModel, AdamW, AutoConfig, get_linear_schedule_with_warmup

# natural language processing -sentence transformers
from sentence_transformers import SentenceTransformer, util

# natural language processing - other libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.python.keras.callbacks import *

# text wragling
import re, string
from bs4 import BeautifulSoup
import emoji

# other libraries
import os, gc, tqdm, datetime, random

# customize printing
import warnings
warnings.simplefilter('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('max_colwidth', None)

Environment Variables

In [None]:
seed = 1024

TRAIN_DATA_PATH = '/content/data/Train.csv'
TEST_DATA_PATH = '/content/data/Test.csv'
SAMPLE_SUB_PATH = '/content/data/SampleSubmission.csv'

TIMESTAMP = str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
print(TIMESTAMP)

20230413-073719


In [None]:
# create a folder under local vm to save model results
os.makedirs('/content/models/', exist_ok=True)

Define Utility Functions
- [x] post-processing on predictions
- [x] save output to .csv

In [None]:
def postprocess_classification(preds):
  '''
    This function helps us go from a classifiaction
    problem to a regression one.
    The regression values range are in [-1, 1].
  '''
  final_preds = []
  for pred in preds:
    argmax = np.argmax(pred, axis=0)
    if argmax == 0: final_preds.append( -1*pred[0] )
    elif argmax == 1: final_preds.append( 0 )
    else: final_preds.append( pred[2] )
    
  return final_preds

In [None]:
def postprocess_regression(preds):
  '''
    This function clips the regression model output between range [-1, 1].
  '''
  preds_clipped = np.clip(preds, -1, 1)   
  return preds_clipped

In [None]:
def rmse(true, pred):
  '''This function calculates root mean squared error'''
  return np.sqrt(mean_squared_error(true, pred))

In [None]:
def prep_submission(df_pred, path_vm, path_gdrive, model_name, model_details, datetime):
  '''This function prepares submission file by
  1) save to .csv on local vm & designated folder on google drive
  2) download to local system 
  '''
  # save results to local vm and google drive
  filename = f'senti-reg_{model_name}_{model_details}_{datetime}.csv'
  df_pred.to_csv(os.path.join(path_vm, filename), index=False)
  df_pred.to_csv(os.path.join(path_gdrive,'/models', filename))

  # download to local system
  from google.colab import files
  files.download(os.path.join(path_vm, filename))

Import Datasets

In [None]:
train = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

In [None]:
print('-'*10,'train','-'*10)
train.info()
train.sample(10)
print('-'*10,'test','-'*10)
test.info()
test.sample(10)
print('-'*10, 'sample submission', '-'*10)
sample_sub.info()
sample_sub.head(3)

---------- train ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


Unnamed: 0,tweet_id,safe_text,label,agreement
8185,5PECT065,Starting all my immunizations today. It takes 6 months to complete all the ones you need for nursing school. Good thing I don't mind needles,1.0,0.666667
9258,0SLKX6BM,NO Kim and Kanye are not bad parents for piercing their daughters ears. It can be safely done after they receive certain immunizations.,1.0,0.666667
2484,V8YE8ZGY,San Bernardino County Measles Exposure Warning <url>,1.0,0.666667
6677,YA8OY2XV,"Clearly, nothing is happening in the news worth reporting considering we are talking about the measles, 24/7.",0.0,0.666667
7131,0U71H5Y2,<user> Not sure we want to label the measles vaccine a TOXIN!!! #woefullymisinformed,1.0,0.666667
6661,VJ8CN0U2,"""WHO estimates that today immunizations prevent between 2 and 3 million deaths annually and protect many more"" <url>",1.0,1.0
1318,S1LRHP1O,“<user> Damn the whole #MMR team came out #SkyboxLounge”,0.0,1.0
1422,0N4UIEW4,Great questions on school immunization requirements and licensed pharmacy interns at #HB817 hearing. #pharmacy,0.0,0.666667
8173,8VO17XGY,#choice for measles vaccinations #nochoice for unplanned pregnancies #ChrisChristie,-1.0,0.666667
7845,8A83J3DJ,Obama says vaccinate your children. Thanks Obama. I had just put my 1st down payment on an iron lung. #really? #arrogantignorance,-1.0,0.333333


---------- test ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5177 entries, 0 to 5176
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   5177 non-null   object
 1   safe_text  5176 non-null   object
dtypes: object(2)
memory usage: 81.0+ KB


Unnamed: 0,tweet_id,safe_text
2749,J3504GX5,<user> just this morning read of another celebrity refusing to vaccinate her kid because of autism fear
610,45LYPA6V,"Funny, when I was a kid I loved trains. Now, with NJ Transit, I treat them like vaccinations: tedious but necessary."
1468,A351A9ON,<user> #GOP STOP blaming #Immigrants first #Ebola now #measles Which diseases?brought their ancestors?#AINF <url>
3446,NS7AEN9U,Baby Nickson gone too soon. Sacrificed for myth of herd immunity #VaxTruth #CDCwhistleblower #hearthiswell <url>
2988,KOKLGE80,"Given the health climate of the world we all should boost immunity.Eat 2 Brazil nuts a day.No more.Contain selenIum,precursor to glutathione"
2410,GQ89V5C3,<user> <user> <user> Why are anti vaccine people responsible for 28 dead?
2177,F3O79UGT,"<user> Looks like it includes the standard: MMR, Polio, etc."
3125,LK5BE4FU,Life different when you M.A.D.E. ....... #MMR #BBoyEnt #humblebeginnings #teamfreepour #aftonshows… <url>
1491,A856A547,"“<user> By 2030, we'll cut child deaths by half—and eradicate more diseases than ever before: <url> vaccines!"
4505,V4YSDQJG,"BART Riders, Patrons Possibly Exposed to Measles: Officials in Northern California warn Bay Area Rapid Transit… <url>"


---------- sample submission ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5177 entries, 0 to 5176
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  5177 non-null   object
 1   label     5177 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 81.0+ KB


Unnamed: 0,tweet_id,label
0,00BHHHP1,0
1,00UNMD0E,0
2,01AXPTJF,0


In [None]:
# check data imbalance
pd.DataFrame({'num_of_instances':train.label.value_counts(dropna=False), 
              '%':train.label.value_counts(normalize=True,dropna=False)})

Unnamed: 0,num_of_instances,%
0.0,4908,0.490751
1.0,4053,0.405259
-1.0,1038,0.10379
,1,0.0001
0.666667,1,0.0001


In [None]:
pd.DataFrame({'num_of_instances':train.agreement.value_counts(dropna=False),
              '%':train.agreement.value_counts(normalize=True,dropna=False)})

Unnamed: 0,num_of_instances,%
1.0,5866,0.586541
0.666667,3894,0.389361
0.333333,239,0.023898
,2,0.0002


In [None]:
# check missisng values
train.isnull().sum()
train[train.isnull().any(axis=1)]

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

Unnamed: 0,tweet_id,safe_text,label,agreement
4798,RQMQ0L2A,#lawandorderSVU,,
4799,I cannot believe in this day and age some parents could be so oblivious to reality as to not #vaccinate their child.,1,0.666667,


In [None]:
test.isnull().sum()
test[test.isnull().any(axis=1)]

tweet_id     0
safe_text    1
dtype: int64

Unnamed: 0,tweet_id,safe_text
2024,Dr. JAMES SHANNON,


In [None]:
# make sure all tweet_id in test file shows up in sample submission file
test[~test['tweet_id'].isin(sample_sub['tweet_id'])]
sample_sub[~sample_sub['tweet_id'].isin(test['tweet_id'])]

Unnamed: 0,tweet_id,safe_text
2024,Dr. JAMES SHANNON,


Unnamed: 0,tweet_id,label
2452,H0VUUY2P,0


# 1. Data Cleaning
- [X] Remove NaN rows
- [X] Because some of the tweets were annotated multiple times by the same annotator, there can be duplicated rows. 
  - We first need to drop all rows, but one, with duplicated tweets where HandLabel is the same. 
  - After that we drop all duplicated tweets, since they all have different HandLabel and we do not know which one is correct. It would have been wrong if we dropped all the duplicates at once, without looking at the HandLabel, because we would threw away the highest quality data (the tweets which were labeled same multiple times).
- [x] All tweets are converted to lowercase
- [X] All links were removed since they do not contain any relevant information for this task and also '[video]' and '{link}' strings were removed because Twitter sometimes converts links to to these keywords.
- [X] A lot of tweets are usually retweets, that means that they contain 'RT @tweet_user' keywords, since 'RT @' is of no use it is replaces by '@'. ('@' is kept as indicator of tweet_user, because we will be removing them in the following steps). 
- [X] All usernames are removed. Usernames are words that start with '@'.
- [X] Dealing with hashtags: Hashtag symbol '#' is removed, but the words that follow that symbol are kept, since they usually contain a lot of useful information (they are usualy compressed representation of the tweet)



*Reference*:
- [Top 2 PyTorch + SimpleTransformers Solutions](https://github.com/rajat5ranjan/Zindi-Solutions/blob/master/To%20Vaccinate%20or%20Not%20to%20Vaccinate%20%23ZindiWeekendz/final_sub.ipynb)
- [Twitter-Sentiment-Analysis-RoBERTa](https://github.com/Data-Science-kosta/Twitter-Sentiment-Analysis-RoBERTa)
- [Twitter Sentiment Analysis with BERT + RoBERTa](https://www.kaggle.com/code/ludovicocuoghi/twitter-sentiment-analysis-with-bert-roberta)
- [Pre-Processing Tweets for Sentiment Analysis](https://medium.com/analytics-vidhya/pre-processing-tweets-for-sentiment-analysis-a74deda9993e)

In [None]:
train_clean = train.copy()
test_clean = test.copy()

### a. Missing values treatment

In [None]:
# training set
train_clean.iloc[4799,:] = ['MADEUPID', 'I cannot believe in this day and age some parents could be so oblivious to reality as to not #vaccinate their child.', 1.0, 0.666667]
train_clean.drop(index=[4798], inplace=True)
train_clean = train_clean[train_clean['label'].isin([-1.0, 0.0, 1.0])]
train_clean.reset_index(drop=True, inplace=True)

In [None]:
# test set
test_clean['safe_text'] = test_clean['safe_text'].fillna('Dr. JAMES SHANNON')
test_clean['tweet_id'] = np.where(test_clean['tweet_id']=='Dr. JAMES SHANNON', 'H0VUUY2P', test_clean['tweet_id'])

In [None]:
# audit - no missing values in both training and test sets
train_clean.isnull().sum()
test_clean.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

tweet_id     0
safe_text    0
dtype: int64

### b. Check Duplicates

In [None]:
def remove_dups(df, dataset = 'train'):
  '''
  This function removes duplicates by take two steps approach:
    step 1 We first need to drop all rows, but one, with duplicated tweets where label is the same.
    step 2 After that we drop all duplicated tweets, since they all have different label and we do not know which one is correct. 
  '''
  clean_df = df.copy()
  n_rows = len(clean_df)

  if dataset == 'train':
    # step 1 - drop duplicated tweets with same label
    clean_df = clean_df.drop_duplicates(subset=['safe_text','label'], keep='first')
    print(f'Dropped {n_rows - len(clean_df)} duplicate rows with same label')

    # step 2 - there are only duplicates with different hand label, so drop both of them
    n_rows = len(clean_df)
    clean_df = clean_df.drop_duplicates(subset=['safe_text'], keep=False)
    print(f'Dropped {n_rows - len(clean_df)} duplicate rows with different labels')

    # make sure no duplicates
    assert clean_df['safe_text'].nunique() == clean_df.shape[0], 'duplicates still exist'
    
  print("Length of clean {} set is {}".format(dataset, len(clean_df)))  
  return clean_df

In [None]:
train_clean = remove_dups(train_clean, 'train')
test_clean = remove_dups(test_clean, 'test')

Dropped 307 duplicate rows with same label
Dropped 72 duplicate rows with different labels
Length of clean train set is 9621
Length of clean test set is 5177


In [None]:
# understand text statistics - text length
train_clean['safe_text'].apply(lambda x: len(x)).describe()
print('\n')
test_clean['safe_text'].apply(lambda x: len(x)).describe()

count    9621.000000
mean      100.176697
std        29.813909
min         3.000000
25%        80.000000
50%       107.000000
75%       122.000000
max       153.000000
Name: safe_text, dtype: float64





count    5177.000000
mean       99.506471
std        29.994068
min         6.000000
25%        80.000000
50%       106.000000
75%       122.000000
max       151.000000
Name: safe_text, dtype: float64

### c. Tweets Text Data Cleaning

In [None]:
def preprocess_tweets(df, params):
  clean_df = df.copy()

  #----------
  # define helper functions
  #----------
  # [Y] recommend to include
  def remove_user_mentions(clean_df):
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: re.compile('rt @').sub('@', x).strip())      # substitute 'RT @' with '@'
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: re.sub('\@[ a-zA-Z0-9]*', '', x))            # Remove usernames. The usernames are any word that starts with @.
    return clean_df

  def remove_extra_white_space(clean_df):
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: x.strip('.').strip())
    return clean_df

 # [tentative] 
  def remove_tweet_special_chars(clean_df):
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: x.replace('#', ' '))
    #clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: x.replace('\r', ' '))
    #clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: x.replace('\n', ' ')) # BERT may USE \r \n as sentence delimiters
    #clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: x.replace('&amp;', ' '))
    #clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: x.replace('_', ' '))
    #clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: x.replace(':', ' '))
    return clean_df

  # [NO] recommend not include
  def remove_numbers(clean_df):
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x:re.sub(r'\d+', '', x).strip())
    return clean_df

  def remove_punctuation(clean_df):
    pattern = r'[^\w\s]' 
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: re.sub(pattern,'', x))
    clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
    return clean_df

  def replace_weird_chars(clean_df):
    from unidecode import unidecode
    clean_df['safe_text'] = clean_df['safe_text'].apply(unidecode)
    #clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: re.sub(r'[^\x00-\x7f]',r'', x)) # [not working, maybe related to emoji] remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    return clean_df
  
  def url_to_words(raw_text):
    raw_text=str(raw_text).strip()
    soup = BeautifulSoup(raw_text, 'html.parser')
    raw_text = soup.text
    raw_text = emoji.demojize(raw_text)
    no_coms=re.sub(r'\.com','',raw_text)
    no_urls=re.sub('https?://www','',no_coms)
    no_urls1=re.sub('https?://','',no_urls)
    try:
        no_encoding=no_urls1.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        no_encoding = no_urls1
    # letters_only = re.sub("[^a-zA-Z0-9]", " ",no_encoding) 
    words = no_encoding.lower().split()                             
    # stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in words] 
    return( " ".join( meaningful_words ))

  def process_eng_hashtag(input_text: str) -> str:
    return re.sub(r'#[a-z|A-Z]\S*',lambda m: ' '.join(re.findall('[A-Z][^A-Z]*|[a-z][^A-Z]*', m.group().lstrip('#'))), input_text)

  def lowercase(clean_df):
    clean_df['safe_text'] = clean_df['safe_text'].apply(str.lower)
    return clean_df

  def remove_patterns(df):
    #clean_df['safe_text'] = clean_df['safe_text'].str.replace('<user>|<url>|url|user|https?://\S+|www\.\S+|#\w+', '', regex=True)
    clean_df['safe_text'] = clean_df['safe_text'].str.replace('<user>|<url>', ' ', regex=True)
    return clean_df
  
  #----------
  # data preprocessing
  #----------
  # [yes] remove user mentions
  if params.get('remove_user_mentions', False): clean_df = remove_user_mentions(clean_df) 

  # [yes] remove extra space
  if params.get('remove_extra_space', False): clean_df = remove_extra_white_space(clean_df)

  # [no] convert url to words
  if params.get('url_to_words', False): clean_df['safe_text'] = clean_df['safe_text'].apply(url_to_words)

  # [no] split hashtag - has limitation on ALLCAPITALIZED words
  if params.get('process_eng_hashtag', False): clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: process_eng_hashtag(x))

  # [no] remove weird characters
  if params.get('replace_weird_chars', False): clean_df = replace_weird_chars(clean_df)

  # [no] remove numbers
  if params.get('remove_numbers', False): clean_df = remove_numbers(clean_df)

  # [no] remove punctuation
  if params.get('remove_punctuation', False): clean_df = remove_punctuation(clean_df)

  # [no for uncased model] convert to lowercase
  if params.get('lowercase', False): clean_df = lowercase(clean_df) 

  # [tentative] remove_tweet_special_chars
  if params.get('remove_tweet_special_chars', False): clean_df = remove_tweet_special_chars(clean_df) 

  # [good to keep] remove repeated patterns in tweets, such as <url>, <user>
  if params.get('remove_patterns', False): clean_df = remove_patterns(clean_df)

  return clean_df

In [None]:
params_preprocess = {
                     'remove_user_mentions':True,  # [YES]no.1 Boost performance, I'd like to keep. model does not analyze user name
                     'remove_extra_space':False,    # [YES]no.2 Boost performance, I'd like to keep
                     'remove_tweet_special_chars':False, # [TENTATIVE YES]no.3 boost performance, but cannot combine with remove_user_mentions
                     'lowercase':False,             # [NO] depends on models, tokenizer can take care of it
                     'remove_patterns':False,       # [NO] worse than do nothing, 0.50
                     'url_to_words':False,          # [NO] worse than do nothing
                     'split_hashtags':False,        # [NO] worse than do nothing. BERT can take care of hashtag based on internet discussion 
                     'replace_weird_chars':False,   # [NO] if apply unicode, not as good as do nothing; if remove non-acsii chars, even worse. [do not include]
                     'remove_numbers':False,        # [NO] worse than do nothing 
                     'remove_punctuation':False,    # [NO] worse than do nothing
                     }

In [None]:
train_clean = preprocess_tweets(train_clean, params_preprocess)
test_clean = preprocess_tweets(test_clean, params_preprocess)

In [None]:
# audit
from google.colab import data_table
data_table.enable_dataframe_formatter()

train_clean
test_clean

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #MB #MBS #MMR #STEGMANLIFE . <url>,0.0,1.000000
1,E3303EME,I'm 100% thinking of devoting my career to proving autism isn't caused by vaccines due to the IDIOTIC posts I've seen about World Autism Day,1.0,1.000000
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE YOUR CHILD",-1.0,1.000000
3,1DR6ROZ4,"I mean if they immunize my kid with something that won't secretly kill him years down the line then I'm all for it, but I don't trust that",-1.0,1.000000
4,J77ENIIE,Thanks to <user> Catch me performing at La Nuit NYC 1134 1st ave. Show starts at 6! #jennifair #mmr… <url>,0.0,1.000000
...,...,...,...,...
9995,IU0TIJDI,Living in a time where the sperm I used to waste on Jenny McCarthy is doing better than some of the sperm that became kids. #vaccineswork,1.0,1.000000
9996,WKKPCJY6,"<user> <user> In spite of all measles outbreaks, judge in MI threatens to put father in jail if I vaccinate.",1.0,0.666667
9997,ST3A265H,Interesting trends in child immunization in Oklahoma from <user> covering the <user> meeting.,0.0,1.000000
9998,6Z27IJGD,CDC Says Measles Are At Highest Levels In Decades: (<url> have returned in the U.S. to... <url>,0.0,1.000000


Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,"<user> <user> ... &amp; 4 a vaccine given 2 healthy peeps, FDA think just not worth the AE risk unfortunately."
1,00UNMD0E,Students starting school without whooping cough vaccinations <url> #scpick
2,01AXPTJF,"I'm kinda over every ep of <user> being ""ripped from the headlines."" Measles? Let's get back to crime. #SVU"
3,01HOEQJW,How many innocent children die for lack of vaccination each year? Around 1.5 million. Too bad all their parents couldn't be here. #SB277
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though risk is low: Federal officials said Wednesday they're taking steps… <url>"
...,...,...
5172,ZXVVNC5O,jenny mccarthy is on new years rockin eve. what has she done lately besides not vaccinate her kids and give us all goddamn polio??
5173,ZYIANVI8,Measles reported in Clark Co. for 1st time since 2011 <url>
5174,ZYITEHAH,"<user> issues alert regarding Measles in TX. Keep your DDx up to date, people! #Emergencymedicine"
5175,ZZ3BMBTG,I can't believe people don't vaccinate their kids! I've been vaccinated for everything and then some.


In [None]:
# # optional - spell check
# def spell_check(clean_df):
#   from pattern.en import suggest
#   clean_df['safe_text'] = clean_df['safe_text'].apply(lambda x: ' '.join(pattern.en.suggest(word)[0][0] if pattern.en.suggest(word) else word for word in x.split()))
#   return clean_df

#====================
# from tqdm import tqdm
# from spellchecker import SpellChecker

# spell = SpellChecker()
# def correct_spelling_errors(df):
#     df_copy = df.copy()
#     df_copy['safe_text'] = df_copy['safe_text'].astype(str)
#     corrected_text = []
#     for i, row in tqdm(df_copy.iterrows(), total=len(df_copy)):
#         words = row['safe_text'].split()
#         corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in words]
#         row['safe_text'] = ' '.join(corrected_words)
#         corrected_text.append(' '.join(corrected_words))
#     df_copy['safe_text'] = corrected_text
#     return df_copy

### d. Train/Validation/Test Split

In [None]:
def extract_text_and_y(df_train, df_test):

    # Training - This extraces the Train datasentece columns and pushes into variable text and one hot encodes the labels
    X_train_all = df_train['safe_text'].apply(str)
    y_train_all = df_train['label']

    #Test - This extracts the Test datasentence columns and pushes to a text
    X_test = df_test['safe_text'].apply(str)

    return X_train_all, y_train_all, X_test

In [None]:
X_train_all, y_train_all, X_test = extract_text_and_y(train_clean, test_clean)

# audit
print(len(X_train_all))
print(len(X_test))

9621
5177


In [None]:
# train validation split
def split_data(X_train_all, y_train_all, test_size, seed=seed):
  X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, 
                                                    test_size=test_size,
                                                    stratify=y_train_all, 
                                                    random_state=seed)
  print('number of instances in training set:{}'.format(len(X_train)))
  print('number of instances in validation set:{}'.format(len(X_val)))
  return X_train, X_val, y_train, y_val

In [None]:
X_train, X_val, y_train, y_val = split_data(X_train_all, y_train_all, test_size=0.15)

number of instances in training set:8177
number of instances in validation set:1444


In [None]:
df_train = pd.concat([X_train, y_train], axis=1)
df_val = pd.concat([X_val, y_val], axis=1)
df_test = pd.DataFrame({'safe_text':X_test})

# 2. Baseline Model

*Reference*:
- [ClassificationModel Simple Transformers](https://simpletransformers.ai/docs/classification-models/)
- [Configure Simple Transformer Models](https://simpletransformers.ai/docs/usage/#configuring-a-simple-transformers-model)
- [roberta-large hugging face document](https://huggingface.co/roberta-large)
- [simpletransformer official github examples - text classification](https://github.com/ThilinaRajapakse/simpletransformers/blob/master/examples/text_classification/yelp_reviews_polarity/train.py)
- [simpletransformers fine-tuning early stopping & checkpionts](https://www.youtube.com/watch?v=k8VAfWI6iAw)

In [None]:
# Load your preprocessed data and embeddings
# Define your model architecture
from simpletransformers.classification.classification_model import ClassificationModel, ClassificationArgs

def build_simple_model(model_type, model_name, X_train, num_classes, params={}):
    
    train_batch_size = params['train_batch_size']
    eval_batch_size = params['eval_batch_size']
    n_epochs= params['n_epochs']
    max_seq_len = 72
    lr = params['learning_rate']

    model_args = ClassificationArgs(
    train_batch_size = train_batch_size,
    eval_batch_size = eval_batch_size,
    reprocess_input_data = True,
    overwrite_output_dir = True,
    fp16 = True,
    do_lower_case = False,
    num_train_epochs = n_epochs,
    max_seq_length = max_seq_len,
    manual_seed = 128,
    learning_rate = lr,
    save_eval_checkpoints = True,
    save_model_every_epoch = True,
    regression= True,
    #num_classes = 1,
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    gradient_accumulation_steps=4,
    max_grad_norm = 1,   
    use_early_stopping = True,
    early_stopping_delta = 0.01,
    early_stopping_metric = "mcc",
    early_stopping_metric_minimize = False,
    early_stopping_patience = 5,
    
    )
  
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    #args.labels_list = [-1, 0, 1]

    print(model_type)
    print(model_name)

    model = ClassificationModel(model_type, model_name, num_labels=1, use_cuda=True, args=model_args)   
    return model

model_type = 'roberta'
model = 'roberta-large'

#model_type = 'xlnet'
#model = 'xlnet-base-cased'

# model_type ='xlmroberta'
# model = 'xlm-roberta-large'

#model_type = 'distilbert' 
#model = 'distilbert-base-uncased'

params = {'n_epochs': 2, 'train_batch_size': 16, 'eval_batch_size': 12, 'max_seq_len': 72, 'learning_rate': 2e-5}
simple_model = build_simple_model(model_type, model, df_train, num_classes=1, params=params)
simple_model.train_model(df_train[['safe_text','label']])

roberta
roberta-large


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classi

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/512 [00:00<?, ?it/s]

(256, 0.31704818874641205)

In [None]:
result, model_outputs, _ = simple_model.eval_model(df_val)
print(result)
print(model_outputs[0:10])

from sklearn.metrics import mean_squared_error as mse
def predict_sentiment(model, model_outputs):
    print(model)
    predicted = np.clip(model_outputs, -1, 1)   
    return predicted


test_pred_cont = predict_sentiment(model, model_outputs)
print(result)
#print('# Wrong predictions', len(wrong_predictions))
print(len(df_val['label']))
print((model_outputs.shape))
print(model_outputs[0:50])

print(f"RMSE: {mse(df_val['label'], test_pred_cont)**0.5}")

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

{'eval_loss': 0.23093815835977882}
[-0.33642578  0.05627441  0.76171875  0.04385376  0.734375    0.81152344
 -0.05197144  0.0140152   0.32617188  0.08972168]
roberta-large
{'eval_loss': 0.23093815835977882}
1444
(1444,)
[-0.33642578  0.05627441  0.76171875  0.04385376  0.734375    0.81152344
 -0.05197144  0.0140152   0.32617188  0.08972168  0.91455078  0.7265625
 -0.03909302  0.54931641 -0.0222168  -0.01265717  0.04498291  0.54199219
 -1.19335938  0.95703125  0.05596924 -0.10998535  0.65917969  1.02441406
  0.30688477 -0.35766602  0.87353516 -0.26660156  0.20056152  0.38232422
 -0.02757263  0.06433105  0.46679688  0.43408203  0.28417969  0.1940918
  0.12402344  0.14221191  0.10266113  0.68212891 -0.32714844  0.35546875
 -0.07067871  0.06176758 -0.00152493  0.77148438  0.98291016 -0.17687988
 -0.11962891  0.97851562]
RMSE: 0.4778628178486905


In [None]:
save_baseline = False

if mse(df_val['label'], test_pred_cont)**0.5 < 0.475 and save_baseline==True:
  test_data_list = list(test_clean['safe_text'].values)
  test_predictions, test_outputs = simple_model.predict(test_data_list)
  test_preds_cont = np.clip(test_predictions, -1, 1)
  df_test_preds_clipped = pd.DataFrame({'tweet_id':test_clean.tweet_id,'predicted_values':test_preds_cont})

  # customize name
  data_desc = 'clean_final'
  model_desc = 'model'+ model + '_'+ 'epochs' + str(params['n_epochs'])+'_'+'train_batch' + str(params['train_batch_size']) + '_' + 'lr' + str(params['learning_rate']) + '_' + 'seed' + str(seed)
  desc = data_desc + model_desc
  print(desc)

  # output results
  df_test_preds_clipped.to_csv('/content/test-'+desc+TIMESTAMP+'.csv',index=False)

#### Soft Voting Top Baseline Models

In [None]:
%cd /content/drive/MyDrive/nlp/zindi/outputs

/content/drive/MyDrive/nlp/zindi/outputs


In [None]:
%ls

fancy_ensemble.csv
rank-tracking.gsheet
test-no_cleanepochs2_train_batch16_lr2e-05_seed4220230413-043707.csv
test-remove_extraspace.epochs2_train_batch16_lr2e-05_seed4220230413-043707.csv
test-remove_specialcharsepochs2_train_batch16_lr2e-05_seed4220230413-043707.csv
test-remove_usermentionepochs2_train_batch16_lr2e-05_seed4220230413-043707.csv
test-remove_userurlepochs2_train_batch16_lr2e-05_seed4220230413-043707.csv


In [None]:
pv_1 = pd.read_csv('test-remove_usermentionepochs2_train_batch16_lr2e-05_seed4220230413-043707.csv')
pv_2 = pd.read_csv('test-remove_extraspace.epochs2_train_batch16_lr2e-05_seed4220230413-043707.csv')
pv_3 = pd.read_csv('test-remove_extraspace.epochs2_train_batch16_lr2e-05_seed4220230413-043707.csv')
pv_4 = pd.read_csv('test-no_cleanepochs2_train_batch16_lr2e-05_seed4220230413-043707.csv')

In [None]:
sub = pd.DataFrame({
    'tweet_id':pv_1['tweet_id'],
    'label':pv_1['predicted_values']*0.4 + pv_2['predicted_values']*0.3 + pv_3['predicted_values']*0.15 + pv_4['predicted_values']*0.15
})

sub

Unnamed: 0,tweet_id,label
0,00BHHHP1,-0.156852
1,00UNMD0E,0.434631
2,01AXPTJF,0.078113
3,01HOEQJW,0.935498
4,01JUKMAO,0.461572
...,...,...
5172,ZXVVNC5O,0.902026
5173,ZYIANVI8,0.134811
5174,ZYITEHAH,0.460181
5175,ZZ3BMBTG,1.000000


In [None]:
# sub.to_csv('fancy_ensemble.csv', index=False)

## 3. Fine-Tune Simple Transformer Models

#### Define Model Parameters

In [None]:
%cd /content

/content


In [None]:
model_type = 'roberta'  
model_name = 'roberta-large'

In [None]:
params = {'n_epochs':8,
          'train_batch_size':16,
          'eval_batch_size':12,
          'max_seq_len':72, 
          'do_lower_case':False,  # set to true if use uncased model
          'learning_rate': 2e-5,
          'gradient_accumulation_steps':4,
          'evaluate_during_training_steps':-1,
          'use_early_stopping':True,
          'early_stopping_metric_minimize': True,
          'early_stopping_delta':0.01,
          'early_stopping_patience':3,
          'seed':128
          }

In [None]:
# Num steps in epoch = num training samples / batch size
steps_per_epoch = int(np.ceil(len(df_train) / float(params['train_batch_size'])))
print('Each epoch will have {:,} steps.'.format(steps_per_epoch))
params['evaluate_during_training_steps'] = int(steps_per_epoch//2)

Each epoch will have 512 steps.


In [None]:
def build_simple_model(model_type, model_name, num_classes=1, params={}):

    # 1 - use dataclass to config classificationmodel
    model_args = ClassificationArgs(
        regression= True,
        train_batch_size = params['train_batch_size'],
        eval_batch_size  = params['eval_batch_size'],
        max_seq_length = params['max_seq_len'],
        do_lower_case = params['do_lower_case'],
        num_train_epochs = params['n_epochs'],
        learning_rate = params['learning_rate'],
        gradient_accumulation_steps = params['gradient_accumulation_steps'],
        evaluate_during_training = True,
        evaluate_during_training_steps = params['evaluate_during_training_steps'],
        use_early_stopping = params['use_early_stopping'],
        early_stopping_metric_minimize = params['early_stopping_metric_minimize'],
        early_stopping_patience = params['early_stopping_patience'],
        early_stopping_delta = params['early_stopping_delta'],
        evaluate_during_training_silent = False,
        evaluate_during_training_verbose = True,
        use_cached_eval_features = True,
        fp16=True,
        reprocess_input_data = True,
        overwrite_output_dir = True,
        manual_seed = params['seed'],
        use_multiprocessing=False,
        use_multiprocessing_for_evaluation=False,
        save_eval_checkpoints = True,
        save_model_every_epoch = True
    )
          
    # unused model args
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # 2 - build ClassificationModel
    print("Build model {} with version {}".format(model_type,model_name))
    model = ClassificationModel(model_type, 
                                model_name, 
                                num_labels=1, 
                                use_cuda=True, 
                                args=model_args)   
    return model

simpletf_model_ft = build_simple_model(model_type, model_name, params=params)

Build model roberta with version roberta-large


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classi

#### Train Model

In [None]:
simpletf_model_ft.train_model(train_df = df_train,  eval_df = df_val)

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/512 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

(1024,
 defaultdict(list,
             {'global_step': [128,
               256,
               256,
               384,
               512,
               512,
               640,
               768,
               768,
               896,
               1024,
               1024],
              'train_loss': [0.0414583683013916,
               0.05133056640625,
               0.05133056640625,
               0.020051002502441406,
               0.00018901214934885502,
               0.00018901214934885502,
               0.027561187744140625,
               0.0087890625,
               0.0087890625,
               7.1846938226372e-05,
               0.00047744810581207275,
               0.00047744810581207275],
              'eval_loss': [0.29854114627665723,
               0.23134201870507692,
               0.23134201870507692,
               0.2536509667546296,
               0.26049274343935663,
               0.26049274343935663,
               0.223142114748855,
              

#### Inspect generated files

Helper function to print the contents of a directory, with file sizes in MB.

In [None]:
def list_files_info(data_dir):
    '''
    Prints out the files in a directory along with their sizes in MB.
    '''
    # Check out the sizes on the saved files.
    files = list(os.listdir(data_dir))
    print(data_dir)
    rows = []

    # For each file in the directory...
    for f in files:
        # Get the file size, in MB
        f_size = float(os.stat(data_dir + '/' + f).st_size) / 2**20
        # Print the filename and its size.
        print("     {:25s}    {:>8.2f} MB".format(f, f_size))
        rows.append([f, '{:.2f} MB'.format(f_size)])
    print('')

    return pd.DataFrame(rows, columns=['File', 'Size'])


This cache folder stores the tokenized and encoded text data.

In [None]:
list_files_info('./cache_dir')

./cache_dir
     cached_dev_roberta_72_1_2        4.64 MB
     cached_train_roberta_72_1_2       26.27 MB



Unnamed: 0,File,Size
0,cached_dev_roberta_72_1_2,4.64 MB
1,cached_train_roberta_72_1_2,26.27 MB


The `outputs` folder contains the final model, plus all of the checkpoints.

In [None]:
list_files_info('./outputs/')

./outputs/
     training_progress_scores.csv        0.00 MB
     special_tokens_map.json          0.00 MB
     checkpoint-640-epoch-5           0.00 MB
     training_args.bin                0.00 MB
     tokenizer_config.json            0.00 MB
     checkpoint-384-epoch-3           0.00 MB
     checkpoint-256                   0.00 MB
     checkpoint-896-epoch-7           0.00 MB
     vocab.json                       0.76 MB
     checkpoint-768-epoch-6           0.00 MB
     checkpoint-128-epoch-1           0.00 MB
     tokenizer.json                   2.01 MB
     pytorch_model.bin             1355.73 MB
     config.json                      0.00 MB
     best_model                       0.00 MB
     eval_results.txt                 0.00 MB
     checkpoint-512                   0.00 MB
     checkpoint-1024-epoch-8          0.00 MB
     checkpoint-768                   0.00 MB
     merges.txt                       0.44 MB
     checkpoint-256-epoch-2           0.00 MB
     checkpoint-512-

Unnamed: 0,File,Size
0,training_progress_scores.csv,0.00 MB
1,special_tokens_map.json,0.00 MB
2,checkpoint-640-epoch-5,0.00 MB
3,training_args.bin,0.00 MB
4,tokenizer_config.json,0.00 MB
5,checkpoint-384-epoch-3,0.00 MB
6,checkpoint-256,0.00 MB
7,checkpoint-896-epoch-7,0.00 MB
8,vocab.json,0.76 MB
9,checkpoint-768-epoch-6,0.00 MB


### Evaluate on Test Set Using The Best Model


Let's load the model from the checkpoint which performed best on the validation set. 

This is how we combat overfitting--the "final" model (at the end of all training epochs) will perform best on the training set, but may not generalize as well to new data. 

So, instead, we use an earlier checkpoint where the training loss was higher but the validation loss was at its lowest! 

In [None]:
model = ClassificationModel(
    "roberta", "outputs/checkpoint-640-epoch-5"
)

**On Validation Set**

In [None]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(df_val)

Running Evaluation:   0%|          | 0/121 [00:00<?, ?it/s]

In [None]:
val_preds_clipped = np.clip(model_outputs, -1, 1)
print(f"MSE on validation set: {result}")
print(f"RMSE on validation set: {rmse(df_val['label'], val_preds_clipped)}")

MSE on validation set: {'eval_loss': 0.223142114748855}
RMSE on validation set: 0.4710265177689367


**Run on Competition Test Text**

In [None]:
test_data_list = list(test_clean['safe_text'].values)
test_predictions_ft, test_outputs_ft = model.predict(test_data_list)
test_preds_cont_ft = np.clip(test_predictions_ft, -1, 1)
df_test_preds_clipped_ft = pd.DataFrame({'tweet_id':test_clean.tweet_id,'predicted_values':test_preds_cont_ft})

  0%|          | 0/121 [00:00<?, ?it/s]

In [None]:
# audit - make sure total records in test match with sample submission
print(len(df_test_preds_clipped_ft))
print(len(sample_sub))
df_test_preds_clipped_ft.head()
df_test_preds_clipped_ft['predicted_values'].describe()

5177
5177


Unnamed: 0,tweet_id,predicted_values
0,00BHHHP1,-0.744141
1,00UNMD0E,-0.370361
2,01AXPTJF,0.927734
3,01HOEQJW,0.004223
4,01JUKMAO,0.672852


count     5.173000e+03
mean      1.335819e-01
std       3.987433e-01
min      -1.000000e+00
25%       0.000000e+00
50%      1.040839e-310
75%       3.496170e-03
max       1.000000e+00
Name: predicted_values, dtype: float64

In [None]:
# customize name
data_desc = 'clean_final'
model_desc = 'epochs' + str(params['n_epochs'])+'_'+'train_batch' + str(params['train_batch_size']) + '_' + 'lr' + str(params['learning_rate']) + '_' + 'seed' + str(seed) + 'ft'
desc = data_desc + model_desc +'rb_ft'
print(desc)

# output results
df_test_preds_clipped_ft.to_csv(desc+'.csv',index=False)

clean_finalepochs8_train_batch16_lr2e-05_seed1024ftrb_ft
