# Transformer Model Experiment

In [1]:
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

from sentiment.dataset.load_dataset import LoadDataset
from sentiment.dataset.tokenizer import SimpleTokenizer

# Project tokenizer handed to the loader.
# NOTE(review): presumably this is what produces the `token` column seen later — confirm.
simple_tokenizer = SimpleTokenizer()

# Loader settings collected in one mapping so the source (Mongo database and
# collection), the row limit and the text column can be read at a glance.
load_options = dict(
    database_name="PLP",
    collection_name="AStarCOVID",
    n_rows="max",
    tokenizer=simple_tokenizer,
    column_name="Text",
)
dataset = LoadDataset(**load_options)

# NOTE(review): the repr below echoes the Mongo host and a sample document —
# consider clearing this output before sharing the notebook.
dataset

NOTICE: sentiment log file will be at /Users/johnnylu/tweet_sentiment/sentiment/logs/sentiment.log


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnnylu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnnylu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/johnnylu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2022-10-18 10:01:03,270 : INFO : Initialized Mongo Connection to db:PLP, collection: AStarCOVID



        Database: Database(MongoClient(host=['johdev.asuscomm.com:9981'], document_class=dict, tz_aware=False, connect=True), 'PLP'),
        Collection: Collection(Database(MongoClient(host=['johdev.asuscomm.com:9981'], document_class=dict, tz_aware=False, connect=True), 'PLP'), 'AStarCOVID')
        Length : 161390
        Sample: {'Text': 'After 2,964 corona positive cases and 60 casualties in a day, the '
         'PTI govt decides to ease the lockdown. Clearly, the govt has no plan '
         '&amp; supposeldy expects ppl to develop “herd immunity.” So, for the '
         'love of God, exercise social distancing and stay safe.\n'
         '\n'
         'https://t.co/kd9LjqWF1C',
 '_id': ObjectId('634638b37380598a236433a2'),
 'anger_intensity': 0.498,
 'country_region': 'Singapore',
 'date_stamp': '2020-06-02 00:00:00',
 'emotion_category': 'anger',
 'fear_intensity': 0.472,
 'joy_intensity': 0.266,
 'keyword_used': 'covid',
 'sadness_intensity': 0.43,
 'sentiment_category': 'nega

In [2]:
# Materialize the loaded collection as a pandas DataFrame.
# In the recorded run this took about 1 min 11 s for 161,390 rows.
df = dataset.to_pandas()

2022-10-18 10:14:05,723 : INFO : Returning Pandas DataFrame with maximum row: 161390
100%|██████████| 161390/161390 [01:11<00:00, 2249.01it/s]


In [3]:
# Inspect column names, non-null counts and dtypes of the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161390 entries, 0 to 161389
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   _id                 161390 non-null  object 
 1   tweet_ID            161390 non-null  int64  
 2   user_ID             161390 non-null  int64  
 3   t1                  161390 non-null  int64  
 4   t2                  161390 non-null  int64  
 5   t3                  161390 non-null  int64  
 6   t4                  161390 non-null  int64  
 7   t5                  161390 non-null  int64  
 8   t6                  161390 non-null  int64  
 9   t7                  161390 non-null  int64  
 10  t8                  161390 non-null  int64  
 11  t9                  161390 non-null  int64  
 12  t10                 161390 non-null  int64  
 13  valence_intensity   161390 non-null  float64
 14  anger_intensity     161390 non-null  float64
 15  fear_intensity      161390 non-nul

In [4]:
# Preview the first rows of the frame.
# NOTE(review): the rendered output includes user IDs and tweet text —
# consider clearing it before sharing the notebook.
df.head()

Unnamed: 0,_id,tweet_ID,user_ID,t1,t2,t3,t4,t5,t6,t7,...,fear_intensity,sadness_intensity,joy_intensity,sentiment_category,emotion_category,keyword_used,country_region,date_stamp,Text,token
0,634637137380598a236355ae,1245550415581716481,37874853,1,0,0,0,0,0,0,...,0.49,0.437,0.281,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec..."
1,634637137380598a236355af,1245550321511718912,44290654,1,0,0,0,0,0,0,...,0.49,0.437,0.281,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec..."
2,634637137380598a236355b0,1245550270190419969,115624161,1,1,1,1,0,0,0,...,0.512,0.446,0.162,negative,fear,covid,Singapore,2020-04-02 00:00:00,Quarantine stress baking? 😆\n\nhttps://t.co/zH...,"[quarantine, stress, baking, 😆, URL]"
3,634637137380598a236355b1,1245550206457954305,20155794,1,0,0,0,0,0,0,...,0.423,0.34,0.319,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,Every vaccine and treatment in development for...,"[every, vaccine, treatment, development, covid..."
4,634637137380598a236355b2,1245548702233583618,35202527,1,1,1,1,0,0,0,...,0.348,0.425,0.255,negative,sadness,covid,Singapore,2020-04-02 00:00:00,this was the second read.. \nhttps://t.co/wYID...,"[wa, second, read, .., URL]"


In [5]:
# Keep only the sentiment label, the raw tweet text and the pre-tokenized words;
# `.copy()` detaches the result from the wider frame.
model_columns = ["sentiment_category", "Text", "token"]
df = df.loc[:, model_columns].copy()

In [6]:
# Confirm that only the three selected columns remain and none contain nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161390 entries, 0 to 161389
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   sentiment_category  161390 non-null  object
 1   Text                161390 non-null  object
 2   token               161390 non-null  object
dtypes: object(3)
memory usage: 3.7+ MB


In [7]:
# Wrap the trimmed DataFrame in a Hugging Face `Dataset` so the `datasets`
# split/map utilities can be used on it. (`Dataset` is imported with the other
# imports in the first cell rather than mid-notebook.)
hg_dataset = Dataset.from_pandas(df)
hg_dataset

Dataset({
    features: ['sentiment_category', 'Text', 'token'],
    num_rows: 161390
})

In [8]:
# Hold out 20% of the rows as a test split. The fixed seed makes the shuffle
# deterministic, so the same rows land in train/test after a kernel restart
# and downstream results stay comparable between runs.
hg_dataset = hg_dataset.train_test_split(test_size=0.2, seed=42)

In [9]:
# Show the resulting train/test splits and their row counts.
hg_dataset

DatasetDict({
    train: Dataset({
        features: ['sentiment_category', 'Text', 'token'],
        num_rows: 129112
    })
    test: Dataset({
        features: ['sentiment_category', 'Text', 'token'],
        num_rows: 32278
    })
})

In [10]:
# Expose the sentiment column under the name `label` in every split.
hg_dataset = hg_dataset.rename_column("sentiment_category", "label")

In [11]:
# Look at one training row to verify the renamed `label` field and the word list.
hg_dataset["train"][0]

{'label': 'positive',
 'Text': '"New Zealand records first COVID-19 cases in 102 days"  https://t.co/Z4PHbals1y',
 'token': ['new',
  'zealand',
  'record',
  'first',
  'covid',
  '19',
  'case',
  '102',
  'day',
  'URL']}

In [12]:
# Name of the pretrained checkpoint whose tokenizer is loaded here.
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
# Check whether the loaded tokenizer is a "fast" implementation (True in the recorded run).
# NOTE(review): the later `inputs.tokens()` / `inputs.word_ids()` calls presumably rely on this — confirm.
tokenizer.is_fast

True

In [14]:
# Tokenize a single training example to see what the model tokenizer produces.
# The example is already a list of words, hence `is_split_into_words=True`.
first_example_words = hg_dataset["train"][0]["token"]
inputs = tokenizer(first_example_words, is_split_into_words=True)
inputs

{'input_ids': [101, 2047, 3414, 2501, 2034, 2522, 17258, 2539, 2553, 9402, 2154, 24471, 2140, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
# Show the sub-word pieces for the example; in the recorded run "covid" was
# split into "co" + "##vid" and "URL" into "ur" + "##l".
inputs.tokens()

['[CLS]',
 'new',
 'zealand',
 'record',
 'first',
 'co',
 '##vid',
 '19',
 'case',
 '102',
 'day',
 'ur',
 '##l',
 '[SEP]']

In [16]:
# Map every sub-word piece back to the index of the word it came from
# (None for the [CLS]/[SEP] positions in the recorded output).
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 9, None]

In [17]:
def tokenize_function(doc):
    """Tokenize the pre-split words of a batch with the model tokenizer.

    Args:
        doc: Mapping with a ``'token'`` entry holding the word lists to encode
            (a batch of rows when used with ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those word lists
        (``input_ids`` and ``attention_mask`` in the recorded run).
    """
    # `truncation=True` caps each sequence at the tokenizer's maximum length so
    # an unusually long row cannot yield inputs longer than the model accepts.
    return tokenizer(doc['token'], is_split_into_words=True, truncation=True)

# Apply the tokenizer to every split; `batched=True` passes rows to
# `tokenize_function` in batches instead of one at a time.
tokenized_dataset = hg_dataset.map(function=tokenize_function, batched=True)

  0%|          | 0/130 [00:00<?, ?ba/s]

  0%|          | 0/33 [00:00<?, ?ba/s]

In [18]:
# Confirm both splits now carry the `input_ids` and `attention_mask` columns.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'Text', 'token', 'input_ids', 'attention_mask'],
        num_rows: 129112
    })
    test: Dataset({
        features: ['label', 'Text', 'token', 'input_ids', 'attention_mask'],
        num_rows: 32278
    })
})

In [19]:
# Inspect one fully tokenized training row (original fields plus model inputs).
tokenized_dataset['train'][0]

{'label': 'positive',
 'Text': '"New Zealand records first COVID-19 cases in 102 days"  https://t.co/Z4PHbals1y',
 'token': ['new',
  'zealand',
  'record',
  'first',
  'covid',
  '19',
  'case',
  '102',
  'day',
  'URL'],
 'input_ids': [101,
  2047,
  3414,
  2501,
  2034,
  2522,
  17258,
  2539,
  2553,
  9402,
  2154,
  24471,
  2140,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}