In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torchvision

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import SVC
import torch
import transformers as ppb


In [4]:
df = pd.read_csv("../data/blogtext.csv")

In [5]:
sample = df[:10000]

In [6]:
signs = {"Aries":1, "Taurus":2, "Gemini":3, "Cancer":4, "Leo":5, "Virgo":6, "Libra":7, "Scorpio":8, "Sagittarius":9, "Capricorn":10, "Aquarius":11, "Pisces":12}

In [7]:
sample['sign'] = sample['sign'].map(signs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['sign'] = sample['sign'].map(signs)


In [8]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
tokenized = sample["text"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=100, truncation=True)))

In [10]:
maxSeq = max(tokenized, key= lambda i: len(i))
maxLen = len(maxSeq)

padded = np.array([i + [0]*(maxLen - len(i)) for i in tokenized.values])

In [11]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(10000, 100)

In [None]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = sample['sign']
labels

## Train/Test Split

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [None]:
ovr_clf = OneVsRestClassifier(SVC())
ovr_clf.fit(train_features, train_labels)

In [None]:
ovr_clf.score(test_features, test_labels)

In [None]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Stop here

# Steps for data cleaning for feeding into the BERT model:
- Tokenization: break the sentence down into tokens
- Add the [CLS] token to the beginning of the sentence
- Add the [SEP] token to the end of the sentence
- Pad the sentence with [PAD] tokens until the length is equal to the maximum length
- Convert each token to their corresponding IDs in the model

In [10]:
tz = ppb.BertTokenizer.from_pretrained("bert-base-cased")

Downloading: 100%|██████████| 213k/213k [00:00<00:00, 1.09MB/s]
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 14.5kB/s]
Downloading: 100%|██████████| 436k/436k [00:00<00:00, 1.09MB/s]


In [13]:
txt = sample["text"][0]

In [26]:
tokens = tz.tokenize(txt)
tokens.insert(0, "[CLS]")

In [30]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the 'C:\Users\user\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [31]:
from nltk import tokenize

In [34]:
import nltk

In [35]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [32]:
tokenize.sent_tokenize(txt)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\user/nltk_data'
    - 'C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python38\\nltk_data'
    - 'C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python38\\share\\nltk_data'
    - 'C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python38\\lib\\nltk_data'
    - 'C:\\Users\\user\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [17]:
tz.convert_tokens_to_ids(tz.tokenize(txt))

[1130,
 14467,
 1144,
 1151,
 1276,
 113,
 116,
 120,
 118,
 1620,
 5097,
 117,
 1105,
 125,
 119,
 126,
 19443,
 1104,
 119,
 185,
 1181,
 2087,
 7004,
 114,
 1986,
 178,
 1138,
 1106,
 3074,
 1235,
 1233,
 1412,
 1264,
 2301,
 1144,
 14659,
 1122,
 1105,
 10123,
 28066,
 119]

In [20]:
len(txt.split())

28

In [19]:
len(tz.tokenize(txt))

41