# Tokenization

In [None]:
from flair.data import Sentence

# Hand flair an untokenized string and ask it to tokenize it for us
# (use_tokenizer=True); the trailing period becomes a token of its own.
raw_text = 'The grass is green.'
sentence = Sentence(raw_text, use_tokenizer=True)

# Show the sentence summary (text plus token count).
print(sentence)

Sentence: "The grass is green ." - 5 Tokens

# Creating a Sentence

In [None]:
# A Sentence object holds the text that we may want to embed or tag.
from flair.data import Sentence

# Build the sentence from a string that is already whitespace-tokenized.
sentence = Sentence('The grass is green .')

# Print the object to see what's in there.
print(sentence)

# Access a token through its token id (ids are 1-based) ...
print(sentence.get_token(4))

# ... or through its list index (0-based); both lines print the token "green".
print(sentence[3])

# Iterating over a sentence yields its tokens in order.
for token in sentence:
    print(token)

Output:
Sentence: "The grass is green ." - 5 Tokens
Token: 4 green
Token: 4 green
Token: 1 The
Token: 2 grass
Token: 3 is
Token: 4 green
Token: 5 .

In [None]:
# Sentence objects are language-agnostic: here one wraps Arabic text.
from flair.data import Sentence

# The string is already whitespace-tokenized, so no tokenizer flag is passed.
sentence = Sentence("انا احب لغة بايثون")

# Summary of the sentence (text plus token count).
print(sentence)

# Look up the same token twice: by token id, then by list index.
print(sentence.get_token(4))
print(sentence[3])

# Finally, list every token in order.
for tok in sentence:
    print(tok)

Output:            
Sentence: "انا احب لغة بايثون" - 4 Tokens                                       
Token: 4 بايثون                            
Token: 4 بايثون                        
Token: 1 انا                   
Token: 2 احب
Token: 3 لغة
Token: 4 بايثون

# Adding Tags to Tokens

In [None]:
from flair.data import Sentence

# Let flair tokenize the raw string for us.
raw_text = 'The grass is green.'
sentence = Sentence(raw_text, use_tokenizer=True)

# Summary of the sentence (text plus token count).
print(sentence)

# Attach a tag of type 'ner' with value 'color' to the token at index 3.
color_token = sentence[3]
color_token.add_tag('ner', 'color')

# Render the sentence with its tags inline.
print(sentence.to_tagged_string())

Sentence: "The grass is green ." - 5 Tokens
The grass is green <color> .

In [None]:
from flair.data import Sentence
from flair.data import Label

# Make a sentence object by passing an untokenized string
# together with the use_tokenizer flag.
sentence = Sentence('The grass is green.', use_tokenizer=True)

# Print the object to see what's in there.
print(sentence)

# Add a tag of type 'ner' to the token at index 3 ("green").
sentence[3].add_tag('ner', 'color')

# Print the sentence with all tags of this type.
print(sentence.to_tagged_string())

# Read the tag back: a Label exposes the tag value and a confidence score.
tag: Label = sentence[3].get_tag('ner')
print(f'"{sentence[3]}" is tagged as "{tag.value}" with confidence score "{tag.score}"')


Sentence: "The grass is green ." - 5 Tokens
The grass is green <color> .
"Token: 4 green" is tagged as "color" with confidence score "1.0"

# Adding Labels to Sentences

In [None]:
from flair.data import Sentence

sentence = Sentence('France is the current world cup winner.')

# Add a single label to a sentence.
sentence.add_label('sports')

# A sentence can also belong to multiple classes.
sentence.add_labels(['sports', 'world cup'])

# Labels can also be set while initializing the sentence.
sentence = Sentence('France is the current world cup winner.', labels=['sports', 'world cup'])

# Print the sentence summary, then each of its labels.
print(sentence)
for label in sentence.labels:
    print(label)

Sentence: "France is the current world cup winner." - 7 Tokens - Labels: [sports (1.0), world cup (1.0)]
sports (1.0)
world cup (1.0)

This indicates that the sentence belongs to these two classes, each with confidence score 1.0.


# Named Entity Recognition (NER)

- What is Named Entity Recognition (NER)?
Named entity recognition (NER) is a sub-task of information extraction (IE) that seeks out and categorises specified entities in a body or bodies of text.

The dataset consists of the following tags:
•geo = Geographical Entity                  
•org = Organization                 
•per = Person                     
•gpe = Geopolitical Entity             
•tim = Time indicator                          
•art = Artifact                       
•eve = Event                          
•nat = Natural Phenomenon

In [None]:
from flair.models import SequenceTagger
from flair.data import Sentence

# Load the pre-trained NER tagger.
tagger = SequenceTagger.load('ner')

sentence = Sentence('George Washington went to Washington .')

# Predict NER tags; the predictions are stored on the sentence itself.
tagger.predict(sentence)

# Print the sentence with the predicted tags inline.
print(sentence.to_tagged_string())

# Print each predicted entity span.
for entity in sentence.get_spans('ner'):
    print(entity)

# Dump the sentence and its 'ner' entities as a dictionary.
print(sentence.to_dict(tag_type='ner'))

George <B-PER> Washington <E-PER> went to Washington <S-LOC> .
PER-span [1,2]: "George Washington"
LOC-span [5]: "Washington"
{'text': 'George Washington went to Washington .', 'labels': [], 'entities': [{'text': 'George Washington', 'start_pos': 0, 'end_pos': 17, 'type': 'PER', 'confidence': 0.9967882037162781}, {'text': 'Washington', 'start_pos': 26, 'end_pos': 36, 'type': 'LOC', 'confidence': 0.9993709921836853}]}

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Text to analyse (already whitespace-tokenized).
sentence = Sentence('I love Berlin .')

# Fetch the pre-trained NER model and annotate the sentence with it.
tagger = SequenceTagger.load('ner')
tagger.predict(sentence)

print(sentence)
print('The following NER tags are found:')

# One line per entity span that the tagger predicted.
for span in sentence.get_spans('ner'):
    print(span)

Sentence: "I love Berlin ." - 4 Tokens                  
The following NER tags are found:                    
LOC-span [3]: "Berlin"               

# Text Classification and prediction

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# Load the pre-trained English sentiment classifier.
classifier = TextClassifier.load('en-sentiment')

sentence = Sentence('This film hurts. It is so bad that I am confused.')

# Predict the sentiment; the predicted label is stored on the sentence.
classifier.predict(sentence)

# Print the predicted labels.
print(sentence.labels)

[NEGATIVE (1.0)]

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# Pre-trained English sentiment model.
classifier = TextClassifier.load('en-sentiment')

# Classify a short review; the prediction is attached to the sentence.
review = Sentence('Flair is pretty neat!')
classifier.predict(review)

# Show the predicted label(s) together with their confidence.
print('Sentence above is: ', review.labels)

Sentence above is: 
[POSITIVE (0.8746314644813538)]

# Word Embedding

In [None]:
from flair.embeddings import WordEmbeddings
from flair.data import Sentence

# Set up GloVe word embeddings.
glove_embedding = WordEmbeddings('glove')

# Whitespace-tokenized example sentence.
sentence = Sentence('The grass is green .')

# Embedding happens in place: each token receives its vector.
glove_embedding.embed(sentence)

# Show every token followed by its embedding tensor.
for tok in sentence:
    print(tok)
    print(tok.embedding)

Token: 1 The
tensor([-0.0382, -0.2449,
0.7281, -0.3996,
-0.5755, 0.0875, 0.2879, -0.0673,
0.3340, -0.3385, -0.3174, -0.4834,
0.4495, -0.4697,
0.0263,
-0.5415,
0.1439, 0.2346, -0.3102, 0.0862,
-0.7179, -0.4153,
0.2033, -0.1276,
-0.3656, -0.5486, -0.0629,
0.0162, -0.0171,
-0.5203, -0.1459, 0.8278, 0.2706])                       
Token: 2 grass tensor([-0.8135,
0.9404, -0.2405, -0.1350,
-0.5478, -0.3537, 0.0734, 0.2587,
0.1950,
0.5346,
0.6166,
0.7424,
0.7284,
0.0578,
-0.3262, -1.3641,
1.0822, -0.2296, 0.6039, 0.5541,
-0.1637, -0.8468, 0.0741, -0.6216,
-0.0161, -0.4972, -0.5534, -0.4037,
0.4928, 0.9488,
0.2040,
2.3436, -0.2207, 8:886, -8.6927.])

# Document Embedding

In [None]:
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings
from flair.data import Sentence

# Word-level embeddings that feed the document embedding.
glove_embedding = WordEmbeddings('glove')

# Document embedding built on top of the list of word embeddings.
document_embeddings = DocumentRNNEmbeddings([glove_embedding])

# Create an example sentence.
sentence = Sentence('The grass is green . And the sky is blue .')

# Embed the sentence with our document embedding (stored on the sentence).
document_embeddings.embed(sentence)

# Now check out the embedded sentence.
print(sentence.get_embedding())

tensor([-0.0382, -0.2449, 0.7281, -0.3996, -0.5755, 0.0875, 0.2879, -0.0673, 0.3340, -0.3385, -0.3174, -0.4834, 0.4495, -0.4697, 0.0263, -0.5415, 0.1439, 0.2346, -0.3102, 0.0862, -0.7179, -0.4153, 0.2033, -0.1276, -0.3656, -0.5486, -0.0629, 0.0162, -0.0171, -0.5203, -0.1459, 0.8278, 0.2706])

# Loading Training Data

The Corpus Object:                         
The Corpus represents a dataset that you use to train a model. It consists of a list of train sentences, a list of dev sentences, and a list of test sentences.

In [None]:
import flair.datasets

# Load the Universal Dependencies English corpus.
corpus = flair.datasets.UD_ENGLISH()

# Number of sentences per split, in the order: train, test, dev.
for split in (corpus.train, corpus.test, corpus.dev):
    print(len(split))

# First sentence of the *test* split: plain, then with its POS tags.
first_test_sentence = corpus.test[0]
print(first_test_sentence)
print(first_test_sentence.to_tagged_string('pos'))

1. Flair datasets \ud english en ewt-ud-test. conflu                     
2019-07-26 03:02:17,588 removing temp file C: \Users \hhourani \AppData\Local\T                         
2019-07-26 03:02:20,727 https://raw.githubusercontent.com/UniversalDependend train.conllu not found in cache, downloading to C: \Users\hourani\AppData\Lo
13303045B [00:06, 2072067.11B/s]                             
2019-07-26 03:02:27,961 copying C: \Users\hhourani \AppData\Local\Temp\tmp12km
I. flair\datasets \ud_englishlen_ewt-ud-train.conllu
2019-07-26 03:02:27,981 removing temp file C: \Users \hhourani\AppData\Local\T
2019-07-26 03:02:27,985 Reading data from C: \Users \houranil.flair\datasets\
2019-07-26 03:02:27,985 Train: C: \Users\hhourani\.flair\datasets\ud_english\
2019-07-26 03:02:27,985 Test: C: \Users\hhourani\.flair\datasets\ud_englishle
2019-07-26 03:02:27,985 Dev: C: \Users \hhouranil.flair\datasets\ud_english\en
12543
2077
2002
Sentence: "What if Google Morphed Into GoogleOS ?" - 7 Tokens
What <WP> if <IN> Google <NNP> Morphed <VBD> Into <IN> GoogleOS <NNP> ? <.>

# Classify Spam

Using Transfer Learning and Pre-trained Language Models to Classify Spam link:                         
https://heartbeat.fritz.ai/using-transfer-learning-and-pre-trained-language-models-to-classify-spam-549c0f56c20

Downloading SMSSpamCollection.txt link:
http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/

In [None]:
# SMSSpamCollection.txt
ham       Go until jurong point, crazy.
ham       ok lar...Joking wif u oni.                     
spam      Free entry in 2 a wkly comp to           


# Train the model
Dataset
Word Embedding
Document Embedding
Classifier
Train

In [None]:
# Importing libraries
import pandas as pd
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
from flair.data import Sentence

# Loading and pre-processing the data.
# The file is tab-separated without a header row: column 0 holds the class
# (ham/spam) and column 1 the message text.
data = pd.read_csv('SMSSpamCollection.txt', delimiter='\t', header=None)
data = data.rename(columns={0: "label", 1: "text"}).drop_duplicates()

# Prefix every class with "__label__" (FastText-style), the format the
# classification corpus loader reads labels from.
data['label'] = '__label__' + data['label'].astype(str)

# Split 80% / 10% / 10% into train / test / dev, all written tab-separated.
n_rows = len(data)
train_end = int(n_rows * 0.8)
test_end = int(n_rows * 0.9)
data.iloc[0:train_end].to_csv('train.csv', sep='\t', index=False, header=False)
data.iloc[train_end:test_end].to_csv('test.csv', sep='\t', index=False, header=False)
data.iloc[test_end:].to_csv('dev.csv', sep='\t', index=False, header=False)

# Train the model
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./'), test_file='test.csv', dev_file='dev.csv', train_file='train.csv')
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
classifier = TextClassifier(
    document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)

# Predict
# Load the model from the training output directory used above ('./').
classifier = TextClassifier.load('./best-model.pt')

# Pass the message as a plain string: wrapping it in a list makes flair treat
# the whole message as one single token.
sent = ("FREE entry into our £250 weekly comp just "
        "send the word WIN to 80086 NOW. 18 T&C www.txttowin.co.uk")
sentence = Sentence(sent)
classifier.predict(sentence)

# The string form of a label is "<value> (<score>)"; keep only the value.
label = str(sentence.labels[0]).split()[0]
print(f"{label}\t{sentence}")

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# Load the classifier saved by the training run.
classifier = TextClassifier.load('./best-model.pt')

# Pass the message as a plain string: wrapping it in a list makes flair treat
# the whole message as one single token.
sent = ("FREE entry into our £250 weekly comp just "
        "send the word WIN to 80086 NOW. 18 T&C www.txttowin.co.uk")
sentence = Sentence(sent)
classifier.predict(sentence)

# The string form of a label is "<value> (<score>)"; keep only the value.
label = str(sentence.labels[0]).split()[0]
print(f"{label}\t{sentence}")

Output:

2019-09-28 22:05:06,990 loading file ./best-model.pt
C:\Python37New\lib\site-packages\torch\serialization.py:574: DeprecationWarning: Call to deprecated class DocumentLSTMEmbeddings. (The functionality of this class is moved to 'DocumentRNNEmbeddings') -- Deprecated since version 0.4.
  result = unpickler.load()
spam
Sentence: "FREE entry into our £250 weekly comp just
send the word WIN to
80086 Now. 18 T&C www.txttowin.co.uk" - 1 Tokens - Labels: [spam (0.8216946125030518)]

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# Load the classifier saved by the training run.
classifier = TextClassifier.load('./best-model.pt')

# Pass the message as a plain string: wrapping it in a list makes flair treat
# the whole message as one single token.
sent = "Hi Hussam, How are you?"
sentence = Sentence(sent)
classifier.predict(sentence)

# The string form of a label is "<value> (<score>)"; keep only the value.
label = str(sentence.labels[0]).split()[0]
print(f"{label}\t{sentence}")

Output: 
    
2019-09-28 22:12:08,972 loading file ./best-model.pt
C:\Python37New\lib\site-packages\torch\serialization.py:574: DeprecationWarning: Call to deprecated class DocumentLSTMEmbeddings. (The functionality of this class is moved to 'DocumentRNNEmbeddings') -- Deprecated since version 0.4.
  result = unpickler.load()
ham
Sentence: "Hi Hussam, How are you?" - 1 Tokens - Labels: [ham (0.8704296946525574)]

# Classify Spam using Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package path; the private `sklearn.linear_model.logistic`
# module is not available in newer scikit-learn releases.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score

# Read the data: tab-separated, no header row (column 0 = label, column 1 = text).
df = pd.read_csv('SMSSpamCollection.txt', delimiter='\t', header=None)
df = df.rename(columns={0: 'label', 1: 'text'})

# Input and output variables
X = df['text']
y = df['label']

seed = 5          # fixed random_state so the split is reproducible
test_size = 0.33  # fraction of the rows held out for testing

# Split dataset into train and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed)

# Convert to a matrix of TF-IDF features.
# The vectorizer is fitted on the training text only and then re-used
# (transform, not fit_transform) for the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# Model training
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Prediction
predictions = classifier.predict(X_test)

# Model evaluation
score = accuracy_score(y_test, predictions)
f_score = f1_score(y_test, predictions, average='micro')
print("The accuracy score (Logistic Regression) is:", score)
print("The F score-Micro (Logistic Regression) is:", f_score)

C:\Python37New\lib\site-packages\sklearn\linear_model\logistic.py: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
The accuracy score (Logistic Regression) is: 0.9668297988036977
The F score-Micro (Logistic Regression) is: 0.966829798803697

Flair's Score:                                
EPOCH 7 done: loss 0.0427 - lr 0.1000
DEV : loss 0.04332328587770462 - score 0.9903
BAD EPOCHS (no improvement): 1

# References:

https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_1_BASICS.md

https://github.com/zalandoresearch/flair

https://heartbeat.fritz.ai/using-transfer-learning-and-pre-trained-language-models-to-classify-spam-549c0f56c20

http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/

https://pythonprogramming.net/sentiment-analysis-python-textblob-vader/

https://textblob.readthedocs.io/en/dev/quickstart.html

https://textblob.readthedocs.io/en/dev/advanced_usage.html
