In [1]:
import os

import numpy as np
import pandas as pd
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [2]:
# Token-level embeddings: one GloVe word embedding plus a forward and a
# backward Flair language-model embedding.
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# Pool the stacked token embeddings into a single document-level vector.
# No pooling option is passed, so the library default applies (the original
# author's note says this is the mean -- confirm for the installed version).
document_embeddings = DocumentPoolEmbeddings(
    [
        glove_embedding,
        flair_embedding_backward,
        flair_embedding_forward,
    ]
)

In [3]:
# Embed a two-clause, pre-tokenised sentence and show the length of the
# resulting document embedding (the cell output).
sentence = Sentence('The grass is green . And the sky is blue .')
document_embeddings.embed(sentence)  # attaches the embedding to `sentence` in place
len(sentence.get_embedding())

4196

In [4]:
# Same check on a shorter sentence; the recorded output matches the length
# reported for the longer sentence, i.e. the pooled vector size does not
# depend on the number of tokens.
sentence2 = Sentence('The grass is green.')
document_embeddings.embed(sentence2)  # attaches the embedding to `sentence2` in place
len(sentence2.get_embedding())

4196

In [5]:
# Vocabulary for the toy classification task.
# Words that will be sampled as examples of the 'real' label.
real_words = [
    'king',
    'monarch',
    'queen',
    'emperor',
    'empress',
]

# Words that will be sampled as examples of the 'fake' label.
fake_words = [
    'worker',
    'dweller',
    'resident',
    'craftswoman',
    'craftsman',
]

In [6]:
# Seed NumPy's global RNG so the sampled words and the shuffle order -- and
# therefore the CSV splits derived from `data` -- are the same on every
# Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Number of examples drawn per class.
data_number = 300

# np.random.choice samples with replacement, so each class consists of its
# five words repeated many times.
# NOTE(review): any positional train/dev/test split of `data` will therefore
# contain the same words in every split, so a held-out score on this data
# mostly measures memorisation rather than generalisation.
real_data = [{'label': 'real', 'text': x} for x in np.random.choice(real_words, data_number)]
fake_data = [{'label': 'fake', 'text': x} for x in np.random.choice(fake_words, data_number)]

# Interleave the two classes; the shuffle is in place.
data = real_data + fake_data
np.random.shuffle(data)

In [7]:
# Write an 80% / 10% / 10% train / test / dev split of `data` as headerless,
# tab-separated files under ./data.

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs('./data', exist_ok=True)

# Pin the column order explicitly: the files have no header, so readers must
# rely on position (column 0 = label, column 1 = text).
frame_data = pd.DataFrame(data, columns=['label', 'text'])

# Split boundaries, computed once from the number of examples.
train_end = int(len(data) * 0.8)
test_end = int(len(data) * 0.9)

csv_options = {'sep': '\t', 'index': False, 'header': False}
frame_data.iloc[:train_end].to_csv('./data/train.csv', **csv_options)
frame_data.iloc[train_end:test_end].to_csv('./data/test.csv', **csv_options)
frame_data.iloc[test_end:].to_csv('./data/dev.csv', **csv_options)

In [8]:
# Positional layout of the headerless, tab-separated CSV files:
# column 0 holds the label and column 1 holds the text.
column_name_map = {1: "text", 0: "label"}

# Load the three splits from the `data` folder.
corpus = CSVClassificationCorpus(
    'data',
    column_name_map,
    delimiter='\t',
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

# Label dictionary built from the corpus.
label_dict = corpus.make_label_dictionary()

# Token-level embeddings fed to the document RNN ("fast" Flair variants).
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# RNN-based document embedding; token vectors are first re-projected to
# 256 dimensions, and the RNN uses a hidden size of 512.
# Note: this rebinds `document_embeddings`, replacing any earlier value.
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

trainer = ModelTrainer(classifier, corpus)

# Train for a single epoch, writing artefacts to the `model` folder.
# Left as the last expression so the returned result is displayed.
trainer.train('model', max_epochs=1)

2019-11-23 22:21:35,231 Reading data from data
2019-11-23 22:21:35,232 Train: data/train.csv
2019-11-23 22:21:35,233 Dev: data/dev.csv
2019-11-23 22:21:35,234 Test: data/test.csv
2019-11-23 22:21:35,237 Computing label dictionary. Progress:


100%|██████████| 480/480 [00:00<00:00, 2543.48it/s]

2019-11-23 22:21:35,573 [b'fake', b'real']





2019-11-23 22:21:37,418 ----------------------------------------------------------------------------------------------------
2019-11-23 22:21:37,421 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-23 22:21:50,794 ----------------------------------------------------------------------------------------------------
2019-11-23 22:21:50,807 Testing using best model ...
2019-11-23 22:21:50,808 loading file model/best-model.pt
2019-11-23 22:21:53,561 1.0	1.0	1.0
2019-11-23 22:21:53,563 
MICRO_AVG: acc 1.0 - f1-score 1.0
MACRO_AVG: acc 1.0 - f1-score 1.0
fake       tp: 32 - fp: 0 - fn: 0 - tn: 28 - precision: 1.0000 - recall: 1.0000 - accuracy: 1.0000 - f1-score: 1.0000
real       tp: 28 - fp: 0 - fn: 0 - tn: 32 - precision: 1.0000 - recall: 1.0000 - accuracy: 1.0000 - f1-score: 1.0000
2019-11-23 22:21:53,564 ----------------------------------------------------------------------------------------------------


{'test_score': 1.0,
 'dev_score_history': [1.0],
 'train_loss_history': [0.3224512050549189],
 'dev_loss_history': [tensor(0.0717)]}

In [9]:
# Rebind `classifier` to the checkpoint stored at ./model/best-model.pt,
# so the cells below use the model loaded from disk.
classifier = TextClassifier.load('./model/best-model.pt')

2019-11-23 22:21:53,618 loading file ./model/best-model.pt


In [10]:
def predict(text, model=None):
    """Classify ``text`` and return the labels attached to it.

    Args:
        text: String to classify; it is wrapped in a ``Sentence`` before
            being passed to the model.
        model: Optional object exposing ``predict(sentence)``. When omitted,
            the notebook-level ``classifier`` is used, so existing
            ``predict(text)`` calls behave exactly as before. Passing a model
            explicitly avoids depending on whichever object the global name
            currently points to.

    Returns:
        The ``labels`` attribute of the sentence after prediction.
    """
    if model is None:
        # Looked up at call time, so a re-loaded `classifier` is picked up.
        model = classifier
    sentence = Sentence(text)
    model.predict(sentence)  # annotates `sentence` in place
    return sentence.labels

In [11]:
# Smoke test on 'king', one of the entries in real_words.
predict("king")

[real (0.9592270255088806)]

In [12]:
# Print the prediction for every vocabulary word: first the real_words
# group, then a blank separator line, then the fake_words group.
for group_index, vocabulary in enumerate((real_words, fake_words)):
    if group_index > 0:
        print()
    for word in vocabulary:
        print(word, predict(word))

king [real (0.9592270255088806)]
monarch [real (0.9225751757621765)]
queen [real (0.9279063940048218)]
emperor [real (0.9752405881881714)]
empress [real (0.9544050693511963)]

worker [fake (0.9427129626274109)]
dweller [fake (0.9414699077606201)]
resident [fake (0.9087944626808167)]
craftswoman [fake (0.8037753701210022)]
craftsman [fake (0.9538108110427856)]


In [13]:
# Probe with 'prince', which appears in neither real_words nor fake_words.
predict("prince")

[real (0.8956785202026367)]

In [14]:
# Probe with 'hammer', which appears in neither real_words nor fake_words.
predict("hammer")

[fake (0.6483678817749023)]

In [15]:
# Probe with 'crown', which appears in neither real_words nor fake_words.
predict("crown")

[real (0.8655360341072083)]

In [16]:
# Probe with 'paper', which appears in neither real_words nor fake_words.
predict("paper")

[fake (0.7046210169792175)]

In [17]:
# Probe with 'kingdom', which appears in neither real_words nor fake_words.
predict("kingdom")

[real (0.8526270389556885)]

In [18]:
# Probe with 'lord', which appears in neither real_words nor fake_words.
predict("lord")

[real (0.7863451242446899)]

In [19]:
# Probe with 'farmer', which appears in neither real_words nor fake_words.
predict("farmer")

[fake (0.8844714164733887)]

In [20]:
# Probe with 'palace', which appears in neither real_words nor fake_words.
predict("palace")

[real (0.7795140743255615)]