In [41]:
import os

import numpy as np
import pandas as pd
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus

In [42]:
# Token-level embeddings: one GloVe word-vector model plus a forward and a
# backward Flair language-model embedding.
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward, flair_embedding_backward = (
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
)

# Document-level embedding that pools the stacked token embeddings into a
# single vector per sentence (default pooling; the original author notes it
# is the mean).
token_embeddings = [
    glove_embedding,
    flair_embedding_backward,
    flair_embedding_forward,
]
document_embeddings = DocumentPoolEmbeddings(token_embeddings)

In [43]:
# Embed a two-clause, pre-tokenized sentence and display the length of the
# resulting document vector (4196 in the recorded run).
sentence = Sentence('The grass is green . And the sky is blue .')
document_embeddings.embed(sentence)
document_vector = sentence.get_embedding()
len(document_vector)

4196

In [44]:
# Repeat the check with a shorter sentence: the recorded output is again
# 4196, i.e. the document vector length does not depend on sentence length.
sentence2 = Sentence('The grass is green.')
document_embeddings.embed(sentence2)
document_vector2 = sentence2.get_embedding()
len(document_vector2)

4196

In [45]:
# Vocabulary for the two classes of the toy classification task.
# Words labelled 'real' when the corpus is generated:
real_words = [
    'king',
    'monarch',
    'queen',
    'emperor',
    'empress',
]
# Words labelled 'fake' when the corpus is generated:
fake_words = [
    'worker',
    'dweller',
    'resident',
    'craftswoman',
    'craftsman',
]

In [46]:
# Build a synthetic corpus: `data_number` single-word samples per class, each
# drawn with replacement from the matching word list, then shuffled so the
# positional train/test/dev split that follows mixes both classes.
data_number = 300
random_seed = 42  # fixed so the sampled corpus and shuffle order are reproducible

# A private, seeded RandomState makes Restart & Run All regenerate the exact
# same corpus without altering NumPy's global random state.
rng = np.random.RandomState(random_seed)

# str(...) turns the sampled NumPy string scalars into plain Python strings.
real_data = [{'label': 'real', 'text': str(word)} for word in rng.choice(real_words, data_number)]
fake_data = [{'label': 'fake', 'text': str(word)} for word in rng.choice(fake_words, data_number)]

data = real_data + fake_data
rng.shuffle(data)

In [47]:
# Write the shuffled samples to tab-separated files without header or index:
# first 80% -> train.csv, next 10% -> test.csv, final 10% -> dev.csv.
# The column order is pinned to (label, text) because the files carry no header
# and the corpus reader addresses columns by position.
frame_data = pd.DataFrame(data, columns=['label', 'text'])

# to_csv does not create missing directories, so make sure the target exists.
data_dir = './data'
os.makedirs(data_dir, exist_ok=True)

# Compute the split boundaries once so the three slices cannot drift apart.
train_end = int(len(frame_data) * 0.8)
test_end = int(len(frame_data) * 0.9)

splits = [
    ('train.csv', frame_data.iloc[:train_end]),
    ('test.csv', frame_data.iloc[train_end:test_end]),
    ('dev.csv', frame_data.iloc[test_end:]),
]
for file_name, split_frame in splits:
    split_frame.to_csv(os.path.join(data_dir, file_name), sep='\t', index=False, header=False)

In [51]:
# Positional layout of the tab-separated files: column 0 is the label,
# column 1 is the text.
column_name_map = {0: "label", 1: "text"}

# Load the three splits from the `data` folder.
corpus = CSVClassificationCorpus(
    'data',
    column_name_map,
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
    delimiter='\t',
)

label_dict = corpus.make_label_dictionary()

# Token embeddings for the classifier: GloVe plus the "fast" forward and
# backward Flair language models.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# NOTE: this rebinds `document_embeddings`, replacing the pooled embedding
# created earlier in the notebook with an RNN-based document embedding.
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus)

# Train for a single epoch; artifacts are written under `model/`. The call is
# left as the last expression so its result dictionary is displayed.
trainer.train('model', max_epochs=1)

2019-11-23 20:43:24,158 Reading data from data
2019-11-23 20:43:24,160 Train: data/train.csv
2019-11-23 20:43:24,161 Dev: data/dev.csv
2019-11-23 20:43:24,162 Test: data/test.csv
2019-11-23 20:43:24,165 Computing label dictionary. Progress:


100%|██████████| 480/480 [00:00<00:00, 2086.57it/s]

2019-11-23 20:43:24,640 [b'fake', b'real']





2019-11-23 20:43:26,368 ----------------------------------------------------------------------------------------------------
2019-11-23 20:43:26,369 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-23 20:43:40,138 ----------------------------------------------------------------------------------------------------
2019-11-23 20:43:40,139 Testing using best model ...
2019-11-23 20:43:40,140 loading file model/best-model.pt
2019-11-23 20:43:42,264 1.0	1.0	1.0
2019-11-23 20:43:42,267 
MICRO_AVG: acc 1.0 - f1-score 1.0
MACRO_AVG: acc 1.0 - f1-score 1.0
fake       tp: 27 - fp: 0 - fn: 0 - tn: 33 - precision: 1.0000 - recall: 1.0000 - accuracy: 1.0000 - f1-score: 1.0000
real       tp: 33 - fp: 0 - fn: 0 - tn: 27 - precision: 1.0000 - recall: 1.0000 - accuracy: 1.0000 - f1-score: 1.0000
2019-11-23 20:43:42,268 ----------------------------------------------------------------------------------------------------


{'test_score': 1.0,
 'dev_score_history': [1.0],
 'train_loss_history': [0.3398606846729914],
 'dev_loss_history': [tensor(0.1034)]}

In [55]:
# Rebind `classifier` to the model loaded from the saved checkpoint file.
best_model_path = './model/best-model.pt'
classifier = TextClassifier.load(best_model_path)

2019-11-23 20:43:57,783 loading file ./model/best-model.pt


In [56]:
def predict(text):
    """Classify ``text`` with the notebook-level ``classifier``.

    The text is wrapped in a ``Sentence``, passed to ``classifier.predict``
    (which annotates the sentence in place), and the sentence's ``labels``
    attribute is returned.
    """
    wrapped = Sentence(text)
    classifier.predict(wrapped)
    return wrapped.labels

In [57]:
# Sanity check on a word taken from real_words.
predict("king")

[real (0.9485031962394714)]

In [58]:
# Print the prediction for every word of both word lists: the real_words
# group first, then a blank separator line, then the fake_words group.
for group_index, word_group in enumerate((real_words, fake_words)):
    if group_index > 0:
        print()
    for word in word_group:
        print(word, predict(word))

king [real (0.9485031962394714)]
monarch [real (0.8778084516525269)]
queen [real (0.9221799373626709)]
emperor [real (0.9736762046813965)]
empress [real (0.9492069482803345)]

worker [fake (0.9481209516525269)]
dweller [fake (0.9157224893569946)]
resident [fake (0.8962298035621643)]
craftswoman [fake (0.7751086354255676)]
craftsman [fake (0.9255284667015076)]


In [59]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("prince")

[real (0.8359680771827698)]

In [60]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("hammer")

[fake (0.6282742023468018)]

In [61]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("crown")

[real (0.8554590344429016)]

In [62]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("paper")

[fake (0.769315779209137)]

In [63]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("kingdom")

[real (0.808620274066925)]

In [67]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("lord")

[real (0.8040608763694763)]

In [70]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("farmer")

[fake (0.9035744667053223)]

In [71]:
# Probe with a word that appears in neither real_words nor fake_words.
predict("palace")

[real (0.7915701866149902)]