In [1]:
# https://fasttext.cc/docs/en/english-vectors.html

In [2]:
#!unzip wiki-news-300d-1M.vec.zip

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_json("filtered_and_field_1.json")
df.head()

Unnamed: 0,DOI,title,abstract,subject,cleaned_abstract,filtered_subject,all_fields,field_Art,field_Biology,field_Business,field_Chemistry,field_Geology,field_Humanities,field_Math,field_Medicine,field_Physics,field_Psychology,field_Social,field_Tech
0,10.1163/1568525043083505,aristotle fr. 44 rose: midas and silenus,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract scholars have identified two supposed...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
1,10.1163/1568525043083532,loca loquuntur. lucretius' explanation of the ...,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract a discussion of the second part of lu...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
2,10.1163/1568525043083541,poverty and demography: the case of the gracch...,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract according to many ancient historians ...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
3,10.1163/1568525043083514,old persian in athens revisited (ar. ach. 100),<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract the old persian line in aristophanes ...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
4,10.1163/1568527053083412,religion and violence: what can sociology offer?,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Religious studies, History]",abstract this essay presents a sketch of a soc...,"[History, Religious studies]","[Humanities, Humanities]",0,0,0,0,0,1,0,0,0,0,0,0


In [76]:
from sklearn.model_selection import train_test_split

target = [column for column in df.columns if "field_" in column]
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_abstract"], df[target],
                                                   train_size=0.8, test_size=0.2,
                                                   random_state=42)

In [77]:
X_train.shape

(1694750,)

# Pretrained fasttext as w2v

In [None]:
from gensim.models import KeyedVectors

fasttext = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec', binary=False)

In [None]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
def transform(sentence, embeddings):
    tokens = tokenizer.tokenize(sentence)
    curr_embs = []
    for token in tokens:
        if token.lower() in embeddings:
            curr_embs.append(embeddings[token.lower()])
    res = np.mean(curr_embs, axis=0)
    return res

def array_transform(array, embeddings):
    new_array = np.empty(shape=(300,))
    for elem in array:
        new_elem = transform(elem, embeddings=embeddings)
        new_array = np.vstack([new_array, new_elem])
    return new_array

In [None]:
X_train_transformed = array_transform(X_train, embeddings=fasttext)
X_test_transformed = array_transform(X_test, embeddings=fasttext)

In [None]:
y_train_classes = {
    field: y_train[field] for field in target
}

In [None]:
from sklearn.linear_model import LogisticRegression

logits = {
    field: LogisticRegression(n_jobs=-1, max_iter=1_000_000, random_state=123) for field in target
}

for field, logit in logits.items():
    print(f"Fitting {field} logistic regression")
    logit.fit(X_train_transformed, y_train_classes[field])

In [None]:
from sklearn.metrics import classification_report as clf_report

for field in target:
    print(field.upper())
    y_pred = logits[field].predict(X_test_transformed)
    print(clf_report(y_test[field], y_pred))
    print()

# Fasttext с нуля

In [97]:
import gensim

gensim.__version__

'3.8.3'

In [98]:
from gensim.models import FastText

model = FastText(sentences=X_train, size=300, min_count=50, iter=10)

In [99]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
def transform(sentence, model):
    tokens = tokenizer.tokenize(sentence)
    curr_embs = []
    for token in tokens:
        curr_embs.append(model.wv[token.lower()])
    res = np.mean(curr_embs, axis=0)
    return res

def array_transform(array, model):
    new_array = np.empty(shape=(300,))
    for elem in array:
        new_elem = transform(elem, model=model)
        new_array = np.vstack([new_array, new_elem])
    return new_array[1:]

In [100]:
%%time

X_test_transformed = X_test.apply(transform, model=model)

CPU times: user 36min 23s, sys: 1.29 s, total: 36min 25s
Wall time: 36min 41s


In [101]:
%%time

X_train_transformed = X_train.apply(transform, model=model)

CPU times: user 2h 25min 19s, sys: 2.31 s, total: 2h 25min 21s
Wall time: 2h 25min 34s


In [102]:
#X_train_transformed = array_transform(X_train, model=model)
#X_test_transformed = array_transform(X_test, model=model)

In [103]:
X_train_transformed = np.stack(X_train_transformed)
X_test_transformed = np.stack(X_test_transformed)

In [104]:
y_train_classes = {
    field: y_train[field] for field in target
}

In [105]:
from sklearn.linear_model import LogisticRegression

logits = {
    field: LogisticRegression(n_jobs=-1, max_iter=1_000_000, random_state=123) for field in target
}

for field, logit in logits.items():
    print(f"Fitting {field} logistic regression")
    logit.fit(X_train_transformed, y_train_classes[field])

Fitting field_Art logistic regression
Fitting field_Biology logistic regression
Fitting field_Business logistic regression
Fitting field_Chemistry logistic regression
Fitting field_Geology logistic regression
Fitting field_Humanities logistic regression
Fitting field_Math logistic regression
Fitting field_Medicine logistic regression
Fitting field_Physics logistic regression
Fitting field_Psychology logistic regression
Fitting field_Social logistic regression
Fitting field_Tech logistic regression


In [106]:
from sklearn.metrics import classification_report as clf_report

for field in target:
    print(field.upper())
    y_pred = logits[field].predict(X_test_transformed)
    print(clf_report(y_test[field], y_pred))
    print()

FIELD_ART
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    419220
           1       0.04      0.00      0.00      4468

    accuracy                           0.99    423688
   macro avg       0.52      0.50      0.50    423688
weighted avg       0.98      0.99      0.98    423688


FIELD_BIOLOGY
              precision    recall  f1-score   support

           0       0.69      0.98      0.81    290398
           1       0.54      0.05      0.08    133290

    accuracy                           0.69    423688
   macro avg       0.61      0.51      0.45    423688
weighted avg       0.64      0.69      0.58    423688


FIELD_BUSINESS
              precision    recall  f1-score   support

           0       0.97      1.00      0.99    411821
           1       0.00      0.00      0.00     11867

    accuracy                           0.97    423688
   macro avg       0.49      0.50      0.49    423688
weighted avg       0.94      0.97