# Outliers

In [1]:
from synth.utils import Config, Context
import yaml
from sqlalchemy.orm import sessionmaker

# Load the project settings from the YAML file one directory up. The encoding
# is pinned to UTF-8 so that parsing does not depend on the locale default of
# the machine running the notebook.
with open('../config.yml', 'r', encoding='utf-8') as f:
    config = Config(**yaml.safe_load(f))

# The context is built from the loaded config; the session is bound to the
# engine the context exposes as `target_engine`.
context = Context(config)
session = sessionmaker(bind=context.target_engine)()

In [2]:
import pandas as pd

# Only the titles are used in this notebook, so ask pandas to select just that
# column (plus the `id` index) rather than loading every column of the 'Output'
# table and discarding the rest afterwards.
df = pd.read_sql('Output', session.connection(), index_col='id', columns=['title'])
df

Unnamed: 0_level_0,title
id,Unnamed: 1_level_1
1,Molecular phylogeny within true bugs (Hemipter...
2,Gene-flow solid frozen - the roles of intrinsi...
3,Age and rate of speciation in the adaptive rad...
4,Did glacial advances during the Pleistocene in...
5,Contribution to the Pupae of the Western Palea...
...,...
12276,
12277,New species of scaly tree ferns (Cyatheaceae) ...
12278,Slowly but surely: gradual diversification and...
12279,New Guinea has the world’s richest island flora


In [3]:
import pandas as pd
from nltk.stem.porter import PorterStemmer
import spacy
import re

# Shared text-processing objects used by get_tokens below.
# The spaCy pipeline is loaded with its 'ner' and 'parser' components disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'parser'],
)
stemmer = PorterStemmer()

# Matches any character that is not a lowercase ASCII letter, a hyphen or a space.
no_punct_rgx = re.compile(r'[^a-z- ]')
# Matches a hyphen that has whitespace on both sides.
en_em_dash_rgx = re.compile(r'\s-\s')

def get_tokens(txt):
    """Reduce a title to a space-separated string of stemmed nouns and adjectives.

    The text is lower-cased, every character other than ``a-z``, hyphen and
    space is replaced by a space, and hyphens with whitespace on both sides are
    dropped. The remainder is tagged by the spaCy pipeline ``nlp``; tokens
    tagged NOUN or ADJ whose lemma is longer than one character are kept and
    Porter-stemmed from their surface text.

    :param txt: the title text; ``None``, NaN or any other non-string value is
        treated as a missing title
    :return: the stems joined by single spaces, or ``''`` for a missing title
    """
    # A missing title can arrive as None or as a float NaN depending on how
    # pandas represents the NULL; neither has .lower(), so return early.
    if not isinstance(txt, str):
        return ''
    cleaned = no_punct_rgx.sub(' ', txt.lower())
    cleaned = en_em_dash_rgx.sub(' ', cleaned)
    kept_pos = ('NOUN', 'ADJ')
    # Note: the length filter inspects the lemma, while the stem is computed
    # from the token's surface text.
    stems = [stemmer.stem(token.text) for token in nlp(cleaned)
             if token.pos_ in kept_pos and len(token.lemma_) > 1]
    return ' '.join(stems)

# Tokenise every title and store the result alongside the original text.
df['tokenised_title'] = df['title'].map(get_tokens)
df

Unnamed: 0_level_0,title,tokenised_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Molecular phylogeny within true bugs (Hemipter...,true bug
2,Gene-flow solid frozen - the roles of intrinsi...,gene flow solid role intrins extrins factor mi...
3,Age and rate of speciation in the adaptive rad...,age rate speciat adapt radiat antarct fish
4,Did glacial advances during the Pleistocene in...,glacial advanc pleistocen influenc demograph h...
5,Contribution to the Pupae of the Western Palea...,contribut pupa western palearct moth noctuoidea
...,...,...
12276,,
12277,New species of scaly tree ferns (Cyatheaceae) ...,new speci new combin famili
12278,Slowly but surely: gradual diversification and...,gradual diversif phenotyp evolut hyper divers ...
12279,New Guinea has the world’s richest island flora,world richest island flora


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.svm import SVC

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load these two files if they come from a trusted source.
with open('vectoriser.pkl', 'rb') as f:
    vectoriser = pickle.load(f)

with open('svc.model', 'rb') as f:
    svc_classifier = pickle.load(f)

transformed_titles = vectoriser.transform(df.tokenised_title)
categories = svc_classifier.predict(transformed_titles)

# For scikit-learn's SVC (which the import and file name suggest this model
# is), predict_proba comes from a separately fitted probability model and its
# arg-max can disagree with predict(). Taking the row maximum could therefore
# report the probability of a class other than the one stored in `category`,
# so look up the probability of the class that was actually predicted.
class_probabilities = svc_classifier.predict_proba(transformed_titles)
column_of_class = {label: column for column, label in enumerate(svc_classifier.classes_)}
predicted_columns = [column_of_class[label] for label in categories]
probabilities = class_probabilities[list(range(len(predicted_columns))), predicted_columns]

df['category'] = categories
df['probability'] = probabilities

df

Unnamed: 0_level_0,title,tokenised_title,category,probability
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Molecular phylogeny within true bugs (Hemipter...,true bug,life,0.716677
2,Gene-flow solid frozen - the roles of intrinsi...,gene flow solid role intrins extrins factor mi...,life,0.511378
3,Age and rate of speciation in the adaptive rad...,age rate speciat adapt radiat antarct fish,life,0.942029
4,Did glacial advances during the Pleistocene in...,glacial advanc pleistocen influenc demograph h...,life,0.755831
5,Contribution to the Pupae of the Western Palea...,contribut pupa western palearct moth noctuoidea,life,0.880933
...,...,...,...,...
12276,,,life,0.938151
12277,New species of scaly tree ferns (Cyatheaceae) ...,new speci new combin famili,life,0.961715
12278,Slowly but surely: gradual diversification and...,gradual diversif phenotyp evolut hyper divers ...,life,0.988476
12279,New Guinea has the world’s richest island flora,world richest island flora,life,0.963082


In [5]:
from scipy import stats
import numpy as np

# Standardise the probabilities, then keep the rows whose z-score falls
# below the (negative) threshold, i.e. unusually low probabilities.
threshold = -2.5
df['z_score'] = stats.zscore(df['probability'])

outliers = df.loc[df['z_score'] < threshold]
outliers.sort_values(by='probability')

Unnamed: 0_level_0,title,tokenised_title,category,probability,z_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7396,Actividad ecológica de productos naturales mar...,ecolog situ potenci project,life,0.500000,-3.146491
1085,Spectral properties of pierid butterflies,spectral properti pierid butterfli,life,0.500000,-3.146491
8650,Insular species swarm goes underground: two ne...,new troglobiont cylindroiulu milliped madeira,life,0.500000,-3.146491
8590,Anatomy of a new cetotheriid genus and species...,anatomi new cetotheriid genu speci miocen here...,life,0.500000,-3.146491
7992,Updated taxonomy of recent and fossil European...,taxonomi recent fossil european mammalia calib...,life,0.500000,-3.146491
...,...,...,...,...,...
6672,"«Agriculture, Hunsbandry, Hunting and Fishing»...",agricultur hunt fish neolith civilis then useu...,life,0.576915,-2.507821
8407,Semi-permeable species boundaries in Iberian b...,semi permeabl speci boundari iberian barbel,earth,0.577042,-2.506769
6905,African Ichthyology at the Royal Museum for Ce...,african museum introduct past current research...,life,0.577076,-2.506483
6800,Holocene regional gradients of dust provenance...,region gradient dust proven flux,earth,0.577618,-2.501984


In [6]:
# Count how often each stem occurs across the outlier titles and show the
# ten most frequent.
outlier_tokens = (
    outliers['tokenised_title']
    .str.split()
    .explode()
    .value_counts()
)
outlier_tokens.head(10)

new          80
speci        75
morpholog    24
evolut       24
fossil       22
descript     20
record       19
systemat     19
studi        19
antarct      18
Name: tokenised_title, dtype: int64