In [102]:
import numpy as np
import joblib
import os

MIN_POSSIBILITY = .35
CURR_PATH = os.path.abspath(os.curdir)
MODELS_PATH = os.path.join(CURR_PATH, 'models')

from node_structure import make_node_structure, Node
ROOT_NODE = make_node_structure(MODELS_PATH)

In [103]:
def article_classify(article: str, node: Node, diff_coef=.1) -> set:
    """
    Gets an article text and Node with pipeline
    Returns the set of string names of categories which the article belongs to
    
    params article: the article that has to be classified
    params node: has downloaded pipeline
    params diff_coef: the difference between the max category result and the others
    """
    
    try:
        y_pred = node.pipeline.predict_proba(article).reshape(-1)
                
        mask = y_pred > (y_pred.max() - diff_coef)
        classes = sorted(list(node.get_children_names())) 
        # other PCs may had loaded sets differently, not in alphabet order, find out how
                
        result = set(np.array(classes)[mask]) if y_pred.max() >= MIN_POSSIBILITY else set()
        
    except Exception as error:
        print('node name: ', node.name) 
        print('error: ', error)
        assert node.pipeline == 'Error! No path exists!', '!!!No such pipeline loaded!!!'
        result = {}
    
    return result

In [104]:
def article_tree_classify(article: list, node=ROOT_NODE, diff_coef=.1) -> dict:
    """
    Classifies an article above all the tree, which you can see in node structure
    Gets Node structure, article, diff_coef, which is metioned above
    Returns a dictionary, which is used rucursively to create categories and 
    subcategories and ... which this article is in
    
    params article:  the text article that has to be classified
    params node: a tree structure that classifies the article
    params diff_coef: the difference between the max category result and the others
    """
    
    if not node.children_set:
        return {}
    
    diction = {}
    result_categories = article_classify(article, node, diff_coef)
    
    while result_categories:
        category = result_categories.pop()
        
        for node in node.children_set:
            
            if category == node.name:
                diction[category] = article_tree_classify(article, node)            
 
    return diction

In [105]:
article_path = r'queries\metals, metals finance, metals, metals news, metals stocks, metals futures, metals shares\3 Gold Stocks to Buy in July.txt'
f = open(article_path, 'r')
f = [f.read()]
print(f)

["While soaking up the golden rays of the summer sun, how about adding some gold-focused stocks to your portfolio?\nA member of The Motley Fool since 2006, Scott began contributing content in 2013. His focus includes renewable energy, gold, and water utilities. Follow him on Twitter. . . Follow @scott81236\nWith the first full month of summer under way, it's not only the days that are growing warmer. The gold market is also heating up. On the first day of July, August gold futures flirted with the $1,800 threshold, crossing it briefly and then falling. Nonetheless, it was the highest price gold futures had reached since August 2011.\nBetween COVID-19 cases rising in several areas throughout the U.S. and geopolitical tensions remaining high, it's clear that investors are motivated to maintain exposure to gold as a hedge against overall market volatility. Investors, therefore, may find themselves motivated to add some gold exposure to their portfolios. And it just so happens that there a

In [95]:
article_path = r'queries\energy, energy, energy finance, energy news, energy stocks, energy futures, energy shares\2 Top Energy Stocks to Buy Now.txt'
f = open(article_path, 'r')
f = [f.read()]
print(f)

['Amy Legate-Wolfe | April 25, 2020 | More on: SU PKI SU\nItвЂ™s been a rough few years for CanadaвЂ™s oil and gas industry. Even before the market crash, energy stocks across the board fell as the oil and gas glut grew, and production around the world continued as normal. As the market crash hit, more bad news came, with Russia announcing it would not cut back oil production, and Saudi Arabia it would even increase its production. Then, a market rally led to some hope.\nBoth countries announced cuts to production, and there are rumours of further cuts on the way. This coupled with a market rally has brought energy stocks up a bit, but there is definitely still room for improvement. But before you buy up any old energy stock, there are a few that should perform better a lot sooner than the rest.\nOne of the top energy stocks analysts recommend is Suncor Energy Inc. (TSX:SU) (NYSE:SU) . This Warren Buffett pick has been sliced in half with the market crash. But with a market rally on th

In [106]:
article_tree_classify(f)

{'metals': {'gold': {}}}

In [84]:
%timeit article_tree_classify(f)

235 ms ± 53.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
%timeit make_node_structure()

137 µs ± 8.23 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [107]:
%timeit make_node_structure(MODELS_PATH)

23.5 s ± 231 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
