In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Switch IPython's exception reporting to the "Plain" mode.
%xmode Plain
# Load the memory_profiler IPython extension.
# NOTE(review): no cell visible in this notebook uses its magics — confirm it is still needed.
%load_ext memory_profiler

Exception reporting mode: Plain


In [3]:
# Read the article corpus from ./categories_articles. Bytes that are not valid
# UTF-8 are substituted rather than raising (they show up as '�' in previews).
data = load_files(
    'categories_articles',
    decode_error="replace",
    encoding="utf-8",
)

In [4]:
# Distinct numeric class ids in the target vector and the number of documents per id.
labels, counts = np.unique(
    data.target,
    return_counts=True,
)

In [5]:
# Display the class ids next to their document counts.
(labels, counts)

(array([0, 1, 2, 3]), array([5212, 3385, 4492, 6485], dtype=int64))

In [6]:
# Translate each numeric class id into its category name ...
labels_sort = np.array(data.target_names)[labels]
# ... and show the number of documents per category name.
{category: n_docs for category, n_docs in zip(labels_sort, counts)}

{'agricult': 5212, 'crypto': 3385, 'energy': 4492, 'metals': 6485}

In [7]:
# Hold out 30% of the raw documents for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.3,
    random_state=4,
)
# Peek at the first 80 characters of the first ten training documents.
[document[:80] for document in X_train[:10]]

['Vestas secures Senegal debut \r\n03092018\r\n\r\n1256\r\nWind\r\r\nVestas has won an engine',
 'This Morning in Metals Trump Administration Close to Decision on China Trade Pol',
 'Does Scalia�s Death Help the EPA Clean Power PlanThe final official act of Senio',
 'French farmers fret over subsidies in post-Brexit EU budget talks4 Min ReadLIZIN',
 'Line Announces a New Blockchain Investment Fund, Lists TRXLine, the Tokyo-based ',
 'ICSA calls for �comprehensive research� on TB in deerThe Irish Cattle and Sheep ',
 'This Morning in Metals Copper Rises on China�s Stimulus AnnouncementCharles/Adob',
 'Daily (28.10.2019) NBP spot gas price jumped by over 8% on Friday on forecasts f',
 'McConnell No TPP Vote This Year; Companies Still Can�t Track Conflict MineralsTh',
 'Ex-Kraken Trading Desk Manager Files Suit Against ExchangeA former Kraken employ']

In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score

In [9]:
def _text_pipeline(vectorizer_step, classifier_step, max_features=3000):
    """Build an unfitted ``vectorizer -> classifier`` text pipeline.

    Parameters
    ----------
    vectorizer_step : tuple of (str, type)
        Step name and vectorizer class (``CountVectorizer`` or
        ``TfidfVectorizer``). The class is instantiated with English stop
        words removed and ``max_features`` as the vocabulary cap.
    classifier_step : tuple of (str, estimator)
        Step name and an unfitted classifier instance.
    max_features : int, default=3000
        Value forwarded to the vectorizer's ``max_features`` argument.

    Returns
    -------
    Pipeline
        The assembled, unfitted two-step pipeline.
    """
    step_name, vectorizer_cls = vectorizer_step
    return Pipeline([
        (step_name, vectorizer_cls(stop_words="english", max_features=max_features)),
        classifier_step,
    ])


# SGD shuffles the training data, so it is seeded to make repeated runs and
# cross-validation scores reproducible (same seed value as the train/test split).
sgd = _text_pipeline(
    ("count_vectorizer", CountVectorizer),
    ("sgd", SGDClassifier(loss="modified_huber", random_state=4)),
)
sgd_tfidf = _text_pipeline(
    ("tfidf_vectorizer", TfidfVectorizer),
    ("sgd", SGDClassifier(loss="modified_huber", random_state=4)),
)

# Linear-kernel SVMs on the same two feature representations.
svc = _text_pipeline(
    ("count_vectorizer", CountVectorizer),
    ("linear svc", SVC(kernel="linear")),
)
svc_tfidf = _text_pipeline(
    ("tfidf_vectorizer", TfidfVectorizer),
    ("linear svc", SVC(kernel="linear")),
)

# (display name, pipeline) pairs consumed by the cross-validation cell.
all_models = [
    ("sgd", sgd),
    ("sgd_tfidf", sgd_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
]

In [9]:
def _cached_tfidf_svc(vocab_size):
    """Return an unfitted TF-IDF + linear-kernel SVC pipeline.

    ``vocab_size`` is passed as the vectorizer's ``max_features``; the
    pipeline receives ``memory='svc_tfidf_<vocab_size>'`` and ``verbose=True``.
    """
    return Pipeline(
        steps=[
            ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=vocab_size)),
            ("linear svc", SVC(kernel="linear")),
        ],
        memory=f"svc_tfidf_{vocab_size}",
        verbose=True,
    )


svc_tfidf_3500 = _cached_tfidf_svc(3500)
svc_tfidf_4500 = _cached_tfidf_svc(4500)

# NOTE(review): this rebinds `all_models`, so the scoring cell below evaluates
# only these two vocabulary-size variants.
all_models = [
    ("svc_tfidf_3500", svc_tfidf_3500),
    ("svc_tfidf_4500", svc_tfidf_4500),
]

In [10]:
# Mean 5-fold cross-validation score on the training split for every candidate pipeline.
unsorted_scores = [
    (label, cross_val_score(estimator, X_train, y_train, cv=5, n_jobs=-1).mean())
    for label, estimator in all_models
]

In [11]:
# Rank the candidates from highest to lowest mean score (negated key => descending).
scores = list(unsorted_scores)
scores.sort(key=lambda entry: -entry[1])
scores

[('svc_tfidf_4500', 0.9889790076081775),
 ('svc_tfidf_3500', 0.9883950926322911)]

##### model

In [89]:
# Use the count-vectorizer + linear SVC pipeline as the model and train it on
# the raw training texts (the pipeline vectorises them itself).
model = svc
model.fit(X=X_train, y=y_train);

In [90]:
# Predicted class ids for the held-out documents.
y_pred = model.predict(X=X_test)

In [91]:
# Per-class precision / recall / F1 table, followed by overall accuracy as the cell output.
print(classification_report(y_true=y_test, y_pred=y_pred))
accuracy_score(y_true=y_test, y_pred=y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       142
           1       1.00      1.00      1.00       166
           2       1.00      0.99      0.99       147
           3       0.99      1.00      0.99       145

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



0.9966666666666667

#### all articles 

0.988 — svc_tfidf, cv=7, max_features=3000

0.989 — svc_tfidf, cv=5, max_features=4000

0.989 — svc_tfidf, cv=5, max_features=4500

#### 1000 per cat

0.9925 svc_tfidf cv5 vec3000

#### 500 per cat

0.996 svc cv5 vec3000

## RandomForest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [34]:
# Load the article corpus for the random-forest experiments.
# NOTE(review): this repeats the load done at the top of the notebook, and the
# traceback stored with this cell mentions 'data/StockMoon_categories', which is
# not the path used here — the saved output looks stale.
data = load_files(
    'categories_articles',
    decode_error="replace",
    encoding="utf-8",
)

FileNotFoundError: [WinError 3] Системе не удается найти указанный путь: 'data/StockMoon_categories'

In [11]:
# 70/30 split of the raw documents with a fixed seed; texts keep their own names
# so they stay distinct from the vectorised feature matrices.
text_train, text_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.3,
    random_state=4,
)

In [12]:
# Number of documents in the training and test parts.
tuple(len(part) for part in (text_train, text_test))

(13701, 5873)

In [13]:
# Bag-of-words features for the random forest: English stop words removed,
# vocabulary capped via max_features=3000.
cv = CountVectorizer(stop_words='english', max_features=3000)
# fit_transform learns the vocabulary and vectorises the training texts in a
# single pass instead of tokenising the corpus twice (fit, then transform).
# NOTE(review): X_train / X_test now hold count matrices, whereas earlier cells
# bound the same names to raw text lists.
X_train = cv.fit_transform(text_train)
# The test texts are only transformed, so they never influence the vocabulary.
X_test = cv.transform(text_test)

In [14]:
# Show only the first training article; the bare list would render every
# training document in the cell output.
text_train[:1]

['Vestas secures Senegal debut \r\n03092018\r\n\r\n1256\r\nWind\r\r\nVestas has won an engineering, procurement and construction contract for Lekela\'s 159MW Taiba N\'Diaye wind farm in Senegal, marking the Danish manufacturer\'s debut deal in the African country.The agreement covers supply, transport, installation and commissioning of 46 V126-3.45MW turbines.\r\n \r\nDelivery is planned for the second and the third quarters of 2019, with commissioning in the third quarter of that year and first quarter of 2020.\r\n \r\nA active output management 5000 service agreement for the operation and maintenance of the wind farm over the next 20 years is also included in the contract.\r\n \r\nThe Danish export credit agency EKF is supporting the project with a EUR140m loan.\r\n \r\nLekela chief operating officer Chris Ford said: "This is a major milestone for Senegal, and for Lekela. As the first utility-scale wind power project in the country, Taiba N\'Diaye forms a critical component of Senega

In [15]:
# First 120 characters of the first ten training documents.
[document[:120] for document in text_train[:10]]

['Vestas secures Senegal debut \r\n03092018\r\n\r\n1256\r\nWind\r\r\nVestas has won an engineering, procurement and construction cont',
 'This Morning in Metals Trump Administration Close to Decision on China Trade Policykropic/Adobe StockThis morning in met',
 'Does Scalia�s Death Help the EPA Clean Power PlanThe final official act of Senior Associate Supreme Court Justice Antoni',
 'French farmers fret over subsidies in post-Brexit EU budget talks4 Min ReadLIZINES, France, Feb 18 (Reuters) - French da',
 "Line Announces a New Blockchain Investment Fund, Lists TRXLine, the Tokyo-based company behind Japan's most popular mess",
 "ICSA calls for �comprehensive research� on TB in deerThe Irish Cattle and Sheep Farmers' Association (ICSA) has accused ",
 'This Morning in Metals Copper Rises on China�s Stimulus AnnouncementCharles/Adobe StockThis morning in metals news, the ',
 'Daily (28.10.2019) NBP spot gas price jumped by over 8% on Friday on forecasts for colder weather \r\n28102019\r

#### params

In [25]:
# Base estimator for the search (seeded, all cores).
forest = RandomForestClassifier(random_state=17, n_jobs=-1)
# Candidate forest sizes: 198, 200 and 202 trees.
forest_params = {'n_estimators': np.arange(198, 203, 2)}

# 5-fold grid search over the candidate sizes.
grid_forest = GridSearchCV(
    estimator=forest,
    param_grid=forest_params,
    cv=5,
    n_jobs=-1,
)

In [26]:
# Run the grid search on the vectorised training data (trailing ';' hides the repr).
grid_forest.fit(X=X_train, y=y_train);

In [27]:
# Winning parameter setting and its mean cross-validation score.
(grid_forest.best_params_, grid_forest.best_score_)

({'n_estimators': 200}, 0.9900006391188683)

### Model

In [16]:
# Final random forest trained on the vectorised training data.
# NOTE(review): the grid-search cell above reports n_estimators=200 as best,
# while 176 is hard-coded here — confirm which value is intended.
best_forest = RandomForestClassifier(
    n_estimators=176,
    random_state=4,
    n_jobs=-1,
)
best_forest.fit(X=X_train, y=y_train);

In [17]:
# Predicted class ids for the test matrix, displayed as a single column.
y_pred = best_forest.predict(X=X_test)
y_pred.reshape(-1, 1)

array([[3],
       [0],
       [0],
       ...,
       [1],
       [0],
       [3]])

In [18]:
# Per-class probability estimates for every test document (one column per class).
y_pred_prob = best_forest.predict_proba(X=X_test)
y_pred_prob

array([[0.        , 0.        , 0.03409091, 0.96590909],
       [1.        , 0.        , 0.        , 0.        ],
       [0.99431818, 0.00568182, 0.        , 0.        ],
       ...,
       [0.00568182, 0.97727273, 0.        , 0.01704545],
       [0.99431818, 0.        , 0.00568182, 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [19]:
# Per-class precision / recall / F1 for the forest, then overall accuracy as the cell output.
print(classification_report(y_true=y_test, y_pred=y_pred))
accuracy_score(y_true=y_test, y_pred=y_pred)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1600
           1       1.00      0.99      1.00       977
           2       0.99      0.99      0.99      1330
           3       0.99      0.99      0.99      1966

    accuracy                           0.99      5873
   macro avg       0.99      0.99      0.99      5873
weighted avg       0.99      0.99      0.99      5873



0.9930189000510812

### Saving

In [20]:
import joblib

In [21]:
# Persist the fitted forest together with the fitted vectoriser it was trained on.
joblib.dump(value=best_forest, filename='economy_classifier/finance/finance_classifier_RF.sav')
joblib.dump(value=cv, filename='economy_classifier/finance/finance_classifier_CV.sav')

['economy_classifier/finance/finance_classifier_CV.sav']

In [23]:
# Round-trip check: reload the saved forest and predict on the same test matrix.
loaded_model = joblib.load(filename='economy_classifier/finance/finance_classifier_RF.sav')
y_pred_loaded = loaded_model.predict(X=X_test)

In [24]:
# Accuracy of the reloaded forest on the test labels.
accuracy_score(y_true=y_test, y_pred=y_pred_loaded)

0.9930189000510812

#### 0.992 RF n176 all in cat 

#### 0.993 RF n176 500 per cat

#### 0 RF n176 all in cat test.9 