In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [6]:
%xmode Plain
%load_ext memory_profiler

Exception reporting mode: Plain


In [81]:
data = load_files('data/StockMoon_categories', encoding="utf-8", decode_error="replace")

In [48]:
labels, counts = np.unique(data.target, return_counts=True)

In [50]:
labels, counts

(array([0, 1, 2, 3]), array([5212, 3385, 4492, 6485], dtype=int64))

In [12]:
labels_sort = np.array(data.target_names)[labels]
dict(zip(labels_sort, counts))

{'agricult': 5212, 'crypto': 3385, 'energy': 4492, 'metals': 6485}

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3)
list(t[:80] for t in X_train[:10])

['Daily (22.05.2018) WTI futures settled 1.4% to new peak on Monday following geop',
 'Luxcara expands in Scandinavia with 750-MW Swedish wind buy \r\n30042019\r\n\r\n1034\r\n',
 '20 Fossil Fuel Companies Generate One Third Of All Global Emissions \r\n10102019\r\n',
 'Ripple rumor Does Binance soon use XRP and ODL technologyIn recent days, there h',
 'U.S. Attorney General Barr considers quitting over Trump tweets source(Reuters) ',
 'SteelBrazil, where the slabs come fromIT DID not exactly get a welcome in the hi',
 'Ford Motor Co. Puts on Aluminum-Bodied F-150 Show in MichiganFord is visiting 26',
 "Minister to challenge processors as NI farm income freefallsNorthern Ireland's M",
 'Blockchain.com Eying to Raise $50 Million in Venture FundBlockchain.com, a leadi',
 'Stocks scale fresh peaks on slowing virus, dollar gainsNEW YORK (Reuters) - The ']

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score

In [86]:
sgd = Pipeline([
        ("count vectorizer", CountVectorizer(stop_words="english", max_features=3000)),
        ("sgd", SGDClassifier(loss="modified_huber"))
    ])
sgd_tfidf = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=3000)),
        ("sgd", SGDClassifier(loss="modified_huber"))
    ])
 
svc = Pipeline([
        ("count_vectorizer", CountVectorizer(stop_words="english", max_features=3000)),
        ("linear svc", SVC(kernel="linear"))
    ])
svc_tfidf = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=3000)),
        ("linear svc", SVC(kernel="linear"))
    ])
   
all_models = [
    ("sgd", sgd),
    ("sgd_tfidf", sgd_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
    ]

In [65]:
svc_tfidf_3500 = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=3500)),
        ("linear svc", SVC(kernel="linear"))
    ])

svc_tfidf_4500 = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=4500)),
        ("linear svc", SVC(kernel="linear"))
    ])

all_models = [
    ("svc_tfidf_3500", svc_tfidf_3500),
    ("svc_tfidf_4500", svc_tfidf_4500)
    ]

In [87]:
unsorted_scores = [(name, cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1).mean()) for name, model in all_models]

In [88]:
scores = sorted(unsorted_scores, key=lambda x: -x[1])
scores

[('svc', 0.99),
 ('sgd_tfidf', 0.9857142857142858),
 ('svc_tfidf', 0.9857142857142858),
 ('sgd', 0.9757142857142858)]

##### model

In [89]:
model = svc
model.fit(X_train, y_train);

In [90]:
y_pred = model.predict(X_test)

In [91]:
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       142
           1       1.00      1.00      1.00       166
           2       1.00      0.99      0.99       147
           3       0.99      1.00      0.99       145

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



0.9966666666666667

#### all articles 

0.988 svc_tfidf cv7 vec3000

0.989 svc_tfidf cv5 vec4000

0.989 svc_tfidf cv5 vec4500

#### 1000 per cat

0.9925 svc_tfidf cv5 vec3000

#### 500 per cat

0.996 svc cv5 vec3000

## RandomForest

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [4]:
data = load_files('data/StockMoon_categories', encoding="utf-8", decode_error="replace")

In [5]:
text_train, text_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3)

In [45]:
len(text_train), len(text_test)

(13701, 5873)

In [46]:
cv = CountVectorizer()
cv.fit(text_train)
joblib.dump(cv, r'../StockMoon/economy_classifier_CV.sav')
X_train = cv.transform(text_train)
X_test = cv.transform(text_test)

In [13]:
text_train

13701

In [8]:
list(t[:120] for t in text_train[:10])

['Rusal Aluminum Prices Will Recover, Still Cutting OutputAluminum producer UC Rusal PLC said Monday it expects prices to ',
 'Nickel Prices Fall as Last Month�s Gains Become a Dead Cat BounceThe monthly Stainless MMI(r) registered a value of 73 i',
 '65% Initial Countervailing Duty Tariffs on Welded Carbon Steel Pipe From PakistanYesterday, Commerce Department placed i',
 'Oil rose on Tuesday, supported by expectations of an extension next week to OPEC output cuts, but prices remained under ',
 "NI sheep farmers would be �out of business� after no-dealIvor Ferguson, president of the Ulster Farmers' Union (UFU), ha",
 'Arla bosses �alert and prepared� for �Brexit damage�Arla bosses have warned they remain cautious over the "potential adv',
 'Fane Valley acquires Silver Hill FoodsFane Valley Group has acquired Silver Hill Foods, the Monaghan-based Irish duck pr',
 "Nasdaq plans to launch Bitcoin futuresAdena Friedman, CEO of Nasdaq, the world's second largest exchange, said in an int",


#### params

In [25]:
forest = RandomForestClassifier(n_jobs=-1, random_state=17)
forest_params = {'n_estimators': np.arange(198, 203, 2)}

grid_forest = GridSearchCV(forest, forest_params, cv=5, n_jobs=-1)

In [26]:
grid_forest.fit(X_train, y_train);

In [27]:
grid_forest.best_params_, grid_forest.best_score_

({'n_estimators': 200}, 0.9900006391188683)

### Model

In [28]:
best_forest = RandomForestClassifier(n_estimators=176, n_jobs=-1, random_state=17)
best_forest.fit(X_train, y_train);

In [29]:
y_pred = best_forest.predict(X_test)
y_pred[:, np.newaxis]

array([[0],
       [1],
       [3],
       ...,
       [0],
       [0],
       [1]])

In [30]:
y_pred_prob = best_forest.predict_proba(X_test)
y_pred_prob

array([[0.93181818, 0.02272727, 0.01704545, 0.02840909],
       [0.09659091, 0.42045455, 0.35227273, 0.13068182],
       [0.34659091, 0.07386364, 0.09659091, 0.48295455],
       ...,
       [0.47159091, 0.03409091, 0.31818182, 0.17613636],
       [0.96590909, 0.        , 0.02840909, 0.00568182],
       [0.00568182, 0.96590909, 0.01136364, 0.01704545]])

In [31]:
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1583
           1       1.00      0.99      1.00      1002
           2       0.98      0.99      0.99      1335
           3       0.99      0.99      0.99      1953

    accuracy                           0.99      5873
   macro avg       0.99      0.99      0.99      5873
weighted avg       0.99      0.99      0.99      5873



0.9906351098246211

### Saving

In [33]:
import joblib

In [35]:
joblib.dump(best_forest, 'RF_.99_ecclassifier.sav')

['RF_.99_ecclassifier.sav']

In [42]:
loaded_model = joblib.load('data/StockMoon_categories/economy_classifier_RF.sav')
y_pred_loaded = loaded_model.predict(X_test)

In [43]:
accuracy_score(y_test, y_pred_loaded)

0.9906351098246211

#### 0.992 RF n176 all in cat 

#### 0.993 RF n176 500 per cat

#### 0 RF n176 all in cat test.9 