In [34]:
import pandas as pd
from sklearn.compose import ColumnTransformer
import numpy as np 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [35]:
df = pd.read_csv('../data/cleaned/final-cleaned.csv')

In [36]:
df.articles = df.articles.astype('U')

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [38]:
y = np.array(df.category_id.values)

In [39]:
X = df[['category', 'articles']]

In [40]:
X

Unnamed: 0,category,articles
0,opinion,congress much protect american served country predatory profit college
1,world,attack civilian site syria grown frequent u n launch rare inquiry document violation investigation limit scope final report make public new york time inves
2,world,week relative calm end wednesday people march large number police deploy gas pepper spray
3,politics,president monitor middle east crisis golf club threaten iran implored reporter honorable
4,arts,linda ronstadt sound voice air cnn new season doctor kick bbc america
...,...,...
2336,business,former bos premium cable network talk former vice medium executive making documentary
2337,politics,lobby president washington hotel supporter remain unwavering business remain brisk
2338,sports,lakers first game bryant daughter gianna die helicopter crash sunday lebron james give heartfelt speech game
2339,sports,famed run coach already bar sport year dope violation temporarily ban u center safesport action produce lifetime ban


In [41]:
cv = CountVectorizer(max_features = 5000)

In [42]:
preprocess = ColumnTransformer(
    remainder='drop',
    transformers=[('cvec', cv, 'articles')],
)

In [43]:
preprocess

ColumnTransformer(transformers=[('cvec', CountVectorizer(max_features=5000),
                                 'articles')])

In [44]:
f = preprocess.fit_transform(X)

In [45]:
type(f)

scipy.sparse._csr.csr_matrix

In [46]:
f.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
X = pd.DataFrame(f.toarray(),columns=preprocess.get_feature_names())

In [48]:
X

Unnamed: 0,cvec__21st,cvec__49ers,cvec__50th,cvec__aaron,cvec__abandon,cvec__abbas,cvec__abbott,cvec__abc,cvec__abdicate,cvec__abdul,...,cvec__zephyr,cvec__zhao,cvec__zindani,cvec__zion,cvec__zionism,cvec__zoey,cvec__zone,cvec__zoning,cvec__zuberi,cvec__zverev
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2339,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
model = RandomForestClassifier(n_estimators=100 ,criterion='entropy' , random_state=0)

In [50]:
from sklearn.pipeline import make_pipeline

In [51]:
pipe = make_pipeline(preprocess, model)

In [52]:
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cvec',
                                                  CountVectorizer(max_features=5000),
                                                  'articles')])),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', random_state=0))])

In [53]:
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cvec',
                                                  CountVectorizer(max_features=5000),
                                                  'articles')])),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', random_state=0))])

In [54]:
X.shape

(2341, 5000)

In [55]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, shuffle = True)
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

1638
703
1638
703


In [56]:
x_train.shape

(1638, 5000)

In [57]:
classifier = RandomForestClassifier(n_estimators=100 ,criterion='entropy' , random_state=0).fit(x_train, y_train)
classifier
y_pred = classifier.predict(x_test)

In [60]:
y_pred

array([2, 0, 0, 4, 1, 5, 4, 0, 0, 1, 5, 0, 5, 3, 0, 2, 1, 1, 0, 0, 0, 2,
       1, 2, 2, 1, 2, 0, 3, 0, 2, 1, 0, 3, 1, 0, 2, 0, 0, 0, 1, 2, 0, 1,
       0, 0, 3, 0, 0, 1, 5, 0, 2, 2, 0, 2, 5, 0, 1, 4, 4, 3, 1, 2, 3, 0,
       2, 3, 3, 2, 0, 0, 4, 3, 1, 0, 1, 2, 0, 0, 4, 2, 0, 3, 0, 2, 0, 0,
       2, 0, 2, 0, 5, 0, 2, 5, 4, 4, 2, 1, 2, 0, 3, 0, 0, 2, 5, 2, 0, 1,
       1, 2, 2, 2, 0, 1, 2, 3, 0, 1, 0, 0, 4, 1, 4, 2, 0, 2, 0, 2, 1, 1,
       1, 0, 5, 1, 2, 4, 0, 0, 5, 0, 0, 0, 1, 1, 0, 0, 0, 1, 2, 0, 0, 1,
       2, 0, 0, 2, 2, 5, 1, 3, 0, 0, 1, 3, 2, 0, 0, 4, 3, 4, 2, 0, 0, 0,
       1, 2, 0, 0, 1, 1, 2, 2, 4, 3, 1, 5, 0, 2, 0, 0, 1, 1, 3, 1, 1, 0,
       1, 2, 4, 1, 5, 3, 2, 1, 2, 4, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 5, 0,
       0, 0, 0, 0, 3, 4, 1, 2, 1, 0, 1, 1, 3, 2, 0, 0, 0, 2, 2, 2, 4, 0,
       0, 2, 5, 0, 0, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 5, 1, 3, 2, 2,
       2, 3, 2, 2, 3, 2, 0, 0, 0, 0, 0, 2, 1, 2, 2, 3, 5, 0, 3, 1, 1, 0,
       1, 2, 1, 4, 0, 0, 3, 0, 5, 2, 2, 4, 0, 5, 2,

In [58]:
text = ["I'm going to invest my company"]

In [59]:
y_pred1 = cv.transform(text)
yy = classifier.predict(y_pred1)
result = ""
if yy == [0]:
    result = "opinion"
elif yy == [1]:
    result = "world"
elif yy == [2]:
    result = "Politics News"
elif yy == [3]:
    result = "arts"
elif yy == [4]:
    result = "business"
elif yy == [5]:
    result = "sports"
print(result)

NotFittedError: Vocabulary not fitted or provided

In [None]:
y_pred1.shape