In [3]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn import decomposition, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer	#nbc means naive bayes classifier
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem

import pandas as pd

In [4]:
# extract data
df_train = pd.read_csv('training.csv')
train_set = pd.DataFrame(columns=['words', 'topic'])
train_set['words'] = df_train['article_words'].str.replace(',',' ')
train_set['topic'] = df_train['topic']

# split the data to train set and test set
X_train, X_test, y_train, y_test = train_test_split(train_set['words'], train_set['topic'], test_size = 0.0526, random_state = 1)


# encoder the y
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [3]:
#tf-idf

#WordLevel tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_set['words'])
xtrain_tfidf = tfidf_vect.transform(X_train)
xtest_tfidf = tfidf_vect.transform(X_test)

# ngram tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train_set['words'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(X_test)

#charlevel tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train_set['words'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_train)
xtest_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_test)


In [4]:
cv = KFold(10, shuffle=True, random_state=0)
cv.get_n_splits(xtrain_tfidf)
# cv.get_n_splits(X_train)
scores = cross_val_score(MultinomialNB(), xtrain_tfidf, y_train, cv=cv)
print(scores)
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001),
}
grid_search = GridSearchCV(MultinomialNB(), parameters, n_jobs=1, cv = cv)
grid_search.fit(xtrain_tfidf, y_train)
print(grid_search.best_params_)
print ("Best score: %0.3f" % grid_search.best_score_)

m1 = MultinomialNB(alpha = 0.01).fit(xtrain_tfidf, y_train)
predictions = m1.predict(xtest_tfidf)
print("MNB, WordLevel TF-IDF: ",metrics.accuracy_score(predictions, y_test))

[0.71555556 0.73666667 0.73333333 0.76333333 0.72222222 0.74
 0.73222222 0.74777778 0.73333333 0.76555556]
{'alpha': 0.01}
Best score: 0.758
NB, WordLevel TF-IDF:  0.76


In [5]:
cv = KFold(10, shuffle=True, random_state=0)
cv.get_n_splits(xtrain_tfidf_ngram)
# cv.get_n_splits(X_train)
scores = cross_val_score(MultinomialNB(), xtrain_tfidf_ngram, y_train, cv=cv)
print(scores)
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001),
}
grid_search = GridSearchCV(MultinomialNB(), parameters, n_jobs=1, cv = cv)
grid_search.fit(xtrain_tfidf_ngram, y_train)
print(grid_search.best_params_)
print ("Best score: %0.3f" % grid_search.best_score_)

m2 = MultinomialNB(alpha = 0.01).fit(xtrain_tfidf_ngram, y_train)
predictions = m2.predict(xtest_tfidf_ngram)
print( "MNB, N-Gram Vectors: ",metrics.accuracy_score(predictions, y_test))


[0.73333333 0.73333333 0.73222222 0.75555556 0.73       0.74222222
 0.72333333 0.74777778 0.71888889 0.75444444]
{'alpha': 0.1}
Best score: 0.750
NB, N-Gram Vectors:  0.764


In [6]:
cv = KFold(10, shuffle=True, random_state=0)
cv.get_n_splits(xtrain_tfidf_ngram_chars)
# cv.get_n_splits(X_train)
scores = cross_val_score(MultinomialNB(), xtrain_tfidf_ngram_chars, y_train, cv=cv)
print(scores)
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001),
}
grid_search = GridSearchCV(MultinomialNB(), parameters, n_jobs=1, cv = cv)
grid_search.fit(xtrain_tfidf_ngram_chars, y_train)
print(grid_search.best_params_)
print ("Best score: %0.3f" % grid_search.best_score_)

m3 = MultinomialNB(alpha = 0.01).fit(xtrain_tfidf_ngram_chars, y_train)
predictions = m3.predict(xtest_tfidf_ngram_chars)
print("MNB, CharLevel Vectors: ",metrics.accuracy_score(predictions, y_test))

[0.69444444 0.70888889 0.70444444 0.73       0.69777778 0.71444444
 0.70555556 0.71       0.69888889 0.73111111]
{'alpha': 0.01}
Best score: 0.744
NB, CharLevel Vectors:  0.756


In [5]:
#Create a vector counter object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_set['words'])

#Use vector counter object to convert training set and validation set
xtrain_count = count_vect.transform(X_train)
xvalid_count = count_vect.transform(X_test)
print(len(count_vect.get_feature_names()))
print(len(count_vect.vocabulary_))

35823
35823


In [7]:
cv = KFold(10, shuffle=True, random_state=0)
cv.get_n_splits(xtrain_count)
scores = cross_val_score(MultinomialNB(), xtrain_count, y_train, cv=cv)
print(scores)
parameters = {
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001),
}
grid_search = GridSearchCV(MultinomialNB(), parameters, n_jobs=1, cv = cv)
grid_search.fit(xtrain_count, y_train)
print(grid_search.best_params_)
print ("Best score: %0.3f" % grid_search.best_score_)

m_count = MultinomialNB(alpha = 1).fit(xtrain_count, y_train)
predictions = m1.predict(xvalid_count)
print("NB, Count Vectors: ",metrics.accuracy_score(predictions, y_test))

[0.71777778 0.75888889 0.73555556 0.77111111 0.73222222 0.72222222
 0.71888889 0.73111111 0.72666667 0.74666667]
{'alpha': 1}
Best score: 0.736
NB, Count Vectors:  0.738


In [8]:
predicted_y = m_count.predict(xvalid_count)
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='micro'))
print(recall_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='macro'))
target = ['ARTS CULTURE ENTERTAINMENT',
    'BIOGRAPHIES PERSONALITIES PEOPLE',
    'DEFENCE',
    'DOMESTIC MARKETS',
    'FOREX MARKETS',
    'HEALTH',
    'RRELEVANT',
    'MONEY MARKETS',
    'SCIENCE AND TECHNOLOGY',
    'SHARE LISTINGS',
    'SPORTS'
    ]

print(classification_report(y_test, predicted_y, target_names=target))

0.738
0.738
0.738
0.738
0.6583666424276271
                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.75      0.60      0.67         5
BIOGRAPHIES PERSONALITIES PEOPLE       0.62      0.71      0.67         7
                         DEFENCE       0.65      1.00      0.79        11
                DOMESTIC MARKETS       0.67      0.33      0.44         6
                   FOREX MARKETS       0.30      0.32      0.31        41
                          HEALTH       0.58      0.78      0.67         9
                       RRELEVANT       0.91      0.78      0.84       249
                   MONEY MARKETS       0.53      0.70      0.61        99
          SCIENCE AND TECHNOLOGY       0.60      1.00      0.75         3
                  SHARE LISTINGS       0.71      0.42      0.53        12
                          SPORTS       0.98      0.98      0.98        58

                        accuracy                           0.74    

In [9]:
df_test = pd.read_csv('test.csv')
test_set = pd.DataFrame(columns=['words', 'topic'])
x = df_test['article_words'].str.replace(',',' ')
y = df_test['topic']


count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_set['words'])
x_count = count_vect.transform(x)

y = encoder.fit_transform(y)


predicted_y = m1.predict(x_count)
print(set(y) - set(predicted_y))
# print(y_test, predicted_y)
# print(m2.predict_proba(xtest_tfidf_ngram))
print(accuracy_score(y, predicted_y))
# print(precision_score(y, predicted_y, average='micro'))
# print(recall_score(y, predicted_y, average='micro'))
# print(f1_score(y, predicted_y, average='micro'))
# print(f1_score(y, predicted_y, average='macro'))
print(classification_report(y, predicted_y,target_names=target))

{8, 3}
0.724
                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.50      0.67      0.57         3
BIOGRAPHIES PERSONALITIES PEOPLE       0.78      0.47      0.58        15
                         DEFENCE       0.57      0.62      0.59        13
                DOMESTIC MARKETS       0.00      0.00      0.00         2
                   FOREX MARKETS       0.39      0.27      0.32        48
                          HEALTH       0.83      0.71      0.77        14
                       RRELEVANT       0.88      0.77      0.82       266
                   MONEY MARKETS       0.43      0.80      0.56        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       1.00      0.14      0.25         7
                          SPORTS       0.95      1.00      0.98        60

                        accuracy                           0.72       500
                       

  _warn_prf(average, modifier, msg_start, len(result))
