In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer	#nbc means naive bayes classifier
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem
import pandas as pd

In [2]:
# extract data
df_train = pd.read_csv('training.csv')
train_set = pd.DataFrame(columns=['words', 'topic'])
train_set['words'] = df_train['article_words'].str.replace(',',' ')
train_set['topic'] = df_train['topic']

# split the data to train set and test set
X_train, X_test, y_train, y_test = train_test_split(train_set['words'], train_set['topic'], test_size = 0.0526, random_state = 1)

# encoder the y
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)


In [3]:
#WordLevel tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_set['words'])
xtrain_tfidf = tfidf_vect.transform(X_train)
xtest_tfidf = tfidf_vect.transform(X_test)

# NgramLevel tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train_set['words'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(X_test)

# CharLevel tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train_set['words'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_train)
xtest_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_test)


In [4]:
#  solver='lbfgs'

# Linear classifier characterized by word-level TF-IDF vector
lg1 = linear_model.LogisticRegressionCV(Cs=[1, 10, 100, 1000], class_weight=None, cv=5,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=2000, multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0).fit(xtrain_tfidf, y_train)
predictions = lg1.predict(xtest_tfidf)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear classifier characterized by multiple word-level TF-IDF vectors
lg2 = linear_model.LogisticRegressionCV(Cs=[1, 10, 100, 1000], class_weight=None, cv=5,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=2000, multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0).fit(xtrain_tfidf_ngram, y_train)
predictions = lg2.predict(xtest_tfidf_ngram)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, N-Gram Vectors TF-IDF: ", accuracy)


# Linear classifier characterized by part-of-speech TF-IDF vector
lg3 = linear_model.LogisticRegressionCV(Cs=[1, 10, 100, 1000], class_weight=None, cv=5,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=2000, multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='lbfgs', tol=0.0001, verbose=0).fit(xtrain_tfidf_ngram_chars, y_train)
predictions = lg3.predict(xtest_tfidf_ngram_chars)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, CharLevel Vectors TF-IDF: ", accuracy)


LR, WordLevel TF-IDF:  0.782
LR, N-Gram Vectors TF-IDF:  0.752
LR, CharLevel Vectors TF-IDF:  0.762


In [5]:
#solver='newton-cg'

# Linear classifier characterized by word-level TF-IDF vector
lg_1 = linear_model.LogisticRegressionCV(Cs=[1, 10, 100, 1000], class_weight=None, cv=5,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=2000, multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='newton-cg', tol=0.0001, verbose=0).fit(xtrain_tfidf, y_train)
predictions = lg1.predict(xtest_tfidf)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear classifier characterized by multiple word-level TF-IDF vectors
lg_2 = linear_model.LogisticRegressionCV(Cs=[1, 10, 100, 1000], class_weight=None, cv=5,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=2000, multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='newton-cg', tol=0.0001, verbose=0).fit(xtrain_tfidf_ngram, y_train)
predictions = lg2.predict(xtest_tfidf_ngram)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, N-Gram Vectors TF-IDF: ", accuracy)


# Linear classifier characterized by part-of-speech TF-IDF vector
lg_3 = linear_model.LogisticRegressionCV(Cs=[1, 10, 100, 1000], class_weight=None, cv=5,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=2000, multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='neg_log_loss',
           solver='newton-cg', tol=0.0001, verbose=0).fit(xtrain_tfidf_ngram_chars, y_train)
predictions = lg3.predict(xtest_tfidf_ngram_chars)
accuracy = metrics.accuracy_score(predictions, y_test)
print("LR, CharLevel Vectors TF-IDF: ", accuracy)


LR, WordLevel TF-IDF:  0.782
LR, N-Gram Vectors TF-IDF:  0.752
LR, CharLevel Vectors TF-IDF:  0.762


In [6]:
# choose solver='lbfgs',  N-Gram Vectors TF-IDF
predicted_y = lg2.predict(xtest_tfidf_ngram)
target = ['ARTS CULTURE ENTERTAINMENT',
    'BIOGRAPHIES PERSONALITIES PEOPLE',
    'DEFENCE',
    'DOMESTIC MARKETS',
    'FOREX MARKETS',
    'HEALTH',
    'RRELEVANT',
    'MONEY MARKETS',
    'SCIENCE AND TECHNOLOGY',
    'SHARE LISTINGS',
    'SPORTS'
    ]

print(classification_report(y_test, predicted_y,target_names=target))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.75      0.60      0.67         5
BIOGRAPHIES PERSONALITIES PEOPLE       0.67      0.57      0.62         7
                         DEFENCE       0.82      0.82      0.82        11
                DOMESTIC MARKETS       0.33      0.17      0.22         6
                   FOREX MARKETS       0.32      0.34      0.33        41
                          HEALTH       0.50      0.44      0.47         9
                       RRELEVANT       0.84      0.89      0.86       249
                   MONEY MARKETS       0.60      0.58      0.59        99
          SCIENCE AND TECHNOLOGY       0.75      1.00      0.86         3
                  SHARE LISTINGS       1.00      0.50      0.67        12
                          SPORTS       0.96      0.93      0.95        58

                       micro avg       0.75      0.75      0.75       500
                       macro avg    

In [7]:
df_test = pd.read_csv('test.csv')
test_set = pd.DataFrame(columns=['words', 'topic'])
x = df_test['article_words'].str.replace(',',' ')
y = df_test['topic']

tfidf_vect_ngram = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(x)
x_ngram = tfidf_vect_ngram.transform(x)

y = encoder.fit_transform(y)


predicted_y = lg2.predict(x_ngram)
print(classification_report(y, predicted_y,target_names=target))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.00      0.00      0.00         3
BIOGRAPHIES PERSONALITIES PEOPLE       0.00      0.00      0.00        15
                         DEFENCE       0.00      0.00      0.00        13
                DOMESTIC MARKETS       0.00      0.00      0.00         2
                   FOREX MARKETS       0.00      0.00      0.00        48
                          HEALTH       0.00      0.00      0.00        14
                       RRELEVANT       0.55      0.99      0.70       266
                   MONEY MARKETS       0.00      0.00      0.00        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.00      0.00      0.00         7
                          SPORTS       0.64      0.12      0.20        60

                       micro avg       0.54      0.54      0.54       500
                       macro avg    

  'precision', 'predicted', average, warn_for)
