In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import model_selection, linear_model
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Data Importing

In [2]:
train = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')

train_x = train[:9000]
train_y = train['topic'][:9000]

test_x = train[9000:]
test_y = train['topic'][9000:]

# Base Model

In [3]:
count_vec = CountVectorizer()
count_vec.fit(train_x['article_words'])

train_x_vec = count_vec.transform(train_x['article_words'])
test_x_vec = count_vec.transform(test_x['article_words'])

SVM = linear_model.SGDClassifier(loss="modified_huber", shuffle=False)
SVM.fit(train_x_vec, train_y)
predictions_SVM = SVM.predict(test_x_vec)
print(f'Accuracy for baseline model - {accuracy_score(predictions_SVM, test_y)}')
print(classification_report(predictions_SVM, test_y))

Accuracy for baseline model - 0.74
                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.29      0.50      0.36         4
BIOGRAPHIES PERSONALITIES PEOPLE       0.44      0.57      0.50         7
                         DEFENCE       0.45      0.50      0.48        10
                DOMESTIC MARKETS       0.33      0.33      0.33         6
                   FOREX MARKETS       0.35      0.37      0.36        41
                          HEALTH       0.50      0.71      0.59         7
                      IRRELEVANT       0.82      0.86      0.84       249
                   MONEY MARKETS       0.68      0.53      0.60       101
          SCIENCE AND TECHNOLOGY       1.00      0.67      0.80         3
                  SHARE LISTINGS       0.71      0.71      0.71         7
                          SPORTS       0.97      0.95      0.96        65

                        accuracy                           0.74       500
 

In [4]:
def test_model(vectorizer=TfidfVectorizer(stop_words = 'english'), k='all', sampler=None, model=None):
    
    # Feature Extraction
    vectorizer.fit(train_x['article_words'])
    train_x_vec = vectorizer.transform(train_x['article_words'])
    test_x_vec = vectorizer.transform(test_x['article_words'])
    
    # Feature Selection
    feature_selector = SelectKBest(chi2, k=k)
    feature_selector.fit(train_x_vec, train_y)
    train_x_vec = feature_selector.transform(train_x_vec)
    test_x_vec = feature_selector.transform(test_x_vec)
    print(train_x_vec.shape)
    
    model = linear_model.SGDClassifier(loss="modified_huber", shuffle=False)

    # Sampling
    if sampler:
        train_x_res, train_y_res = sampler.fit_resample(train_x_vec, train_y)
        print(train_x_res.shape)
        model.fit(train_x_res, train_y_res)
        predictions = model.predict(test_x_vec)
        print(f'accuracy_score = {accuracy_score(predictions, test_y)}')

    else:
        model.fit(train_x_vec, train_y)
        predictions = model.predict(test_x_vec)
        print(f'accuracy_score = {accuracy_score(predictions, test_y)}')
    
    print()

# Feature Extraction Tuning

In [5]:
vectorizers = [CountVectorizer(),
               CountVectorizer(ngram_range=(1,3)),
               CountVectorizer(min_df = 2),
               CountVectorizer(stop_words="english", ngram_range=(1,3), min_df = 2),
               TfidfVectorizer(stop_words = 'english')
              ]
for v in vectorizers:
    print(v)
    test_model(v)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
(9000, 34821)
accuracy_score = 0.74

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
(9000, 1012505)
accuracy_score = 0.772

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<

# Feature selection

In [6]:
k_values = [100, 1000, 2500, 5000, 7500, 10000, 30000]

for value in k_values:
    test_model(k=value)

(9000, 100)
accuracy_score = 0.734

(9000, 1000)
accuracy_score = 0.76

(9000, 2500)
accuracy_score = 0.774

(9000, 5000)
accuracy_score = 0.79

(9000, 7500)
accuracy_score = 0.784

(9000, 10000)
accuracy_score = 0.782

(9000, 30000)
accuracy_score = 0.786



# Data sampling

In [7]:
samplers = [RandomOverSampler(), RandomUnderSampler(), SMOTE()]
for s in samplers:
    print(s)
    test_model(sampler=s)

RandomOverSampler(random_state=None, sampling_strategy='auto')
(9000, 34724)
(49203, 34724)
accuracy_score = 0.642

RandomUnderSampler(random_state=None, replacement=False,
                   sampling_strategy='auto')
(9000, 34724)
(748, 34724)
accuracy_score = 0.482

SMOTE(k_neighbors=5, n_jobs=None, random_state=None, sampling_strategy='auto')
(9000, 34724)
(49203, 34724)
accuracy_score = 0.656



# Hyperparameter Tuning

In [8]:
SVM = linear_model.SGDClassifier(loss="modified_huber", shuffle=False)

# Best vectorizer and feature selection
vectorizer=TfidfVectorizer(stop_words = 'english')
k=5000

# Feature Extraction
vectorizer.fit(train_x['article_words'])
train_x_vec = vectorizer.transform(train_x['article_words'])
test_x_vec = vectorizer.transform(test_x['article_words'])

# Feature Selection
feature_selector = SelectKBest(chi2, k=k)
feature_selector.fit(train_x_vec, train_y)
train_x_vec = feature_selector.transform(train_x_vec)
test_x_vec = feature_selector.transform(test_x_vec)

# Hyperparameter tuning
parameters = [{'alpha': [0.0001, 0.01, 1, 100],'class_weight': [None, 'balanced']}]

#gridsearch
grid_search = model_selection.GridSearchCV(estimator = SVM, param_grid = parameters, scoring = 'accuracy', cv = 10)
grid_search = grid_search.fit(train_x_vec, train_y)
model = grid_search.best_estimator_
print(model)


SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=False, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


# Final Model

In [9]:
model.fit(train_x_vec, train_y)
test_predictions_SVM = model.predict(test_x_vec)
print(f'accuracy_score = {accuracy_score(test_predictions_SVM, test_y)}')
print(classification_report(test_predictions_SVM, test_y))

accuracy_score = 0.792
                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.43      0.75      0.55         4
BIOGRAPHIES PERSONALITIES PEOPLE       0.44      0.50      0.47         8
                         DEFENCE       0.64      0.50      0.56        14
                DOMESTIC MARKETS       0.67      0.57      0.62         7
                   FOREX MARKETS       0.58      0.43      0.50        58
                          HEALTH       0.70      0.78      0.74         9
                      IRRELEVANT       0.85      0.94      0.89       237
                   MONEY MARKETS       0.66      0.62      0.64        85
          SCIENCE AND TECHNOLOGY       1.00      0.67      0.80         3
                  SHARE LISTINGS       0.86      0.67      0.75         9
                          SPORTS       0.98      0.95      0.97        66

                        accuracy                           0.79       500
             

# Suggested Articles (for test dataset)

In [10]:
#get suggested articles

final_test_x = vectorizer.transform(test['article_words'])
final_test_x = feature_selector.transform(final_test_x)

print(final_test_x.shape)

final_test_y = test['topic']

final_test_predictions = model.predict(final_test_x)
print(f'accuracy_score = {accuracy_score(final_test_predictions, final_test_y)}')
print(classification_report(final_test_predictions, test_y))
print()

test_probabilities_SVM = model.predict_proba(final_test_x)

for i in range(len(model.classes_)):
    topic = model.classes_[i]
    article_indexes = range(len(test_predictions_SVM))
    classified_articles = list(filter(lambda x: test_predictions_SVM[x] == topic, article_indexes))
    articles_with_probabilities = list(map(lambda x: (test_probabilities_SVM[x][i], test['article_number'].values[x]), classified_articles))
    suggested_articles = sorted(articles_with_probabilities, reverse=True)[:10]
    
    print(f'Total classified articles as {topic} = {len(classified_articles)}')   
    print(f'Suggested articles = {[x[1] for x in suggested_articles]}')
    print()


(500, 5000)
accuracy_score = 0.774
                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.00      0.00      0.00         6
BIOGRAPHIES PERSONALITIES PEOPLE       0.00      0.00      0.00         6
                         DEFENCE       0.00      0.00      0.00        17
                DOMESTIC MARKETS       0.00      0.00      0.00         6
                   FOREX MARKETS       0.07      0.06      0.07        48
                          HEALTH       0.00      0.00      0.00        14
                      IRRELEVANT       0.55      0.57      0.56       252
                   MONEY MARKETS       0.17      0.17      0.17        81
          SCIENCE AND TECHNOLOGY       0.50      0.50      0.50         2
                  SHARE LISTINGS       0.00      0.00      0.00         6
                          SPORTS       0.20      0.21      0.21        62

                        accuracy                           0.35       500
 