In [686]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

1.Text Cleaning and Preparation: cleaning of special characters, downcasing, punctuation signs. possessive pronouns and stop words removal and lemmatization.
2.Label coding: creation of a dictionary to map each category to a code.
3.Train-test split: to test the models on unseen data.
4.Text representation: use of TF-IDF scores to represent text.

In [702]:
# load data
df = pd.read_csv('training.csv')
df.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT


In [701]:
# text cleaning and preparation
df['content'] = df['article_words'].str.replace(',', ' ')
df['content'].head()

0    open absent cent cent cent stock inflow rate k...
1    morn stead end end day day day patch patch pat...
2    socc socc world world recent law fifa fifa fif...
3    open forint forint forint forint cent cent ste...
4    morn complet weekend minut minut minut arrow d...
Name: content, dtype: object

In [689]:
# label coding
topic_codes = {
    'IRRELEVANT': 0,
    'ARTS CULTURE ENTERTAINMENT': 1,
    'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
    'DEFENCE': 3,
    'DOMESTIC MARKETS': 4,
    'FOREX MARKETS': 5,
    'HEALTH': 6,
    'MONEY MARKETS': 7,
    'SCIENCE AND TECHNOLOGY': 8,
    'SHARE LISTINGS': 9,
    'SPORTS': 10 
}

In [703]:
df['topic_code'] = df['topic']
df = df.replace({'topic_code':topic_codes})
df.head()

Unnamed: 0,article_number,article_words,topic,topic_code
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS,5
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS,7
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS,10
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS,5
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT,0


In [691]:
# train and split
X_train, X_test, y_train, y_test = train_test_split(df['content'], 
                                                    df['topic_code'], 
                                                    test_size=0.0526, 
                                                    random_state=8)


In [692]:
# text representation
# Parameter election
ngram_range = (1,1)
min_df = 0.04
max_df = 0.3
max_features = 210

In [693]:
# TF_IDF
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        sublinear_tf=True)

tf_fit = tfidf.fit_transform(X_train)
features_train = tf_fit.toarray()
labels_train = y_train
# print(tfidf.get_feature_names())
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(9000, 210)
(500, 210)


Cross-Validation for Hyperparameter tuning

In [704]:
# C
C = [0.01, 0.1, 1, 10, 100]

# gamma
gamma = [0.001, 0.01, 0.1, 1, 10, 100]

# degree
degree = [1, 2, 3, 4, 5]

# kernel
kernel = ['linear', 'rbf', 'poly']

# probability
probability = [True]

# Create the random grid
random_grid = {'C': C,
              'kernel': kernel,
              'gamma': gamma,
              'degree': degree,
              'probability': probability
             }

pprint(random_grid)

{'C': [1, 10, 100],
 'degree': [1, 2, 3, 4, 5],
 'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
 'kernel': ['linear', 'rbf', 'poly'],
 'probability': [True]}


In [705]:
# Randomized Search Cross Validation
# First create the base model to tune
svc = svm.SVC(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(features_train, labels_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 151.3min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=8,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=50, n_jobs=None,
                   param_distributions={'C': [1, 10, 100],
                                        'degree': [1, 2, 3, 4, 5],
                                        'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                                        'kernel': ['linear', 'rbf', 'poly'],
                                        'probability': [True]},
                   pre_dispatch='2*n_jobs', random_state=8, refit=True,
                   return_train_score=False, sc

In [706]:
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)


The best hyperparameters from Random Search are:
{'probability': True, 'kernel': 'rbf', 'gamma': 1, 'degree': 5, 'C': 1}

The mean accuracy of a model with these hyperparameters is:
0.7403333333333334


In [697]:
# Create the parameter grid based on the results of random search 
C = [0.1, 1, 10, 100]
gamma = [1, 10, 100]

param_grid = [
  {'C': C, 'kernel':['rbf'], 'gamma':gamma}
]

# Create a base model
svc = svm.SVC(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 14.8min finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.1, 1, 10, 100], 'gamma': [1, 10, 100],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [698]:
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Grid Search are:
{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

The mean accuracy of a model with these hyperparameters is:
0.734006734006734


In [707]:
best_svc = svm.SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma=1, kernel='rbf', max_iter=-1,
                                 probability=False, random_state=8,
                                 shrinking=True, tol=0.001, verbose=False)

best_svc.fit(features_train, labels_train)

svc_pred = best_svc.predict(features_test)

# # pprint(best_svc.get_params())

# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_svc.predict(features_train)))


# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, svc_pred))

The training accuracy is: 
0.8597777777777778
The test accuracy is: 
0.748


In [None]:
base_model = svm.SVC(random_state = 8)
base_model.fit(features_train, labels_train)

print('base model test accuracy score is: ',accuracy_score(labels_test, base_model.predict(features_test)))
print('base model train accuracy score is: ',accuracy_score(labels_train, base_model.predict(features_train)))
best_svc.fit(features_train, labels_train)
print('best svm model accuracy score is: ', accuracy_score(labels_test, best_svc.predict(features_test)))

base model test accuracy score is:  0.746
