In [201]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn import svm, naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [202]:
def data_precessing(df):
    # text cleaning and preparation
    df['content'] = df['article_words'].str.replace(',', ' ')
    # label coding
    topic_codes = {
        'IRRELEVANT': 0,
        'ARTS CULTURE ENTERTAINMENT': 1,
        'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
        'DEFENCE': 3,
        'DOMESTIC MARKETS': 4,
        'FOREX MARKETS': 5,
        'HEALTH': 6,
        'MONEY MARKETS': 7,
        'SCIENCE AND TECHNOLOGY': 8,
        'SHARE LISTINGS': 9,
        'SPORTS': 10 
    }
    df['topic_code'] = df['topic']
    df = df.replace({'topic_code':topic_codes})
    return df

In [203]:
def data_augment(df):
    for i in range(len(df)):
        a=df.loc[i]
        d=pd.DataFrame(a).T
        if df.at[i,'topic'] != "IRRELEVANT":
            df=df.append([d]*2)  
    return df.reset_index(drop=True)

In [204]:
df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('training.csv')

# with data augment
train_content_with_dataAugment = data_precessing(data_augment(df_train))['content']
train_with_dataAugment = data_precessing(data_augment(df_train))['topic_code']

X_train_content_with_dataAugment, X_test_content_with_dataAugment, y_train_with_dataAugment, y_test_with_dataAugment = train_test_split(train_content_with_dataAugment, 
                                                    train_with_dataAugment, 
                                                    test_size=0.0526, 
                                                    random_state=8)
# without data augment
train_content_without_dataAugment = data_precessing(df_train)['content']
train_without_dataAugment = data_precessing(df_train)['topic_code']
X_train_content_without_dataAugment, X_test_content_without_dataAugment, y_train_without_dataAugment, y_test_without_dataAugment = train_test_split(train_content_without_dataAugment, 
                                                    train_without_dataAugment, 
                                                    test_size=0.0526, 
                                                    random_state=8)
# test data
X_test_content = data_precessing(df_test)['content']
y_test = data_precessing(df_test)['topic_code']


In [205]:
def text_representation(content, sign):
    # text representation
    # Parameter election
    ngram_range = (1,1)
    min_df = 0.04
    max_df = 0.3
    max_features = 400
    # TF_IDF
    tfidf = TfidfVectorizer(encoding='utf-8',
                            ngram_range=ngram_range,
                            stop_words=None,
                            lowercase=False,
                            max_df=max_df,
                            min_df=min_df,
                            max_features=max_features,
                            sublinear_tf=True)
    
    if sign == 0:
        tf_fit = tfidf.fit_transform(content)
    else:
        tf_fit = tfidf.transform(content)
    features = tf_fit.toarray()
    return features
# X_train = text_representation(X_train_content,0)
# X_test = text_representation(X_test_content,1)

In [211]:
count_vectorizer_with = CountVectorizer()
count_vectorizer_without = CountVectorizer()
# with data augment
X_train_with_dataAugment = count_vectorizer_with.fit_transform(X_train_content_with_dataAugment)
X_test_with_dataAugment = count_vectorizer_with.transform(X_test_content_with_dataAugment)

# without data augment
X_train_without_dataAugment = count_vectorizer_without.fit_transform(X_train_content_without_dataAugment)
X_test_without_dataAugment = count_vectorizer_without.transform(X_test_content_without_dataAugment)

# test data
X_test_with = count_vectorizer_with.transform(X_test_content)
X_test_without = count_vectorizer_without.transform(X_test_content)
print(X_test_with.shape)
print(X_test_without.shape)

(500, 35261)
(500, 34839)


In [218]:
# model
# best_svc = svm.SVC(C=0.1, gamma=1, kernel='rbf', max_iter=-1, probability=True, random_state=8)
with_dataAugment_svm = svm.SVC()
without_dataAugment_svm = svm.SVC()
# nb = naive_bayes.MultinomialNB()

# best_svc.fit(X_train, y_train)
with_dataAugment_svm.fit(X_train_with_dataAugment,y_train_with_dataAugment)
without_dataAugment_svm.fit(X_train_without_dataAugment,y_train_without_dataAugment)

# nb.fit(X_train, y_train)
# train_pred = nb.predict(X_train)
# test_pred = nb.predict(X_test)


# with data augment
train_pred_with_dataAugment = with_dataAugment_svm.predict(X_train_with_dataAugment)
test_pred_with_dataAugment = with_dataAugment_svm.predict(X_test_with_dataAugment)

# without data augment
train_pred_without_dataAugment = without_dataAugment_svm.predict(X_train_without_dataAugment)
test_pred_without_dataAugment = without_dataAugment_svm.predict(X_test_without_dataAugment)


# With data augment accuracy
print("The training accuracy with data augment is: ", accuracy_score(y_train_with_dataAugment,train_pred_with_dataAugment))
print("The test accuracy with data augment is: ", accuracy_score(y_test_with_dataAugment,test_pred_with_dataAugment))


# Without data augment accuracy
print("The training accuracy without data augment is: ", accuracy_score(y_train_without_dataAugment,train_pred_without_dataAugment))
print("The test accuracy without data augment is: ", accuracy_score(y_test_without_dataAugment,test_pred_without_dataAugment))


The training accuracy with data augment is:  0.941985579589573
The test accuracy with data augment is:  0.9001996007984032
The training accuracy without data augment is:  0.897
The test accuracy without data augment is:  0.748


In [214]:
features_name = ['IRRELEVANT',
    'ARTS CULTURE ENTERTAINMENT',
    'BIOGRAPHIES PERSONALITIES PEOPLE',
    'DEFENCE',
    'DOMESTIC MARKETS',
    'FOREX MARKETS',
    'HEALTH',
    'MONEY MARKETS',
    'SCIENCE AND TECHNOLOGY',
    'SHARE LISTINGS',
    'SPORTS'
]
# with data augment
print("train_report_with_dataAugment: ")
train_report_with_dataAugment = classification_report(y_train_with_dataAugment, train_pred_with_dataAugment,target_names=features_name)
print(train_report_with_dataAugment)

print("test_report_with_dataAugment: ")
test_report_with_dataAugment = classification_report(y_test_with_dataAugment, test_pred_with_dataAugment,target_names=features_name)
print(test_report_with_dataAugment)



train_report_with_dataAugment: 
                                  precision    recall  f1-score   support

                      IRRELEVANT       0.98      0.93      0.96      4466
      ARTS CULTURE ENTERTAINMENT       0.99      0.98      0.98       331
BIOGRAPHIES PERSONALITIES PEOPLE       0.99      0.99      0.99       470
                         DEFENCE       0.98      0.99      0.99       736
                DOMESTIC MARKETS       0.98      0.99      0.98       378
                   FOREX MARKETS       0.89      0.80      0.85      2409
                          HEALTH       0.99      0.99      0.99       516
                   MONEY MARKETS       0.88      0.95      0.91      4761
          SCIENCE AND TECHNOLOGY       1.00      1.00      1.00       203
                  SHARE LISTINGS       0.96      0.98      0.97       630
                          SPORTS       0.98      1.00      0.99      3130

                        accuracy                           0.94     18030
    

In [215]:
# without data augment
print("train_report_without_dataAugment: ")
train_report_without_dataAugment = classification_report(y_train_without_dataAugment, train_pred_without_dataAugment,target_names=features_name)
print(train_report_without_dataAugment)

print("test_report_without_dataAugment: ")
test_report_with_dataAugment = classification_report(y_test_without_dataAugment, test_pred_without_dataAugment,target_names=features_name)
print(test_report_with_dataAugment)


train_report_without_dataAugment: 
                                  precision    recall  f1-score   support

                      IRRELEVANT       0.92      0.95      0.94      4471
      ARTS CULTURE ENTERTAINMENT       0.97      0.63      0.77       111
BIOGRAPHIES PERSONALITIES PEOPLE       0.99      0.74      0.85       159
                         DEFENCE       0.98      0.88      0.93       242
                DOMESTIC MARKETS       0.94      0.71      0.81       129
                   FOREX MARKETS       0.86      0.61      0.71       799
                          HEALTH       1.00      0.80      0.89       168
                   MONEY MARKETS       0.78      0.92      0.84      1596
          SCIENCE AND TECHNOLOGY       1.00      0.72      0.84        68
                  SHARE LISTINGS       0.97      0.71      0.82       210
                          SPORTS       0.95      1.00      0.97      1047

                        accuracy                           0.90      9000
 

In [219]:
# test
test_pred_with = with_dataAugment_svm.predict(X_test_with)
print("The test accuracy with data augment is: ", accuracy_score(y_test,test_pred_with))
print("test_report_with data augment: ")
test_report_with = classification_report(y_test, test_pred_with, target_names=features_name)
print(test_report_with)

test_pred_without = without_dataAugment_svm.predict(X_test_without)
print("The test accuracy without is: ", accuracy_score(y_test,test_pred_without))
print("test_report_without: ")
test_report_without = classification_report(y_test, test_pred_without, target_names=features_name)
print(test_report_without)


The test accuracy with data augment is:  0.762
test_report_with data augment: 
                                  precision    recall  f1-score   support

                      IRRELEVANT       0.84      0.88      0.86       266
      ARTS CULTURE ENTERTAINMENT       0.50      0.33      0.40         3
BIOGRAPHIES PERSONALITIES PEOPLE       1.00      0.33      0.50        15
                         DEFENCE       1.00      0.38      0.56        13
                DOMESTIC MARKETS       0.33      0.50      0.40         2
                   FOREX MARKETS       0.50      0.35      0.41        48
                          HEALTH       0.67      0.43      0.52        14
                   MONEY MARKETS       0.53      0.72      0.61        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.71      0.71      0.71         7
                          SPORTS       0.95      0.93      0.94        60

                        accurac

  _warn_prf(average, modifier, msg_start, len(result))


In [217]:
# use data with data augment to predict
print(test_pred_with)


[ 0  0  7  0  0  5  0 10  7  0  0  0 10 10  0  7  0  9  0 10  0  0  0  0
  7  2  0  0  5  5  7  0  0  7  0 10  0  0  7  0 10  7  0  0  0  0  7  5
  0  7  5  0  7  7  7  0  0  0  3  7  0  9  0  0  5  0  0 10 10  7  7  5
 10 10  0  3  5  0  0 10  0 10  7  5  0  7  7  5  0  0  0  0  0  7  0 10
  0  0  7  7  9  7  0  0  0  0  0 10  0 10  0  0  0  0  0  0  8  7  0 10
  6  0  0  0  7  0  7  0  0 10  0  5  0  7  0  0  0  7  0  4  7  0  0  0
  0  0  7  0  0  7  0  0 10  0  0 10 10  0  7  0  0 10 10  0  7  9  9  0
  0  3  5  7  0  0  0  7  7  7  0  7  0  5  7  0  0  0  0  0  0  0  7  0
  7 10 10  7  0  0  0  0 10  5  1  7  0  0  7  7  0  0  7  0  0  0  9  0
  0  5  0  7  0  0  7  0  7  0  0  0  5  0  0  0  7  7  0 10  7 10  0  7
  0  0  5  0  0  7  0  5  0  0  0 10  0 10  7  0 10  0  0 10  7  0 10  0
  7  7  0  0  7  3  0  5  3  0  0  7  0  0  0  0  0 10  0  0  0  5 10  0
  1  0 10  0  0  0  0  0  0  5  0 10 10  0  0  0  7  5  6  7 10  6 10  0
 10  0  0  7 10 10  0  7 10  7  7  7  0  5  0  7  0