### Term Document Matrix

In [84]:
library('openNLP')
library('tm')
library('NLP')
library('dplyr')
library('data.table')
library('tmcn')  #text mining tool for Chinese
library('chinese.misc')
library('caret')
library('e1071')
library('pROC')
library('naivebayes')
getwd()
Sys.setlocale(category="LC_ALL",locale="chinese")

### Function Definitions

In [56]:
datapreprocess = function(corp_data, index_data){
    call1 = match.call()
    
    #id with freuqency of 1
    id_unique_list = c()
    tttable = table(index_data[,'doc_id'])

    for (i in 1:length(tttable)){
        if (tttable[[i]]==1){
            id_unique_list = c(id_unique_list, as.integer(names(tttable)[i]))
        }
    }
    
    df_train_index = data.frame(doc_id = id_unique_list, stringsAsFactors = FALSE)
    #train_doc_id = unique(index_data[,'doc_id'])
    
    #----create match id table
    topic = unique(index_data$topic)
    idlist = c()
    topiclist = c()
    num = 0
    for (i in topic){
        idlist = c(idlist, num)
        topiclist = c(topiclist, i)
        num = num +1
    }
    match = data.frame(id = idlist, topic = topiclist, stringsAsFactors = FALSE)
    
    #-------dataframe for document term matrix
    df_dtm_train = merge(x = df_train_index, y = corp_data, by = "doc_id", all.x = TRUE) 
    
    #matched table for training model
    index_data = merge(x = index_data, y=df_train_index, by ="doc_id")
    train_match = merge(x = index_data, y = match, by = "topic")   #topic doc_id id
    train_match = train_match[,c('doc_id','id')] 
    train_match = train_match[order(train_match$doc_id),]
    
    #----return
    list(call=call1, dtm = df_dtm_train, match = match, train_match=train_match)
}

In [57]:
train.mnb = function (dtm,labels){

    V <- ncol(dtm)
    N <- nrow(dtm)

    call = match.call()
    prior <- table(labels)/N
    labelnames <- names(prior)
    nclass <- length(prior)
    cond.probs <- matrix(nrow=V,ncol=nclass)  #create an empty matrix for condtional probability
    dimnames(cond.probs)[[1]] <- dimnames(dtm)[[2]]
    dimnames(cond.probs)[[2]] <- labelnames
    index <- list(length=nclass)
    for(j in 1:nclass){
        index[[j]] <- c(1:N)[labels == labelnames[j]]
    }

    for(i in 1:V){
        #flush.console()
        for(j in 1:nclass){
            cond.probs[i,j] <- (sum(dtm[index[[j]],i])+1)/(sum(dtm[index[[j]],])+V)
        }
    }
    list(call=call,prior=prior,cond.probs=cond.probs)
}

In [58]:
predict.mnb = function (model_train,test.dtm)
{
    classlabels <- dimnames(model_train$cond.probs)[[2]]
    logprobs <- as.matrix(test.dtm) %*% log(model_train$cond.probs)
    N <- nrow(test.dtm)
    nclass <- ncol(model_train$cond.probs)
    logprobs <- logprobs+matrix(nrow=N,ncol=nclass,log(model_train$prior),byrow=T)
    classlabels[max.col(logprobs)]
}

In [59]:
#implement by own
wholeprocess = function(df_corp, train_index, test_index, sparsity, en_zh)
{
    #call = match.call()
    datapre = datapreprocess(df_corp, train_index)
    #print(head(datapre$dtm))
    
    print('-----------Document Term Matrix Generating')
    Corpus = VCorpus(DataframeSource(datapre$dtm));  #summary(Corpus);
    if (en_zh == 'en'){
        TDM = DocumentTermMatrix(Corpus);
    }else{
        TDM = DocumentTermMatrix(Corpus, control = list(wordLengths=c(1, Inf)));
    }
    #remove terms with set sparsity
    dtm <- removeSparseTerms(TDM,sparsity)
    inspect(dtm)
    
    print(datapre$match[,'topic'])
    
    flush.console()
    
    print('-----------Naive Bayes Model Building')
    train_match = datapre$train_match
    labels <- train_match[,'id']
    model_train <- train.mnb(dtm,labels)
    print(model_train$prior)
    flush.console()

    names(model_train$prior) <- datapre$match[,'topic']
    #model_train$prior
    model_train$cond.probs = as.matrix(model_train$cond.probs)
    dimnames(model_train$cond.probs)[[2]] <- datapre$match[,'topic']
    
    flush.console()
    
    print('-----------Testing')
    print('-----------Document Term Matrix Generating')
    datapre_test = datapreprocess(df_corp, test_index)
    df_dtm_test = datapre_test$dtm

    Corpus_test = VCorpus(DataframeSource(df_dtm_test));  #summary(Corpus);
    test.dtm = DocumentTermMatrix(Corpus_test, list(dictionary=dimnames(dtm)[[2]]));
    inspect(test.dtm)
    
    flush.console()

    print('-----------Predicting')
    model_predict <- predict.mnb(model_train,test.dtm)

    #show id corresponded topic
    true_labels = merge(x=datapre_test$train_match, y=datapre_test$match, by="id", all.x = TRUE)
    true_labels = true_labels[order(true_labels$doc_id),]
    table(model_predict,true_labels[,'topic'])
    flush.console()
    
    print('-----------Evaluating')
    evalu = data.frame(pred = model_predict, true = true_labels[,'topic'], stringsAsFactors = TRUE)
    confusion = confusionMatrix(data = evalu[,'pred'],reference = evalu[,'true'])
}

In [87]:
#implement by packages
naive.bayes = function(df_corp, train_index, test_index){
    process = datapreprocess(df_corp, train_index)
    data_match = merge(x = process$train_match, y=process$match, by = "id", all.x = TRUE)
    data_match = data_match[,c('doc_id','topic')]

    #process$dtm
    Corpus = VCorpus(DataframeSource(process$dtm));  #summary(Corpus);
    TDM = DocumentTermMatrix(Corpus, control = list(stopwords=TRUE,
                                                   wordLengths=c(1, Inf)));
    #inspect(TDM)
    dtm <- removeSparseTerms(TDM,0.97)
    inspect(dtm)

    df = data.frame(as.matrix(dtm), stringsAsFactors=FALSE)
    df <- cbind(doc_id = rownames(df), df)
    rownames(df) <- 1:nrow(df)

    data_train = merge(x =df, y=data_match, by = "doc_id", all.x = TRUE)
    nb = naive_bayes(topic ~ ., data_train[,-1], laplace = 1)
    summary(nb)
    tables(nb, 1)

    process_test = datapreprocess(df_corp, test_index)
    data_match = merge(x = process_test$train_match, y=process_test$match, by = "id", all.x = TRUE)
    data_match = data_match[,c('doc_id','topic')]

    #process$dtm
    test.Corpus = VCorpus(DataframeSource(process_test$dtm));  #summary(Corpus);
    test.dtm = DocumentTermMatrix(test.Corpus, list(dictionary=dimnames(dtm)[[2]]));

    test.df = data.frame(as.matrix(test.dtm), stringsAsFactors=FALSE)
    test.df <- cbind(doc_id = rownames(test.df), test.df)
    rownames(test.df) <- 1:nrow(test.df)

    data_test = merge(x =test.df, y=data_match, by = "doc_id", all.x = TRUE)
    test_pred = data_test[,-dim(data_test)[2]]
    test_pred = test_pred[,-1]
    
    laplace1 = predict(nb, data_test, type="class")
    #table(laplace1, data_test$topic, dnn=c("predict", "actual"))
    evalu = data.frame(pred = laplace1, true = data_test$topic, stringsAsFactors = TRUE)
    confusion = confusionMatrix(data = evalu[,'pred'],reference = evalu[,'true'])
}

### Load data (training and testing)

In [8]:
df_corp=read.csv('wholedataset_en.csv')
Sys.setlocale(category="LC_ALL",locale="chinese")
df_corp_zh = data.table::fread('wholedataset_zh.csv')



### Naive Bayes Modeling 

# Experiment1: Dataset 1 (eight topics no balance)

### English Section

In [61]:
train_index1 = read.csv('Experimental dataset/dataset1train.csv')
test_index1 = read.csv('Experimental dataset/dataset1test.csv')
colnames(train_index1) <- c("doc_id", "topic")
colnames(test_index1) <- c("doc_id", "topic")

In [9]:
naivebayes_en1 = wholeprocess(df_corp, train_index1, test_index1, 0.97, 'en')

   doc_id
1 1001571
2 1005617
3 1005676
4 1005677
5 1005678
6 1005703
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

[1] "-----------Naive Bayes Model Building"
labels
         0          1          2          3          4          5          6 
0.38337445 0.02921529 0.21983535 0.08567786 0.03855642 0.02705325 0.02680378 
         7 
0.18948360 
[1] "-----------Testing"
[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 9042, terms: 491)>>
Non-/sparse entries: 315118/4124504
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china chinese company have market new say world
  1012631   41    1     1       0       2   17      6   4  16     3
  1016205    4    3     7       1       1    3      3   0   7     9
  1017942    7   22     2       0      10    5      6   5  12     0
  1020575   31    3     0       0       1    6     15   5   9     5
  1021081   13    0     0       0       0   10      6   3   1     4
  1022333    3   12     2       0       5    2     14   1  13     3
  10

In [20]:
naivebayes_en1

Confusion Matrix and Statistics

            Reference
Prediction   business economy Financials lifestyle management markets politics
  business       2395     165         48        22         20     117       80
  economy         201    1174         68         5          4      75      107
  Financials      123      97         77         1          9      27        5
  lifestyle       109      33          8       167         31       0       20
  management      142      37         22        15        260       4        4
  markets         229     169         32         4          0     541        8
  politics        172     213         14        10          8       8     1399
  society          88      97         10        20         19       1       86
            Reference
Prediction   society
  business        21
  economy         10
  Financials       2
  lifestyle       21
  management      26
  markets          0
  politics        43
  society        119

Overall Statistics
   

In [65]:
confusion1 = naive.bayes(df_corp, train_index1, test_index1)
confusion1

<<DocumentTermMatrix (documents: 36077, terms: 481)>>
Non-/sparse entries: 1193151/16159886
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china chinese company make market new say world
  1017029    7   12     1       0       4    6     29   5  12    13
  1017412    1    1     0       1      30    6      0   3  31     1
  1018031    0    2    28      45      18    5     64   6   1     3
  1020531    2    7     2       0       5    4      3   4  12     3
  1021616    3    1    30      14      22    6      6  15  12     5
  1023366    0    6    43      21      19    2     13   0   8     3
  1075495    6   14     4       0       1    1      7   3   8     1
  1078838   70   19     7       5       8    2      1   3  13     1
  1080351    7    9    10       4      19    3     11   4  21     3
  1081802    0   21    14      16       6    4      4   4  11     4

 
- Call: naive_bayes.formula(for

"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."


Confusion Matrix and Statistics

            Reference
Prediction   business economy Financials lifestyle management markets politics
  business       1440     122          2        15         20      71       46
  economy         131     627          4         1          3      43       63
  Financials      165     151        267        17         46      27       11
  lifestyle       102      37          1        88         10       0       31
  management      154      43          4        10        106       9        9
  markets         573     531          1         5          6     563       19
  politics        398     301          0        14         16      30     1306
  society         496     173          0        94        144      30      224
            Reference
Prediction   society
  business        12
  economy          4
  Financials       2
  lifestyle       12
  management      13
  markets          5
  politics        29
  society        165

Overall Statistics
   

### Chinese Section

In [21]:
naivebayes_zh1 = wholeprocess(df_corp_zh, train_index1, test_index1, 0.97, 'zh')

[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 36077, terms: 444)>>
Non-/sparse entries: 1115548/14902640
Sparsity           : 93%
Maximal term length: 9
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      不 大 公司 经济 美国 市场 他 我 银行 中国
  1011946 22  8    4    8    3   15  4 28    1   73
  1013609 15  5    3    9    4    1 13  6    3    6
  1014006  8  5   24    2    6   14  4  2    3    1
  1017412 21  6   67    1   15    0 36  7    1    1
  1023366  6  2   32    3    4   17  3  7    0   58
  1075495  7  9    1   32    7    6  5  0    5    6
  1078838 10  8    7   12    1    1  5  5   75   21
  1078840 10  4    1   21    2    1  1  0    9   39
  1080040  6  3   64    2    2    7  2  0   15   32
  1080351  5  5   23    0    2   10  8  0   14   17
[1] "business"   "Financials" "economy"    "markets"    "management"
[6] "lifestyle"  "society"    "politics"  
[1] "-----------Naive Bayes Model Building"
labels
         0     

"longer object length is not a multiple of shorter object length"
"Levels are not in the same order for reference and data. Refactoring data to match."


In [22]:
naivebayes_zh1

Confusion Matrix and Statistics

            Reference
Prediction   business economy Financials lifestyle management markets politics
  business       2935    1069        108       209        311     470      802
  economy          53     472         59         3          4      40       48
  Financials        1       2          2         0          0       0        0
  lifestyle         0       0          0         0          0       0        0
  management        0       0          0         0          0       0        0
  markets         274     173         87         1          1     225       17
  politics        196     269         23        31         35      38      842
  society           0       0          0         0          0       0        0
            Reference
Prediction   society
  business       202
  economy         15
  Financials       0
  lifestyle        0
  management       0
  markets          2
  politics        23
  society          0

Overall Statistics
   

In [88]:
confusion1 = naive.bayes(df_corp_zh, train_index1, test_index1)
confusion1

<<DocumentTermMatrix (documents: 36077, terms: 444)>>
Non-/sparse entries: 1115548/14902640
Sparsity           : 93%
Maximal term length: 9
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      不 大 公司 经济 美国 市场 他 我 银行 中国
  1011946 22  8    4    8    3   15  4 28    1   73
  1013609 15  5    3    9    4    1 13  6    3    6
  1014006  8  5   24    2    6   14  4  2    3    1
  1017412 21  6   67    1   15    0 36  7    1    1
  1023366  6  2   32    3    4   17  3  7    0   58
  1075495  7  9    1   32    7    6  5  0    5    6
  1078838 10  8    7   12    1    1  5  5   75   21
  1078840 10  4    1   21    2    1  1  0    9   39
  1080040  6  3   64    2    2    7  2  0   15   32
  1080351  5  5   23    0    2   10  8  0   14   17

 
- Call: naive_bayes.formula(formula = topic ~ ., data = data_train[,      -1], laplace = 1) 
- Laplace: 1 
- Classes: 8 
- Samples: 36077 
- Features: 444 
- Conditional distributions: 
    - Gaussian: 444
- Prior probabilit

"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."


Confusion Matrix and Statistics

            Reference
Prediction   business economy Financials lifestyle management markets politics
  business         69       7         12         0          0       3        0
  economy           0       0          3         0          0       0        0
  Financials        0       0          0         0          0       0        0
  lifestyle        36      30          8         2          1       8        1
  management       43      11          7         1          4       0        4
  markets          23      43         30         0          0      40        1
  politics         94      94         29         8          5      17      356
  society        3194    1800        190       233        341     705     1347
            Reference
Prediction   society
  business         0
  economy          0
  Financials       0
  lifestyle        0
  management       0
  markets          0
  politics         7
  society        235

Overall Statistics
   

# Experiment2: Dataset 2 (eight topics with balanced distribution)

In [89]:
train_index2 = read.csv('Experimental dataset/dataset2train.csv')
test_index2 = read.csv('Experimental dataset/dataset2test.csv')
colnames(train_index2) <- c("doc_id", "topic")
colnames(test_index2) <- c("doc_id", "topic")

### English Section

In [24]:
naivebayes_en2 = wholeprocess(df_corp, train_index2, test_index2, 0.97, 'en')

[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 7680, terms: 661)>>
Non-/sparse entries: 371052/4705428
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china company have make market new say world
  1010183    0    6     0       3   10    7      4   9   8     1
  1014871   23    2     0       1    7    6      9   5  11     1
  1016205    4    3     7       1    3    1      3   0   7     9
  1017029    7   12     1       4    8    6     29   5  12    13
  1018031    0    2    28      18   12    5     64   6   1     3
  1019174   15    0    16       1   10    5      0   5   9     4
  1020575   31    3     0       1    6    7     15   5   9     5
  1024925    4    0     0      39   10    8      5   4   9     1
  1089072    0   16     9       2   10    8      3   2  13     3
  1090357    2    0     7       6   24    5     12   1   3     5
[1] "Financials" "eco

In [25]:
naivebayes_en2

Confusion Matrix and Statistics

            Reference
Prediction   business economy Financials lifestyle management markets politics
  business        160      10         35        10         13      24       11
  economy          10     129         43         5          1      25       21
  Financials       18      22         87         5         12      15        2
  lifestyle        10       5         10       168         20       1        6
  management        9       3         22        13        180       1        0
  markets          18      26         30         7          0     174        0
  politics         10      28          7         9          3       1      188
  society           7      18          5        25         11       1       14
            Reference
Prediction   society
  business         7
  economy          6
  Financials       2
  lifestyle       31
  management      16
  markets          4
  politics        34
  society        142

Overall Statistics
   

In [90]:
confusion2 = naive.bayes(df_corp, train_index2, test_index2)
confusion2

<<DocumentTermMatrix (documents: 7680, terms: 660)>>
Non-/sparse entries: 367184/4701616
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china company market mr new say us world
  1010183    0    6     0       3      4  0   9   8  0     1
  1014871   23    2     0       1      9  3   5  11  3     1
  1016205    4    3     7       1      3  2   0   7 21     9
  1017029    7   12     1       4     29  4   5  12 29    13
  1018031    0    2    28      18     64  0   6   1  5     3
  1019174   15    0    16       1      0 13   5   9  5     4
  1020575   31    3     0       1     15  1   5   9  3     5
  1024925    4    0     0      39      5  2   4   9  7     1
  1089072    0   16     9       2      3  5   2  13  1     3
  1090357    2    0     7       6     12  0   1   3  8     5

 
- Call: naive_bayes.formula(formula = topic ~ ., data = data_train[,      -1], laplace = 1) 
- Laplace: 1 
- Cl

"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."


Confusion Matrix and Statistics

            Reference
Prediction   business economy Financials lifestyle management markets politics
  business         93      18          1        17         20      20        6
  economy           8      49          0         4          0      17        9
  Financials       11      17        229        22         18       6        1
  lifestyle         4       3          3        79         10       0        5
  management        6       3          5         7         85       0        2
  markets          50      86          0         9          6     183        9
  politics         48      45          1        33         21      12      190
  society          22      20          0        71         80       4       20
            Reference
Prediction   society
  business         7
  economy          6
  Financials       3
  lifestyle       18
  management       7
  markets          4
  politics        62
  society        135

Overall Statistics
   

### Chinese Section

In [26]:
naivebayes_zh2 = wholeprocess(df_corp_zh, train_index2, test_index2, 0.97, 'zh')

[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 7680, terms: 601)>>
Non-/sparse entries: 329352/4286328
Sparsity           : 93%
Maximal term length: 9
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      不 公司 经济 美国 人 市场 他 我 我们 中国
  1007671 14    1   14    1  3    0 42  4    7    4
  1010183  7    5    1    1  2    5  6  0    3    0
  1011946 22    4    8    3 15   15  4 28    5   73
  1014871  7    5    0    3  8    9  8  0    8    0
  1016205 13    1   18   21  6    3  9  1    2   11
  1020575  9    3    2   10  3   20  1  0    6    0
  1024925  9   20    3    7  4    6  1  1    1    0
  1080040  6   64    2    2  2    7  2  0    1   32
  1089072  7    3   16    0  9    2  8  7    2   29
  1090357 12    5    7    8 12   13 13 21   32   10
[1] "Financials" "economy"    "management" "business"   "lifestyle" 
[6] "markets"    "society"    "politics"  
[1] "-----------Naive Bayes Model Building"
labels
        0         

In [27]:
naivebayes_zh2

Confusion Matrix and Statistics

            Reference
Prediction   business economy Financials lifestyle management markets politics
  business         91      33         49        55         71      43       34
  economy           3      49         30         3          6      12        5
  Financials        6      11         18         2          1       4        1
  lifestyle        16       6          2        25          1       2        3
  management        3       2         11         9         77       0        5
  markets          45      35         98         8          7     127        8
  politics         48      77         29        94         51      37      161
  society          30      28          2        46         26      17       25
            Reference
Prediction   society
  business        64
  economy         11
  Financials       1
  lifestyle        3
  management       9
  markets         11
  politics        95
  society         48

Overall Statistics
   

# Experiment3: Dataset 3 (four topics no balance)

In [91]:
train_index3 = read.csv('Experimental dataset/dataset3train.csv')
test_index3 = read.csv('Experimental dataset/dataset3test.csv')
colnames(train_index3) <- c("doc_id", "topic")
colnames(test_index3) <- c("doc_id", "topic")

### English Section

In [29]:
naivebayes_en3 = wholeprocess(df_corp, train_index3, test_index3, 0.97, 'en')

[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 31718, terms: 453)>>
Non-/sparse entries: 989532/13378722
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china chinese company have market new say world
  1017328    4    2     0       0      17    3      5   1  12     5
  1017412    1    1     0       1      30   19      0   3  31     1
  1020531    2    7     2       0       5    6      3   4  12     3
  1021616    3    1    30      14      22   11      6  15  12     5
  1023366    0    6    43      21      19    8     13   0   8     3
  1078153    4    6    22      18      20    7      2   1   7     6
  1078838   70   19     7       5       8    3      1   3  13     1
  1079314    1    0    11       7       4    5      4   4  10     4
  1080351    7    9    10       4      19   13     11   4  21     3
  1082731    0    5    15       2      13   12      2 

In [30]:
naivebayes_en3

Confusion Matrix and Statistics

          Reference
Prediction business economy markets politics
  business     2750     254     118       90
  economy       262    1296     112      125
  markets       232     168     528        9
  politics      216     270      15     1485

Overall Statistics
                                          
               Accuracy : 0.7641          
                 95% CI : (0.7546, 0.7734)
    No Information Rate : 0.4363          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.6634          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       

Statistics by Class:

                     Class: business Class: economy Class: markets
Sensitivity                   0.7948         0.6519        0.68305
Specificity                   0.8966         0.9160        0.94285
Pos Pred Value                0.8562         0.7220        0.56350
Neg Pred Value          

In [93]:
confusion3 = naive.bayes(df_corp, train_index3, test_index3)
confusion3

<<DocumentTermMatrix (documents: 31718, terms: 452)>>
Non-/sparse entries: 989770/13346766
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china chinese company market mr say us world
  1017412    1    1     0       1      30      0 26  31  7     1
  1018594    3    4    25      11       8      3  8  11  7     8
  1020531    2    7     2       0       5      3  4  12  1     3
  1021616    3    1    30      14      22      6  0  12  1     5
  1023366    0    6    43      21      19     13  4   8  4     3
  1078153    4    6    22      18      20      2 11   7  4     6
  1078838   70   19     7       5       8      1  9  13  1     1
  1079314    1    0    11       7       4      4  6  10  2     4
  1080351    7    9    10       4      19     11  9  21  1     3
  1082731    0    5    15       2      13      2  0  13  2     1

 
- Call: naive_bayes.formula(formula = topic ~ ., data = data_trai

"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."


Confusion Matrix and Statistics

          Reference
Prediction business economy markets politics
  business     1957     194      90       65
  economy       242     806      52      101
  markets       547     556     591       32
  politics      714     432      40     1511

Overall Statistics
                                          
               Accuracy : 0.6135          
                 95% CI : (0.6027, 0.6242)
    No Information Rate : 0.4363          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.4781          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       

Statistics by Class:

                     Class: business Class: economy Class: markets
Sensitivity                   0.5656         0.4054        0.76455
Specificity                   0.9219         0.9335        0.84141
Pos Pred Value                0.8487         0.6711        0.34241
Neg Pred Value          

### Chinese Section

In [31]:
naivebayes_zh3 = wholeprocess(df_corp_zh, train_index3, test_index3, 0.97, 'zh')

[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 31718, terms: 409)>>
Non-/sparse entries: 893313/12079349
Sparsity           : 93%
Maximal term length: 9
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      不 公司 经济 美国 其 市场 他 银行 政府 中国
  1012631  5    3    1    0  8    9  5   65    0    1
  1017384  9    3   15    4  1    2  4    0   10   42
  1017412 21   67    1   15 21    0 36    1   11    1
  1018594  1    3   27   10  6    1 10    4    6   54
  1020531  8   13   10    4  6    4 15    3   15    2
  1023366  6   32    3    4  4   17  3    0    2   58
  1078153  5   15    2    5  2    2  5    4    0   71
  1078838 10    7   12    1 10    1  5   75    4   21
  1080351  5   23    0    2  9   10  8   14    4   17
  1082731 14    2   19    2  4    2 22    0    3   42
[1] "business" "economy"  "markets"  "politics"
[1] "-----------Naive Bayes Model Building"
labels
         0          1          2          3 
0.43634529 0.2

In [32]:
naivebayes_zh3

Confusion Matrix and Statistics

          Reference
Prediction business economy markets politics
  business     2921     976     441      792
  economy        86     568      70       45
  markets       243     144     222        5
  politics      210     300      40      867

Overall Statistics
                                          
               Accuracy : 0.5773          
                 95% CI : (0.5663, 0.5882)
    No Information Rate : 0.4363          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.3471          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       

Statistics by Class:

                     Class: business Class: economy Class: markets
Sensitivity                   0.8442        0.28571        0.28719
Specificity                   0.5058        0.96617        0.94523
Pos Pred Value                0.5694        0.73862        0.36156
Neg Pred Value          

# Experiment4: Dataset 4 (four topics with balanced distribution)

In [94]:
train_index4 = read.csv('Experimental dataset/dataset4train.csv')
test_index4 = read.csv('Experimental dataset/dataset4test.csv')
colnames(train_index4) <- c("doc_id", "topic")
colnames(test_index4) <- c("doc_id", "topic")

### English Section

In [34]:
naivebayes_en4 = wholeprocess(df_corp, train_index4, test_index4, 0.97, 'en')

[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 12368, terms: 453)>>
Non-/sparse entries: 390550/5212154
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china chinese company have market new say world
  1007733    0   11     0       0       4    4      3   0  16     1
  1014503    8    0     0       1       0   15     16   6   6     3
  1017384    1    0    31      11       2    7      2  11   2     9
  1017412    1    1     0       1      30   19      0   3  31     1
  1018649    0    0    27      13       9   11      1   3   1     2
  1020531    2    7     2       0       5    6      3   4  12     3
  1021081   13    0     0       0       0   10      6   3   1     4
  1078840    6   22    14       1       0    7      1   0   2     2
  1083754    0    3    10       8      25    4      2   5  16     0
  1084558   18   17     1       1       0    9      3  

In [35]:
naivebayes_en4

Confusion Matrix and Statistics

          Reference
Prediction business economy markets politics
  business      537     114     100       44
  economy        73     485      90       53
  markets        86      76     572        6
  politics       77      98      11      670

Overall Statistics
                                          
               Accuracy : 0.7322          
                 95% CI : (0.7162, 0.7478)
    No Information Rate : 0.25            
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.6429          
                                          
 Mcnemar's Test P-Value : 4.112e-06       

Statistics by Class:

                     Class: business Class: economy Class: markets
Sensitivity                   0.6947         0.6274         0.7400
Specificity                   0.8887         0.9069         0.9276
Pos Pred Value                0.6755         0.6919         0.7730
Neg Pred Value          

In [96]:
confusion4 = naive.bayes(df_corp, train_index4, test_index4)
confusion4

<<DocumentTermMatrix (documents: 12368, terms: 452)>>
Non-/sparse entries: 390394/5199942
Sparsity           : 93%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      bank cent china chinese company market mr say us world
  1007733    0   11     0       0       4      3 13  16  8     1
  1014503    8    0     0       1       0     16  0   6  2     3
  1017384    1    0    31      11       2      2  8   2  2     9
  1017412    1    1     0       1      30      0 26  31  7     1
  1018649    0    0    27      13       9      1  0   1  4     2
  1020531    2    7     2       0       5      3  4  12  1     3
  1021081   13    0     0       0       0      6  0   1  6     4
  1078840    6   22    14       1       0      1  1   2  3     2
  1083754    0    3    10       8      25      2  0  16  3     0
  1084558   18   17     1       1       0      3  8   7  0     3

 
- Call: naive_bayes.formula(formula = topic ~ ., data = data_train

"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."


Confusion Matrix and Statistics

          Reference
Prediction business economy markets politics
  business      432     116     122       37
  economy        52     306      49       40
  markets        98     210     571       13
  politics      191     141      31      683

Overall Statistics
                                          
               Accuracy : 0.6442          
                 95% CI : (0.6271, 0.6611)
    No Information Rate : 0.25            
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5257          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       

Statistics by Class:

                     Class: business Class: economy Class: markets
Sensitivity                   0.5589        0.39586         0.7387
Specificity                   0.8814        0.93920         0.8616
Pos Pred Value                0.6110        0.68456         0.6401
Neg Pred Value          

### Chinese Section

In [36]:
naivebayes_zh4 = wholeprocess(df_corp_zh, train_index4, test_index4, 0.97, 'zh')

[1] "-----------Document Term Matrix Generating"
<<DocumentTermMatrix (documents: 12368, terms: 399)>>
Non-/sparse entries: 346691/4588141
Sparsity           : 93%
Maximal term length: 9
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      不 大 公司 经济 美国 全球 市场 他 政府 中国
  1014503 11  2    2   11    6    0   12 42    5    1
  1017384  9  4    3   15    4    3    2  4   10   42
  1017412 21  6   67    1   15    2    0 36   11    1
  1018649  6  3    7   10    4    4    1  0    8   44
  1020531  8  5   13   10    4    0    4 15   15    2
  1021081 10  6    0    5    6    0    7  9    2    0
  1021465  9  2    1    5    5    5    4  1    8    0
  1024593  6  1    4    9    0    5   11  0    0    0
  1078840 10  4    1   21    2    3    1  1    4   39
  1083754  4  4   41    3    2    1    4  0    5   25
[1] "business" "economy"  "markets"  "politics"
[1] "-----------Naive Bayes Model Building"
labels
   0    1    2    3 
0.25 0.25 0.25 0.25 
[1] "-----------Te

In [37]:
naivebayes_zh4

Confusion Matrix and Statistics

          Reference
Prediction business economy markets politics
  business      302     124      80       92
  economy        89     289      80      121
  markets       239     175     515       89
  politics      143     185      98      471

Overall Statistics
                                          
               Accuracy : 0.51            
                 95% CI : (0.4922, 0.5278)
    No Information Rate : 0.25            
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.3467          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       

Statistics by Class:

                     Class: business Class: economy Class: markets
Sensitivity                  0.39069        0.37387         0.6662
Specificity                  0.87236        0.87495         0.7831
Pos Pred Value               0.50502        0.49914         0.5059
Neg Pred Value          