In [0]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import time
from sklearn import svm
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score
import nltk
from nltk.corpus import stopwords
import xgboost as xgb

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
train_data = []  # 16659
test_data = []  # 810

In [4]:
'''读取文件'''
print('Reading files...')
print('\n')
train_f = open("train.txt", encoding='utf-8')
for line in train_f:
    train_data.append(line)
train_f.close()

test_f = open("test.txt", encoding='utf-8')
for line in test_f:
    test_data.append(line)
test_f.close()
print(train_data[0])
print(test_data[0])
print('\n')
print('Files get!...')

Reading files...


" 5 . Science includes such diverse fields as astronomy , biology , computer sciences , geology , logic , physics , chemistry , and mathematics ( [ link ] ) ."	"0"

" 2 . It becomes clear from this definition that the application of the scientific method plays a major role in science ."	"0"



Files get!...


In [5]:
'''去掉开头序号-> [[句子(单词间以,为分割)], 0/1]'''
print('Formatting...')
print('\n')
train_set = []
for unit in train_data:
    temp = unit.split(' ')
    if temp[1].isdigit():
        # 判断第一个字符是否为数字，如果是则去掉，如果不是则取句子
        temp = temp[3:-1]
        # 当前temp中的最后一部分为"\t.\n","最后一个引号" 为了保证格式规范，先去掉，最后在取corpus的时候加上句号即可
        # training_set.append(([temp, unit[-3:-2]]))
        train_set.append([' '.join(temp), unit[-3:-2]])
        # unit[-3:-2]为当前句子的label
    else:
        # temp[1]为引号，应去掉
        temp = temp[1:-1]
        # training_set.append(([temp, unit[-3:-2]]))
        train_set.append([' '.join(temp), unit[-3:-2]])

test_set = []
for unit in test_data:
    temp = unit.split(' ')
    if temp[1].isdigit():
        temp = temp[3:-1]
        # training_set.append(([temp, unit[-3:-2]]))
        test_set.append([' '.join(temp), unit[-3:-2]])
    else:
        temp = temp[1:-1]
        # training_set.append(([temp, unit[-3:-2]]))
        test_set.append([' '.join(temp), unit[-3:-2]])

print(train_set[0])
print(test_set[0])
print('\n')
print('All data sets are formatted!')

Formatting...


['Science includes such diverse fields as astronomy , biology , computer sciences , geology , logic , physics , chemistry , and mathematics ( [ link ] )', '0']
['It becomes clear from this definition that the application of the scientific method plays a major role in science', '0']


All data sets are formatted!


In [6]:
print('Tokenizeing...')
print('\n')

def tokenize(data):
    res = []
    for samples in data:
        # nltk.word_tokenize用于取tokens
        temp_t = nltk.word_tokenize(samples[0])
        res.append([temp_t, samples[1]])
    return res


train_set = tokenize(train_set)
test_set = tokenize(test_set)
print(train_set[0])
print(test_set[0])
print('\n')
print('Tokenization completed!')

Tokenizeing...


[['Science', 'includes', 'such', 'diverse', 'fields', 'as', 'astronomy', ',', 'biology', ',', 'computer', 'sciences', ',', 'geology', ',', 'logic', ',', 'physics', ',', 'chemistry', ',', 'and', 'mathematics', '(', '[', 'link', ']', ')'], '0']
[['It', 'becomes', 'clear', 'from', 'this', 'definition', 'that', 'the', 'application', 'of', 'the', 'scientific', 'method', 'plays', 'a', 'major', 'role', 'in', 'science'], '0']


Tokenization completed!


In [7]:
print('Removing stopwords...')
print('\n')
stop_words = set(stopwords.words("english"))
# stop_words为英文所有stop words的集合


def Remove_stopwords(data):
    res = []
    temp = []
    for sets in data:
        for w in sets[0]:
            # 如果对于句子中任意一个单词不属于stop_words则将其加入新的dataset
            if w not in stop_words:
                temp.append(w)
        res.append([temp, sets[1]])
        temp = []

    return res


train_set = Remove_stopwords(train_set)
test_set = Remove_stopwords(test_set)
print(train_set[0])
print(test_set[0])
print('\n')
print('Stopwords removed!')

Removing stopwords...


[['Science', 'includes', 'diverse', 'fields', 'astronomy', ',', 'biology', ',', 'computer', 'sciences', ',', 'geology', ',', 'logic', ',', 'physics', ',', 'chemistry', ',', 'mathematics', '(', '[', 'link', ']', ')'], '0']
[['It', 'becomes', 'clear', 'definition', 'application', 'scientific', 'method', 'plays', 'major', 'role', 'science'], '0']


Stopwords removed!


In [8]:
print('Stemming...')
print('\n')
ps = PorterStemmer()


def Stemming(data):
    res = []
    stemmed_words_temp = []
    for units in data:
        for w in units[0]:
            #ps.stem(w):对当前单词w取stem word
            stemmed_words_temp.append(ps.stem(w))

        res.append([stemmed_words_temp, units[1]])
        stemmed_words_temp = []
    return res


train_set = Stemming(train_set)
test_set = Stemming(test_set)
print(train_set[0])
print(test_set[0])
print('\n')
print('Stemming completed!')

Stemming...


[['scienc', 'includ', 'divers', 'field', 'astronomi', ',', 'biolog', ',', 'comput', 'scienc', ',', 'geolog', ',', 'logic', ',', 'physic', ',', 'chemistri', ',', 'mathemat', '(', '[', 'link', ']', ')'], '0']
[['It', 'becom', 'clear', 'definit', 'applic', 'scientif', 'method', 'play', 'major', 'role', 'scienc'], '0']


Stemming completed!


In [9]:
print('Lemmatization...')
print('\n')
lem = WordNetLemmatizer()
stem = PorterStemmer()


def Lemmatization(data):
    lem_words_temp = []
    res = []
    for atoms in data:
        for w in atoms[0]:
            #lem.lemmatize(w, "v"): 对当前单词w取Lemmatization
            lem_words_temp.append(lem.lemmatize(w, "v"))
        res.append([lem_words_temp, atoms[1]])
        lem_words_temp = []

    return res


train_set = Lemmatization(train_set)
test_set = Lemmatization(test_set)
print(train_set[0])
print(test_set[0])
print('\n')
print('Lemmatization completed!')

Lemmatization...


[['scienc', 'includ', 'divers', 'field', 'astronomi', ',', 'biolog', ',', 'comput', 'scienc', ',', 'geolog', ',', 'logic', ',', 'physic', ',', 'chemistri', ',', 'mathemat', '(', '[', 'link', ']', ')'], '0']
[['It', 'becom', 'clear', 'definit', 'applic', 'scientif', 'method', 'play', 'major', 'role', 'scienc'], '0']


Lemmatization completed!


In [10]:
print('Corpus and labels gathering...')
print('\n')
corpus_test = []
corpus_train = []


def Gathering_corpus(data):
    res = []
    temp_res = []
    for aas in data:
        #corpus中仅仅需要第一个unit，即句子
        temp_res.append(aas[0])

    for aaas in temp_res:
        # 对于scikit-learn中的vectorizer，corpus必须为一句一句话的集合，而并非tokens。所以这里必须join然后加上句号。
        temp_ff = ' '.join(aaas)
        temp_ff = temp_ff + '.'
        res.append(temp_ff)
    return res


corpus_train = Gathering_corpus(train_set)
corpus_test = Gathering_corpus(test_set)
# train和test的corpus之和，成为corpus_sum
corpus_sum = []
corpus_sum = corpus_train
for things in corpus_test:
    corpus_sum.append(things)
print('len of corpus_sum:', len(corpus_sum))
print(corpus_sum[:5])
#取出label: Y_train and Y_test
Y_train = []
for thing in train_set:
    Y_train.append(thing[1])
Y_train = np.array(Y_train)
Y_test = []
for that in test_set:
    Y_test.append(that[1])
Y_test = np.array(Y_test)
print(Y_train[:5])
print(Y_test[:5])
print(Y_train.shape)
print(Y_test.shape)
print('\n')
print('Corpus and labels get!')

Corpus and labels gathering...


len of corpus_sum: 17469
['scienc includ divers field astronomi , biolog , comput scienc , geolog , logic , physic , chemistri , mathemat ( [ link ] ).', 'howev , field scienc relat physic world phenomena process consid natur scienc.', 'thu , museum natur scienc might contain item list.', 'In deduct reason , pattern think move opposit direct compar induct reason.', 'deduct reason form logic think use gener principl law forecast specif result.']
['0' '1' '0' '0' '1']
['0' '1' '0' '1' '1']
(16659,)
(810,)


Corpus and labels get!


In [11]:
print('tfidf processing...')
print('\n')
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(corpus_sum)
X_train_tfidf = X_tfidf[:16659]
X_test_tfidf = X_tfidf[16659:17469]
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()
X_tfidf_arr = X_tfidf.toarray()
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(X_tfidf_arr.shape)
print('\n')
print('tfidf get!')

tfidf processing...


(16659, 14675)
(810, 14675)
(17469, 14675)


tfidf get!


In [0]:
def Format_into_array(data):
    res = []
    for items in data:
        temp = int(items)
        res.append(temp)
    res = np.array(res)
    
    return res

In [13]:
Y_test = Format_into_array(Y_test)
Y_train = Format_into_array(Y_train)
print(Y_test[:10])
print(Y_train[:10])

[0 1 0 1 1 1 0 0 1 0]
[0 1 0 0 1 0 0 1 1 1]


In [0]:
dtrain=xgb.DMatrix(X_train_tfidf,label=Y_train)
dtest=xgb.DMatrix(X_test_tfidf, label=Y_test)

In [0]:
params={'booster':'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth':35,
        'lambda':15,
        'subsample':0.75,
        'colsample_bytree':0.75,
        'min_child_weight':1.75,
        'eta': 0.025,
        'seed':0,
        'silent':1,
        'gamma':0.15,
        'learning_rate' : 0.015}

In [0]:
watchlist = [(dtrain,'train'), (dtest,'test')]

In [17]:
bst=xgb.train(params,dtrain,num_boost_round=350,evals=watchlist)

[0]	train-auc:0.659876	test-auc:0.635422
[1]	train-auc:0.709437	test-auc:0.663788
[2]	train-auc:0.732669	test-auc:0.703501
[3]	train-auc:0.738845	test-auc:0.709872
[4]	train-auc:0.744975	test-auc:0.72287
[5]	train-auc:0.748897	test-auc:0.723754
[6]	train-auc:0.751152	test-auc:0.731294
[7]	train-auc:0.752637	test-auc:0.729439
[8]	train-auc:0.753851	test-auc:0.731663
[9]	train-auc:0.756842	test-auc:0.734108
[10]	train-auc:0.757999	test-auc:0.734118
[11]	train-auc:0.760306	test-auc:0.734292
[12]	train-auc:0.760681	test-auc:0.733474
[13]	train-auc:0.761489	test-auc:0.734944
[14]	train-auc:0.766177	test-auc:0.739279
[15]	train-auc:0.767752	test-auc:0.740984
[16]	train-auc:0.767748	test-auc:0.740629
[17]	train-auc:0.768281	test-auc:0.744207
[18]	train-auc:0.769746	test-auc:0.743238
[19]	train-auc:0.770322	test-auc:0.743389
[20]	train-auc:0.770372	test-auc:0.745029
[21]	train-auc:0.773578	test-auc:0.740848
[22]	train-auc:0.773734	test-auc:0.740281
[23]	train-auc:0.775342	test-auc:0.739719
[24

In [0]:
def Curve_for_hat(data, cutoff):
    res = []
    for items in data:
        if items >= cutoff:
            temp = 1
        else:
            temp = 0
        res.append(temp)
    res = np.array(res)
    
    return res

In [0]:
Y_hat=bst.predict(dtest)

In [0]:
Y_hat = Curve_for_hat(Y_hat,0.3)

In [29]:
print('Scores of XGB:\n')
print('F1 score:', f1_score(Y_test, Y_hat, average='binary', pos_label=1))
print('Precision:', precision_score(Y_test, Y_hat, average='binary', pos_label=1))
print('Recall:', recall_score(Y_test, Y_hat, average='binary', pos_label=1))
print('Accuracy:', accuracy_score(Y_test, Y_hat))

Scores of XGB:

F1 score: 0.6226415094339622
Precision: 0.5454545454545454
Recall: 0.7252747252747253
Accuracy: 0.7037037037037037
