In [12]:
from pathlib import Path
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

data_raw = Path('mini_newsgroups')
tag_raw = list(data_raw.glob('*/*'))
tag_raw = [str(name) for name in tag_raw]

def path2news(path):
    path_list = path.split('/')
    return path_list[1]

news_labels = list(map(lambda x: path2news(x), tag_raw))

news_to_index = dict((name, index) for index, name in enumerate(list(set(news_labels))))

news_targets = [news_to_index[name] for name in news_labels]

# # 取文件夹的名字作为标签
# tag = []
# i = 0
# while i < len(tag_raw) :
#     name = tag_raw[i].split('/')
#     tag.append(name[1])
#     i+=100

# # 给tag打上数字标签
# tag_to_index = dict((name,index) for index, name in enumerate(tag))

# news_targets = [tag_to_index[name] for name in news_labels]

# 对新闻数据进行处理
def newsSolve(path_news):
    file_raw = open(path_news, 'r', encoding='ISO-8859-1')
    
    content_raw = file_raw.read()

    # 去除换行
    content_raw = re.sub('\n', ' ', content_raw)
    content_raw = re.sub('\d+', ' ', content_raw)

    # 去除常见字符
    chars_common = "[\!\:\<\>\...\-\'\)\(\/_,$%^*(+\"\']+|[+——! ，：。；、~@#¥%……&*（）]+"
    content_raw = re.sub(chars_common, ' ', content_raw)

    # 首先进行分句
    sentence_content_raw = nltk.sent_tokenize(content_raw)

    # 再对每句进行分词
    word = []
    for sentence in sentence_content_raw :
        word_list = nltk.word_tokenize(sentence)
        word.extend(word_list)

    return word

news_split = list(map(lambda x: newsSolve(x), tag_raw))

stop_words = list(ENGLISH_STOP_WORDS)
for x in list(nltk.corpus.stopwords.words('english')):
    if x not in stop_words:
        stop_words.append(x)

news_split_2 = []
for words in news_split:
    cur_words = list(map(lambda x: x if x not in stop_words else "", words))
    news_split_2.append(" ".join(cur_words))

# 五重交叉
news_fivecross = []
tar_fivecross = []
for i in range(5):
    for j in range(20):
        for k in range(i * 20 + j * 100, (i + 1) * 20 + j * 100):
            news_fivecross.append(news_split_2[k])
            tar_fivecross.append(news_targets[k])

news_fivecross.extend(news_fivecross)
tar_fivecross.extend(tar_fivecross)

res_fivecross = []
for i in range(5):
    vector = TfidfVectorizer(stop_words=stop_words, decode_error='ignore')

    x_test = news_fivecross[i * 400: i * 400 + 400]
    y_test = tar_fivecross[i * 400: i * 400 + 400]
    x_train = news_fivecross[i * 400 + 400: i * 400 + 2000]
    y_train = tar_fivecross[i * 400 + 400: i * 400 + 2000]

    train_vec = vector.fit_transform(x_train)
    test_vec = vector.transform(x_test)

    svc = LinearSVC()
    svc.fit(train_vec, y_train)
    res = svc.predict(test_vec)
    res_fivecross.append(f1_score(y_test, res, average='macro'))
    
# print(res_fivecross)

# print("final result: {:f}".format(sum(five_cross_results) / 5))

y = svc.predict(test_vec)
print(classification_report(y_test, y, target_names=news_to_index.keys()))

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.80      0.82        20
comp.sys.ibm.pc.hardware       0.78      0.90      0.84        20
   talk.politics.mideast       0.90      0.95      0.93        20
               sci.crypt       0.95      0.95      0.95        20
      talk.politics.misc       0.80      0.60      0.69        20
               rec.autos       1.00      0.80      0.89        20
      rec.sport.baseball       1.00      1.00      1.00        20
                 sci.med       0.86      0.95      0.90        20
         rec.motorcycles       0.87      1.00      0.93        20
   comp.sys.mac.hardware       0.91      1.00      0.95        20
  soc.religion.christian       1.00      1.00      1.00        20
         sci.electronics       0.94      0.80      0.86        20
 comp.os.ms-windows.misc       0.86      0.95      0.90        20
            misc.forsale       0.91      1.00      0.95        20
         