In [1]:
import pandas as pd
import json
import os

from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from transformers import PreTrainedTokenizerFast
from lightgbm import LGBMClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [2]:
dataset_dir = './train/original-microblog/'
data_list = []
for data in os.listdir(dataset_dir):
    js = json.load(open(os.path.join(dataset_dir, data), encoding='utf-8'))
    t = pd.json_normalize(js, sep='_')
    data_list.append(t)
df = pd.concat(data_list, axis=0, ignore_index=True)
non_rumors_dir = './train/non-rumor-repost'
rumors_dir = './train/rumor-repost'
for data in os.listdir(non_rumors_dir):
    mid = data.split('_')[0]
    df.loc[df['mid'] == mid, 'is_rumor'] = 0
for data in os.listdir(rumors_dir):
    mid = data.split('_')[0]
    df.loc[df['mid'] == mid, 'is_rumor'] = 1

In [3]:
train = df[['mid', 'text', 'is_rumor']]
train['is_rumor'] = train['is_rumor'].astype('int64')
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['is_rumor'] = train['is_rumor'].astype('int64')


Unnamed: 0,mid,text,is_rumor
0,1U6k8lMp1y,【为了孩子，请拒吃转基因食品！】印度某地的羊吃了棉花收获后的草杆后，四分之一的羊死了，约5到...,1
1,2dKrkwjMIHC,【中国缺少这样的少年英雄。应进中学语文课本】因拆迁款等问题与家人上访的16岁少年赵明阳将一名...,0
2,2dKrm0nZnZH,期待这一天的到来#元畅依晨#隐婚三年终于曝光。。。。。哈哈哈哈哈哈元畅依晨你们神马都不用说、...,1
3,2dKs0ywFoob,懂得沙士比亞的計程車司機,0
4,2dKsGplwDPR,【视频：陈佩斯主持《焦点访谈》】其实我们不知道，早期的《焦点访谈》曾经是由陈佩斯主持的，一开...,1
...,...,...,...
4221,zzLQSkl9W,【四川绵阳市越狱46人 45男 1女 大多为90后 男性大多为强奸犯 女为杀人犯 南山下面一...,1
4222,zzLSl4IL0,【“呵呵”——最恶心的聊天词汇，你中枪了么？】有人说，所有词语里最恶心的、最伤人的莫过于“呵...,0
4223,zznk2uq5w,年轻时无论如何要确立一个目标。一克服懒惰消极靠目标。没目标就瞎混，一天到晚打牌玩游戏睡懒觉。...,0
4224,zzTDJu6C4,牙膏底部的短线，绿色天然的。蓝色天然加药物，红色药物家化学，黑色纯化学。童鞋们~今后购买要记...,1


In [4]:
from tqdm import tqdm
rumors_dir = './train/rumor-repost'
non_rumors_dir = './train/non-rumor-repost'
for data in os.listdir(non_rumors_dir):
    js = json.load(open(os.path.join(non_rumors_dir, data), encoding='utf-8'))
    json_df = pd.json_normalize(js)
    json_df = json_df[~json_df['text'].str.contains('转发') & (json_df['text'] != '')]
    new_text = json_df['text'].str.cat(sep=' ')
    train.loc[train['mid'] == data.split('_')[0], 'text'] += new_text
for data in os.listdir(rumors_dir):
    js = json.load(open(os.path.join(rumors_dir, data), encoding='utf-8'))
    json_df = pd.json_normalize(js)
    json_df = json_df[~json_df['text'].str.contains('转发') & (json_df['text'] != '')]
    new_text = json_df['text'].str.cat(sep=' ')
    train.loc[train['mid'] == data.split('_')[0], 'text'] += new_text


In [5]:
def get_stopword():
    stopwords = []
    with open('baidu_stopwords.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            stopwords.append(line.strip())
   
    stopwords = list(set(stopwords))
    return stopwords


In [6]:
import jieba
import re

def wordopt_cn(text):
    # only chinese characters
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    chinese_txt = re.sub(pattern, '', text)
    return str(chinese_txt)


stopwords = get_stopword()
stopwords.append('年')
stopwords.append('月')
stopwords.append('日')
stopwords.append('日日')
stopwords.append('年月')
stopwords.append('月日')
stopwords.append('年月日')
stopwords.append('回复')
stopwords.append('关注')

def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stopwords]
    return ' '.join(words)


train_copy = train.copy()
train_copy['text'] = train_copy['text'].apply(wordopt_cn)
train_copy['text'] = train_copy['text'].apply(lambda x: ' '.join(jieba.cut(x)))
train_copy['text'] = train_copy['text'].apply(remove_stopwords)
train_copy['text'] = train_copy['text'].apply(lambda x: re.sub('\\s+', ' ', x))
train_copy.drop(train_copy[train_copy['text'] == ''].index, inplace=True)
train = train_copy


Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\shens\AppData\Local\Temp\jieba.cache
Loading model cost 0.715 seconds.
Prefix dict has been built successfully.


In [7]:
test_dir = './test/original-microblog'
test_data_list = []
for data in os.listdir(test_dir):
    js = json.load(open(os.path.join(test_dir, data), encoding='utf-8'))
    t = pd.json_normalize(js, sep='_')
    t['mid'] = data.split('.')[0]
    test_data_list.append(t)
test_data = pd.concat(test_data_list, axis=0, ignore_index=True)

test = test_data[['mid', 'text']]
test_repost_dir = './test/repost'
for data in tqdm(os.listdir(test_repost_dir)):
    js = json.load(open(os.path.join(test_repost_dir, data), encoding='utf-8'))
    json_df = pd.json_normalize(js)
    json_df = json_df[~json_df['text'].str.contains('转发') & (json_df['text'] != '')]
    new_text = json_df['text'].str.cat(sep=' ')
    test.loc[test['mid'] == data.split('.')[0], 'text'] += new_text

100%|██████████| 1912/1912 [00:07<00:00, 257.86it/s]


In [8]:
def process_text(text):
    text = wordopt_cn(text)
    text = ' '.join(jieba.cut(text))
    text = remove_stopwords(text)
    text = re.sub('\\s+', ' ', text)
    return text

test['text'] = test['text'].apply(process_text)

test.to_csv('./test.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['text'] = test['text'].apply(process_text)


In [9]:
LOWERCASE = False
VOCAB_SIZE = 228293

In [10]:
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

dataset = Dataset.from_pandas(test[['text']])


def train_corp_iter():
    for i in range(0, len(dataset), 1000):
        yield dataset[i: i + 1000]["text"]


raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []

for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

100%|██████████| 1912/1912 [00:06<00:00, 285.01it/s]
100%|██████████| 4226/4226 [00:14<00:00, 282.03it/s]


In [11]:
def dummy(text):
    return text


vectorizer = TfidfVectorizer(lowercase=False, sublinear_tf=True, analyzer='word',
                             tokenizer=dummy,
                             preprocessor=dummy,
                             token_pattern=None, strip_accents='unicode'
                             )

vectorizer.fit(tokenized_texts_test)

vocab = vectorizer.vocabulary_

vectorizer = TfidfVectorizer(lowercase=False, sublinear_tf=True, vocabulary=vocab,
                             analyzer='word',
                             tokenizer=dummy,
                             preprocessor=dummy,
                             token_pattern=None, strip_accents='unicode'
                             )

X_train = vectorizer.fit_transform(tokenized_texts_train)


In [14]:
print(len(vocab))

206418


In [12]:
clf = MultinomialNB(alpha=0.02)
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber")
p6 = {'n_iter': 1500, 'verbose': -1, 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.05073909898961407,
      'colsample_bytree': 0.726023996436955, 'colsample_bynode': 0.5803681307354022, 'lambda_l1': 8.562963348932286,
      'lambda_l2': 4.893256185259296, 'min_data_in_leaf': 115, 'max_depth': 23, 'max_bin': 898}
lgb = LGBMClassifier(**p6)

# Creating the ensemble model
ensemble = VotingClassifier(estimators=[
    ('mnb', clf),
    ('sgd', sgd_model),
    ('lgb', lgb)],
    weights=[0.1, 0.45, 0.45],
    voting='soft',
    n_jobs=-1)

In [13]:
Y_train = train['is_rumor'].values
ensemble.fit(X_train, Y_train)