In [259]:
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

In [260]:
# register preprocessing class
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

In [261]:
# read data set(train,dev,test)
with open('train.json','r') as f:
    text = f.read()
    text = json.loads(text)
f.close()

train_set = []
for i,t in text.items():
    train_set.append(t['text'])
    
with open('dev.json','r') as f:
    dev = f.read()
    dev = json.loads(dev)
f.close()

dev_set = []
dev_label = []
for i,t in dev.items():
    dev_set.append(t['text'])
    dev_label.append(t['label'])

with open('test-unlabelled.json','r') as f:
    test = f.read()
    test = json.loads(test)
    
test_set = []
for i,t in test.items():
    test_set.append(t['text'])

In [265]:
# TF-IDF
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('\w+|\w+-\w+')

def my_tokenize(s):
    tokens = tokenizer.tokenize(s)
    lemma_words=[lemmatizer.lemmatize(w) for w in tokens]
    return tokens

processed_train_set = [re.sub(r'http\S+', '', s) for s in train_set]
vectorizer = CountVectorizer(tokenizer=my_tokenize,lowercase=True,stop_words='english',ngram_range=(1,1)) 
x_train = vectorizer.fit_transform(processed_train_set)
x_train

<1168x23859 sparse matrix of type '<class 'numpy.int64'>'
	with 242075 stored elements in Compressed Sparse Row format>

In [304]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train)
x_train_tfidf.shape

(1168, 23859)

In [244]:
# generate dev set
processed_dev_set = [re.sub(r'http\S+', '', s) for s in dev_set]
vectorizer_dev = CountVectorizer(tokenizer=my_tokenize,lowercase=True,stop_words='english',
                                 vocabulary=vectorizer.vocabulary_)
x_dev = vectorizer_dev.fit_transform(processed_dev_set)
x_dev_tfidf = tfidf_transformer.fit_transform(x_dev)
print(x_dev_tfidf.shape)

(100, 23859)


In [253]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from tqdm import tqdm

# grid search
# grid = {'gamma' :[1e-6,1e-5,0.0001,0.001,0.01,0.1],
#         'nu' : np.linspace(0.40, 0.80, 40)}
grid = {'gamma' : [0.0001],
        'nu' : [0.47]}
clf = svm.OneClassSVM(kernel="rbf")
#kernel：核函数（一般使用高斯核）

　　#nu：设定训练误差(0, 1]，表示异常点比例，默认值为0.5
    # gamma 惩罚项
result_dict = {}
count = 0
f1_max = []
for z in tqdm(ParameterGrid(grid)):
    clf.set_params(**z)
    clf.fit(x_train_tfidf)
    result = clf.predict(x_dev_tfidf)
    result = [i if i==1 else 0 for i in result]
    p, r, f, _ = precision_recall_fscore_support(dev_label, result, pos_label=1, average="binary")
    keys = str(count)
    count+=1
    result_dict[keys] = {"para":z,"p":p,"r":r,"f1":f}
    
    f1_max.append(f)
    
max_f = np.max(f1_max)
print("max_f1:",max_f)
a = f1_max.index(max_f)
#

100%|██████████| 1/1 [00:01<00:00,  1.27s/it]

max_f1: 0.7401574803149605





In [50]:
# write result into json file
result_dict = {}
for i in range(len(result)):
    key = "dev-{}".format(i)
    result_dict[key] = {"label":int(result[i])}

import pickle
with open("dev-predict.json","w") as f:
    json.dump(result_dict,f)
    print("finish")

<class 'dict'>
finish


In [75]:
# predict on test and give an output
processed_test_set = [re.sub(r'http\S+', '', s) for s in test_set]
vectorizer_test = CountVectorizer(tokenizer=my_tokenize,lowercase=True,stop_words='english',
                                 vocabulary=vectorizer.vocabulary_)
x_test = vectorizer_test.fit_transform(processed_test_set)
x_test_tfidf = tfidf_transformer.fit_transform(x_test)
print(x_test_tfidf.shape)

result_test = clf.predict(x_test_tfidf)
result_test = [i if i==1 else 0 for i in result_test]

result_dict_test = {}
for i in range(len(result_test)):
    key = "test-{}".format(i)
    result_dict_test[key] = {"label":int(result_test[i])}

with open("test-output.json","w") as f:
    json.dump(result_dict_test,f)
    print("finish")

(1410, 21179)
finish
