In [153]:
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

In [154]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

In [155]:
def read_file(filename):
    with open(filename,'r') as f:
        data = f.read()
        data = json.loads(data)
    f.close()

    data_set = []
    data_label = []
    for i,t in data.items():
        data_set.append(t['text'])
        data_label.append(t['label'])
    
    return data_set, data_label

train_set, train_label = read_file('train_total_balance_new.json')
dev_set, dev_label = read_file('dev.json')

    
with open('test-unlabelled.json','r') as f:
    test = f.read()
    test = json.loads(test)
    
test_set = []
for i,t in test.items():
    test_set.append(t['text'])

In [156]:
processed_train_set = [re.sub(r'http\S+', '', s) for s in train_set]
processed_dev_set = [re.sub(r'http\S+', '', s) for s in dev_set]

In [216]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('\w+|\w+-\w+')

def my_tokenize(s):
#     tokenizer = TreebankWordTokenizer()
#     tokens = nltk.word_tokenize(s)
    tokens = tokenizer.tokenize(s)
    lemma_words=[lemmatizer.lemmatize(w) for w in tokens]
    return tokens

# max_features=200000 max_df=0.95,min_df=2
# analyzer string, {‘word’, ‘char’, ‘char_wb’} or callable
vectorizer = CountVectorizer(tokenizer=my_tokenize,lowercase=True,max_df=0.95,stop_words='english',ngram_range=(1,1)) 
x_train = vectorizer.fit_transform(processed_train_set)
x_train

<3400x45039 sparse matrix of type '<class 'numpy.int64'>'
	with 799208 stored elements in Compressed Sparse Row format>

In [217]:
# textttt = train_set[18]
# tokens = tokenizer.tokenize(textttt)
# print(textttt)
# print('================')
# rem_stop_words  = [w for w in tokens if len(w)>2 if not w in stopwords.words('english')]
# lemma_words=[lemmatizer.lemmatize(w) for w in rem_stop_words]
# print(lemma_words)

In [218]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train)
x_train_tfidf.shape

(3400, 45039)

In [219]:
vectorizer_dev = CountVectorizer(tokenizer=my_tokenize,lowercase=True,stop_words='english',
                                 vocabulary=vectorizer.vocabulary_)
x_dev = vectorizer_dev.fit_transform(processed_dev_set)
x_dev_tfidf = tfidf_transformer.fit_transform(x_dev)
print(x_dev_tfidf.shape)

(100, 45039)


In [222]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from tqdm import tqdm

grid_linear = {'penalty':['l2'],'C':[1,10,100,1000],'max_iter':[1000]}
clf = svm.LinearSVC(random_state=90042, tol=1e-4)

for z in tqdm(ParameterGrid(grid_linear)):
    clf.set_params(**z)
    clf.fit(x_train_tfidf, train_label)
    result = clf.predict(x_dev_tfidf)
    p, r, f, _ = precision_recall_fscore_support(dev_label, result, pos_label=1, average="binary")
    print("===============================")
    print("pre:",p," recall:",r," f1:",f)
    print(z)


  0%|          | 0/4 [00:00<?, ?it/s][A
 50%|█████     | 2/4 [00:00<00:00,  6.93it/s][A

pre: 0.7777777777777778  recall: 0.84  f1: 0.8076923076923077
{'C': 1, 'max_iter': 1000, 'penalty': 'l2'}
pre: 0.7678571428571429  recall: 0.86  f1: 0.8113207547169812
{'C': 10, 'max_iter': 1000, 'penalty': 'l2'}



 75%|███████▌  | 3/4 [00:01<00:00,  2.83it/s][A

pre: 0.7818181818181819  recall: 0.86  f1: 0.819047619047619
{'C': 100, 'max_iter': 1000, 'penalty': 'l2'}



100%|██████████| 4/4 [00:01<00:00,  2.09it/s][A

pre: 0.7818181818181819  recall: 0.86  f1: 0.819047619047619
{'C': 1000, 'max_iter': 1000, 'penalty': 'l2'}


In [None]:
C = [0.1,1,10,100,1000,10000]
gamma = ['scale',1e-4,1e-3,1e-2,1e-1,1,10]
grid_svc = {'C':C,'gamma':gamma}
clf1 = svm.SVC()
f1_lists = []
for z in tqdm(ParameterGrid(grid_svc)):
    clf1.set_params(**z)
    clf1.fit(x_train_tfidf, train_label)
    result1 = clf1.predict(x_dev_tfidf)
    p, r, f, _ = precision_recall_fscore_support(dev_label, result1, pos_label=1, average="binary")
    f1_lists.append(f)
    print("===============================")
    print("pre:",p," recall:",r," f1:",f)
    print(z)


  0%|          | 0/42 [00:00<?, ?it/s][A
  2%|▏         | 1/42 [00:17<12:13, 17.89s/it][A

pre: 0.8  recall: 0.64  f1: 0.7111111111111111
{'C': 0.1, 'gamma': 'scale'}


  'precision', 'predicted', average, warn_for)

  5%|▍         | 2/42 [00:33<11:29, 17.25s/it][A

pre: 0.0  recall: 0.0  f1: 0.0
{'C': 0.1, 'gamma': 0.0001}



  7%|▋         | 3/42 [00:49<10:53, 16.75s/it][A

pre: 0.0  recall: 0.0  f1: 0.0
{'C': 0.1, 'gamma': 0.001}


### Results
----------
Linear SVC
* pre: 0.8260869565217391  recall: 0.76  f1: 0.7916666666666667
{'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 0}    
----------
SVC
* pre: 0.918918918918919  recall: 0.68  f1: 0.7816091954022989
{'C': 0.1, 'gamma': 'scale', 'max_iter': 1000, 'random_state': 90042}
* pre: 0.8085106382978723  recall: 0.76  f1: 0.7835051546391754
{'C': 1, 'gamma': 'scale', 'max_iter': 1000, 'random_state': 90042}
* pre: 0.8260869565217391  recall: 0.76  f1: 0.7916666666666667
{'C': 5, 'gamma': 0.1, 'max_iter': 1000, 'random_state': 90042}
* pre: 0.8125  recall: 0.78  f1: 0.7959183673469388
{'C': 100, 'gamma': 0.1, 'max_iter': 1000, 'random_state': 90042}

* pre: 0.8125  recall: 0.78  f1: 0.7959183673469388
{'C': 1, 'gamma': 'scale'}   ---- max_df=0.95 min_df=5 

------------------
#### balance new
-------
Linear
pre: 0.7884615384615384  recall: 0.82  f1: 0.803921568627451
{'C': 1, 'max_iter': 1000, 'penalty': 'l2'}
pre: 0.7818181818181819  recall: 0.86  f1: 0.819047619047619
{'C': 100, 'max_iter': 1000, 'penalty': 'l2'}     

-------
SVC
pre: 0.7818181818181819  recall: 0.86  f1: 0.819047619047619
{'C': 100, 'gamma': 0.01}

In [209]:
error = 0
for i in range(len(result)):
    if dev_label[i] != result[i]:
        error+=1
        print("NO.{}:".format(i),"gt:",dev_label[i],"p:",int(result[i]))
print(error)

NO.3: gt: 0 p: 1
NO.15: gt: 0 p: 1
NO.16: gt: 0 p: 1
NO.18: gt: 0 p: 1
NO.19: gt: 0 p: 1
NO.23: gt: 1 p: 0
NO.29: gt: 1 p: 0
NO.31: gt: 1 p: 0
NO.36: gt: 0 p: 1
NO.41: gt: 0 p: 1
NO.42: gt: 1 p: 0
NO.63: gt: 0 p: 1
NO.72: gt: 1 p: 0
NO.75: gt: 1 p: 0
NO.78: gt: 0 p: 1
NO.87: gt: 0 p: 1
NO.93: gt: 0 p: 1
NO.96: gt: 0 p: 1
NO.97: gt: 1 p: 0
19


In [208]:
error = 0
for i in range(len(result1)):
    if dev_label[i] != result1[i]:
        error+=1
        print("NO.{}:".format(i),"gt:",dev_label[i],"p:",int(result1[i]))
print(error)

NO.0: gt: 1 p: 0
NO.1: gt: 1 p: 0
NO.2: gt: 1 p: 0
NO.4: gt: 1 p: 0
NO.5: gt: 1 p: 0
NO.6: gt: 1 p: 0
NO.7: gt: 1 p: 0
NO.9: gt: 1 p: 0
NO.10: gt: 1 p: 0
NO.11: gt: 1 p: 0
NO.12: gt: 1 p: 0
NO.13: gt: 1 p: 0
NO.17: gt: 1 p: 0
NO.23: gt: 1 p: 0
NO.26: gt: 1 p: 0
NO.29: gt: 1 p: 0
NO.31: gt: 1 p: 0
NO.32: gt: 1 p: 0
NO.35: gt: 1 p: 0
NO.37: gt: 1 p: 0
NO.38: gt: 1 p: 0
NO.39: gt: 1 p: 0
NO.40: gt: 1 p: 0
NO.42: gt: 1 p: 0
NO.43: gt: 1 p: 0
NO.44: gt: 1 p: 0
NO.49: gt: 1 p: 0
NO.60: gt: 1 p: 0
NO.61: gt: 1 p: 0
NO.62: gt: 1 p: 0
NO.64: gt: 1 p: 0
NO.65: gt: 1 p: 0
NO.66: gt: 1 p: 0
NO.72: gt: 1 p: 0
NO.73: gt: 1 p: 0
NO.75: gt: 1 p: 0
NO.76: gt: 1 p: 0
NO.77: gt: 1 p: 0
NO.79: gt: 1 p: 0
NO.81: gt: 1 p: 0
NO.82: gt: 1 p: 0
NO.84: gt: 1 p: 0
NO.86: gt: 1 p: 0
NO.88: gt: 1 p: 0
NO.89: gt: 1 p: 0
NO.90: gt: 1 p: 0
NO.91: gt: 1 p: 0
NO.94: gt: 1 p: 0
NO.97: gt: 1 p: 0
NO.99: gt: 1 p: 0
50


In [34]:
# predict on test and give an output
processed_test_set = [re.sub(r'http\S+', '', s) for s in test_set]
vectorizer_test = CountVectorizer(tokenizer=my_tokenize,lowercase=True,stop_words='english',
                                 vocabulary=vectorizer.vocabulary_)
x_test = vectorizer_test.fit_transform(processed_test_set)
# x_dev.shape
x_test_tfidf = tfidf_transformer.fit_transform(x_test)
print(x_test_tfidf.shape)

(1410, 45913)


In [36]:
result_test = clf.predict(x_test_tfidf)
print(result_test)
result_dict_test = {}
for i in range(len(result_test)):
    key = "test-{}".format(i)
    result_dict_test[key] = {"label":int(result_test[i])}

with open("test-output.json","w") as f:
    json.dump(result_dict_test,f)
    print("finish")

[1 0 0 ... 1 1 0]
finish


In [None]:
# from sklearn.decomposition import TruncatedSVD
# lsa = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
# SVD_x_train = lsa.fit_transform(x_train_tfidf)
# SVD_x_dev = lsa.fit_transform(x_dev_tfidf)
# SVD_x_train.shape