In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import pickle

In [14]:
def write_pickle(object, file_name):
    with open(file_name, "wb") as handle:
        pickle.dump(object, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load raw data

In [2]:
train_set_1 = pd.read_csv('train_set_1.csv', dtype={'id': str})
train_set_2 = pd.read_csv('train_set_2.csv', dtype={'id': str})
test_set = pd.read_csv('test_set.csv', dtype={'id': str})

In [3]:
train_set_1[train_set_1['text'].isna()]

Unnamed: 0,label,id,text,text_non_hastag


In [4]:
train_set_2[train_set_2['text'].isna()]

Unnamed: 0,label,id,text,text_non_hastag


In [5]:
test_set[test_set['text'].isna()]

Unnamed: 0,label,id,text,text_non_hastag


In [6]:
# df_test = df_test.dropna(subset=['text'])

In [7]:
test_set.head()

Unnamed: 0,label,id,text,text_non_hastag
0,1,7070502982655216923,p2 nhé mọi ngườichinhtri xuhuong tỏchấtriêng c...,p2 nhé mọi ngườichinhtri xuhuong tỏchấtriêng c...
1,0,7005431713895566618,capcut cầulông backhand ycl sanghoang59,capcut cầulông backhand ycl sanghoang59
2,1,7048453986843675906,capcut nguyenxuanphuc lêloi,capcut nguyenxuanphuc lêloi
3,0,6863250722155138309,electric love sorry for the subpar audio quali...,electric love sorry for the subpar audio quali...
4,0,7042150256489925890,tìm đất chính chủ của dân thuanchubds xuhuong ...,tìm đất chính chủ của dân thuanchubds xuhuong ...


# Preprocessing

In [8]:
import re
from underthesea import word_tokenize
# vncorenlp
# from pyvi import ViTokenizer

def is_number(text):
    try:
        float(text)
        return True
    except ValueError:
        return False

def preprocess_text(text): 
    try:
        text = re.sub(r'<[^>]*>', '', text)
        text = text.lower()
        text = word_tokenize(text, format="text")
        text = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_\[\]]',' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = ' '.join(map(lambda x: '<number>' if is_number(x) else x, text.split()))
        return text
    except:
        print(text)
        return text

In [9]:
train_set_1['preprocessed_text'] = train_set_1['text'].apply(preprocess_text)
train_set_2['preprocessed_text'] = train_set_2['text'].apply(preprocess_text)
test_set['preprocessed_text'] = test_set['text'].apply(preprocess_text)

train_set_1['preprocessed_text_non_hastag'] = train_set_1['text_non_hastag'].apply(preprocess_text)
train_set_2['preprocessed_text_non_hastag'] = train_set_2['text_non_hastag'].apply(preprocess_text)
test_set['preprocessed_text_non_hastag'] = test_set['text_non_hastag'].apply(preprocess_text)

In [10]:
# test_set

# Featurize (tf-idf)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/0.24/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=tfidf#sklearn.feature_extraction.text.TfidfVectorizer

In [12]:
corpus = pd.concat([train_set_1['preprocessed_text'], train_set_2['preprocessed_text'], test_set['preprocessed_text']])

In [26]:
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

In [17]:
# write_pickle(vectorizer, 'tfidfVectorizer')

In [18]:
x_train_1 = vectorizer.transform(train_set_1['preprocessed_text'])
y_train_1 = train_set_1['label']

x_train_2 = vectorizer.transform(train_set_2['preprocessed_text'])
y_train_2 = train_set_2['label']

x_test = vectorizer.transform(test_set['preprocessed_text'])
y_test = test_set['label']

## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
clf = LogisticRegression(random_state=0)
clf.fit(x_train_1, y_train_1)

In [21]:
# write_pickle(clf, 'text_clf')

## Evaluation

In [79]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
# from utils import calculate_accuracy, calculate_f1_score

### sklearn model

In [80]:
y_prediction = clf.predict(x_train_2)

In [81]:
accuracy_score(y_train_2, y_prediction)

0.9733376164471571

In [82]:
f1_score(y_train_2, y_prediction)

0.9664375252729479

In [83]:
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# confusion_matrix(y_test, y_pred).ravel()

In [84]:
y_prediction_test = clf.predict(x_test)

In [85]:
accuracy_score(y_test, y_prediction_test)

0.9715355805243445

In [86]:
f1_score(y_test, y_prediction_test)

0.9641509433962264

In [22]:
y_predict_score = clf.predict_proba(x_train_2)

In [97]:
import numpy
col_names = np.array(['text_label_0', 'text_label_1'])
submission_results = pd.DataFrame(y_predict_score, columns = col_names)
submission_results.insert(0, 'label', train_set_2['label'].astype(numpy.int8))
submission_results.insert(0, 'id', train_set_2['id'].astype(str))
submission_results.insert(0, 'text', train_set_2['text'])
# submission_results.insert(0, 'predict', y_pred)

In [98]:
submission_results

Unnamed: 0,text,id,label,text_label_0,text_label_1
0,việt nam vô địch bóngđá dokimphuc football tik...,6746395734930640129,0,0.952365,0.047635
1,người mang trọng tội với tổ tiên hồng quân liê...,7071073830848023834,1,0.215492,0.784508
2,tài sản thì cứ dấu dấu diếm diếm phải công kha...,6954907710366387458,1,0.249647,0.750353
3,lý do thủ môn không tin pepe pepe football bon...,7066615161271881006,0,0.935591,0.064409
4,part ii ạxuhuong volleyball bongchuyen,6925795138379713794,0,0.859645,0.140355
...,...,...,...,...,...
3108,2021 nhìn lại một năm đầy biến động nhinlai202...,7058856419461795073,1,0.136569,0.863431
3109,trả lời lng1606 lên luôn cho nóng nè unbox unb...,6974710162594548993,0,0.934135,0.065865
3110,mua cái áo mà được tặng quá trời luônxuhuong l...,6992891507560713498,0,0.844018,0.155982
3111,khai trừ đảng bộ trưởng nguyễn thanh long và c...,7106113028952755482,1,0.024934,0.975066


In [99]:
submission_results.to_csv('train_set_2_text_result.csv', index=False)

In [28]:
# submission_results.to_csv('text_prediction_result.csv', index=False)

In [87]:
# submission_results[(submission_results['predict'] == 0) & (submission_results['text_label_1'] < 0.5)]

In [43]:
preprocessed_text = preprocess_text('nguyen phu trong ha ha')
feature_vector = vectorizer.transform([preprocessed_text])
a = clf.predict_proba(feature_vector)[:,1][0]

In [44]:
a

0.4094707736862613

In [45]:
type(a)

numpy.float64