# **Setup**

In [44]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.model_selection import train_test_split
import pandas as pd 
import re
import string
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline  import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import pickle
import joblib

**Check sklearn version**

In [45]:
# Record the installed scikit-learn version next to the results.
# NOTE(review): the pipelines dumped with joblib later in this notebook are
# presumably meant to be loaded under this same scikit-learn version — confirm.
sklearn_version = sklearn.__version__
print(sklearn_version)

0.22.2.post1


**Get data from GitHub**

In [46]:
!wget https://github.com/KTDLVB-UD-CQ2018/question_classification/blob/master/app/VietnamseQuestionDataset.xlsx?raw=true -O dataset.xlsx

--2021-04-18 10:56:51--  https://github.com/KTDLVB-UD-CQ2018/question_classification/blob/master/app/VietnamseQuestionDataset.xlsx?raw=true
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/KTDLVB-UD-CQ2018/question_classification/raw/master/app/VietnamseQuestionDataset.xlsx [following]
--2021-04-18 10:56:51--  https://github.com/KTDLVB-UD-CQ2018/question_classification/raw/master/app/VietnamseQuestionDataset.xlsx
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/KTDLVB-UD-CQ2018/question_classification/master/app/VietnamseQuestionDataset.xlsx [following]
--2021-04-18 10:56:51--  https://raw.githubusercontent.com/KTDLVB-UD-CQ2018/question_classification/master/app/VietnamseQuestionDataset.xlsx
Resolving raw.githubusercontent.com (raw.githubuserconten

In [47]:
# Load the first sheet of the downloaded workbook. The path is relative so it
# matches the `-O dataset.xlsx` target of the download cell instead of assuming
# the Colab-specific /content working directory.
# NOTE(review): skiprows=3 and usecols=[1, 2, 4] presumably skip a title banner
# and keep the three columns used below ('DATASET', 'Câu hỏi', 'LOẠI') —
# confirm against the workbook layout.
df = pd.read_excel("dataset.xlsx", sheet_name=0, header=0, skiprows=3, usecols=[1, 2, 4])

**Làm sạch dữ liệu bằng cách loại bỏ các dấu câu, chữ số, đường link, ...**

In [48]:
# Patterns removed, in this order, by Clean_Corpus. Raw string literals are used
# so regex escapes such as \[ \S \w \d are not parsed as (invalid) Python string
# escapes, and the patterns are compiled once instead of on every call.
_CLEAN_PATTERNS = tuple(re.compile(pattern) for pattern in (
  r'\[.*?\]',                               # bracketed spans, e.g. "[note]"
  r'https?://\S+|www\.\S+',                 # URLs
  r'<.*?>+',                                # HTML-like tags
  '[%s]' % re.escape(string.punctuation),   # ASCII punctuation characters
  r'\n',                                    # newline characters
  r'\w*\d\w*',                              # any token containing a digit
))


def Clean_Corpus(sent):
  """Lower-case a sentence and strip noise that should not reach the vectorizer.

  Removes, in order: text inside square brackets, URLs, HTML-like tags, ASCII
  punctuation, newline characters and every token that contains a digit.
  Matches are deleted (not replaced by a space), so surrounding whitespace is
  left as it was.

  Parameters:
    sent: the raw sentence as a str.

  Returns:
    The cleaned, lower-cased sentence.
  """
  sent = sent.lower()
  for pattern in _CLEAN_PATTERNS:
    sent = pattern.sub('', sent)
  return sent


In [49]:
# Clean every question text element-wise with Clean_Corpus.
df['Câu hỏi'] = df['Câu hỏi'].map(Clean_Corpus)

**Fix some issues with incorrect labels**

In [50]:
def normalize_categories(cat):
  """Return the canonical spelling of a question-category label.

  The three known misspellings "ENT", "Select" and "NUM.data" are mapped to
  "ENTY", "SELECT" and "NUM.date"; any other value is returned unchanged.
  """
  corrections = (
    ("ENT", "ENTY"),
    ("Select", "SELECT"),
    ("NUM.data", "NUM.date"),
  )
  for misspelled, canonical in corrections:
    if cat == misspelled:
      return canonical
  return cat

In [51]:
# Replace known label misspellings element-wise with normalize_categories.
df['LOẠI'] = df['LOẠI'].map(normalize_categories)

**Phân ra tập Train và tập Test**

In [52]:
# Split the rows by the partition label stored in the DATASET column.
train_df = df.loc[df['DATASET'].eq('TRAIN')]
test_df = df.loc[df['DATASET'].eq('TEST')]

In [53]:
# (rows, columns) of the TRAIN partition.
train_df.shape

(1837, 3)

In [54]:
# (rows, columns) of the TEST partition.
test_df.shape

(442, 3)

**Lấy ra các dữ liệu cần cho mô hình**


Trong dataframe chỉ có 2 cột 'Câu hỏi' và 'LOẠI' là hữu ích cho mô hình, nên ta chỉ lấy 2 cột này.

In [55]:
# Question text is the model input and the category is the label; both are
# pulled out of each partition as plain Python lists.
X_train, y_train = (train_df[column].to_list() for column in ('Câu hỏi', 'LOẠI'))
X_test, y_test = (test_df[column].to_list() for column in ('Câu hỏi', 'LOẠI'))

**Phân tập Train ban đầu thành 2 tập Train và Validation**

In [56]:
# Hold out 33% of the TRAIN partition as a validation set, stratified by label.
# The inputs are read from train_df rather than from X_train / y_train, so
# re-running this cell does not split an already-split training set again.
train_questions = train_df['Câu hỏi'].to_list()
train_labels = train_df['LOẠI'].to_list()
X_train, X_val, y_train, y_val = train_test_split(
    train_questions, train_labels,
    test_size=0.33, random_state=0, stratify=train_labels)

In [57]:
# Number of questions left for training after the validation hold-out.
len(X_train)

1230

In [58]:
# Number of questions in the validation set.
len(X_val)

607

**Create TF-IDF transformer**

In [59]:
# Single TfidfTransformer instance that the pipelines in the cells below are
# built around. Because the same object is passed to each pipeline, it is refit
# every time one of those pipelines is fit.
tfidf = TfidfTransformer()

# **Huấn luyện theo mô hình Naive Bayes**

**Thử nghiệm chọn ra mô hình tốt nhất**

In [60]:
# Validation-set grid search for Multinomial Naive Bayes over the smoothing
# strength (alpha), whether class priors are learned (fit_prior) and the
# n-gram range of the bag-of-words features.
li_fit_prior = [True, False]
li_alpha = [0.9,  0.99, 1.0, 1.1, 1.11, 1.111, 1.2, 1.21]
li_ngram = [(1,1), (1,2), (1,3), (2,2), (2,3), (3,3)]

best_alpha = 0
best_fit_prior = False
best_ngram = (0,0)
best_score = 0.0

for fit_prior in li_fit_prior:
  for alpha in li_alpha:
    for ngram in li_ngram:
      # Fitting the full pipeline also fits the vectorizer and TF-IDF steps, so
      # no separate fit_transform/transform pass over the data is needed.
      vect = CountVectorizer(ngram_range=ngram)
      pipe = make_pipeline(vect, tfidf)
      nb = MultinomialNB(alpha = alpha, fit_prior = fit_prior)
      full_pipeline = make_pipeline(pipe, nb)
      full_pipeline.fit(X_train, y_train)
      y_pred_class = full_pipeline.predict(X_val)
      score = metrics.accuracy_score(y_val, y_pred_class)
      # Strict '>' keeps the first combination that reaches the best accuracy.
      if score > best_score:
        best_alpha = alpha
        best_fit_prior = fit_prior
        best_ngram = ngram
        best_score = score

In [61]:
# Show the selected hyper-parameters and their validation accuracy, one per line.
print(best_alpha, best_fit_prior, best_ngram, best_score, sep='\n')

1.1
False
(1, 3)
0.728171334431631


**Huấn luyện theo mô hình tốt nhất vừa tìm được**


In [62]:
# Assemble the Naive Bayes pipeline with the hyper-parameters selected on the
# validation set. Nothing is fitted here: the whole pipeline is fitted
# end-to-end by full_pipeline.fit, so the former fit_transform/transform calls,
# whose results were discarded, have been dropped.
nb = MultinomialNB(alpha = best_alpha, fit_prior = best_fit_prior)
vect = CountVectorizer(ngram_range=best_ngram)
pipe = make_pipeline(vect, tfidf)
full_pipeline = make_pipeline(pipe, nb)

**Thực hiện dự đoán trên tập Test**

In [63]:
# Fit on the training split, then report accuracy on the held-out TEST partition.
y_pred_class = full_pipeline.fit(X_train, y_train).predict(X_test)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class), sep='\n')


Accuracy: 
0.7239819004524887


**Dump models**

In [64]:
# Persist the fitted Naive Bayes pipeline (feature extraction + classifier) to disk.
joblib.dump(full_pipeline, "pipeline_nb.joblib")

['pipeline_nb.joblib']

# **Huấn luyện theo mô hình Logistic Regression**

**Thử nghiệm chọn ra mô hình tốt nhất**

In [65]:
# Validation-set grid search for Logistic Regression over the inverse
# regularisation strength (C) and the n-gram range of the features.
li_C = [10, 20, 30, 40, 50, 60, 70, 80]
li_ngram = [(1,1), (1,2), (1,3), (2,2), (2,3)]

best_C = 0
best_ngram = (0,0)
best_score = 0.0

for C in li_C:
  for ngram in li_ngram:
    # Fitting the full pipeline also fits the vectorizer and TF-IDF steps, so
    # no separate fit_transform/transform pass over the data is needed.
    vect = CountVectorizer(ngram_range=ngram)
    pipe = make_pipeline(vect, tfidf)
    logistic_model = LogisticRegression(C=C, class_weight='balanced', multi_class='ovr')
    full_pipeline = make_pipeline(pipe, logistic_model)
    full_pipeline.fit(X_train, y_train)
    y_pred_class = full_pipeline.predict(X_val)
    score = metrics.accuracy_score(y_val, y_pred_class)
    # Strict '>' keeps the first combination that reaches the best accuracy.
    if score > best_score:
      best_ngram = ngram
      best_C = C
      best_score = score

In [66]:
# Show the selected n-gram range, C and validation accuracy, one per line.
print(best_ngram, best_C, best_score, sep='\n')

(1, 2)
70
0.7957166392092258


**Huấn luyện theo mô hình tốt nhất vừa tìm được**

In [67]:
# Assemble the Logistic Regression pipeline with the hyper-parameters selected
# on the validation set. Nothing is fitted here: the whole pipeline is fitted
# end-to-end by full_pipeline.fit, so the former fit_transform/transform calls,
# whose results were discarded, have been dropped.
logistic_model = LogisticRegression(C=best_C, class_weight='balanced', multi_class='ovr')
vect = CountVectorizer(ngram_range=best_ngram)
pipe = make_pipeline(vect, tfidf)
full_pipeline = make_pipeline(pipe, logistic_model)

**Thực hiện dự đoán kết quả của tập Test**

In [68]:
# Fit on the training split, then report accuracy on the held-out TEST partition.
y_pred_class = full_pipeline.fit(X_train, y_train).predict(X_test)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class), sep='\n')


Accuracy: 
0.8031674208144797


**Dump models**

In [69]:
# Persist the fitted Logistic Regression pipeline (feature extraction + classifier) to disk.
joblib.dump(full_pipeline, "pipeline_logreg.joblib")

['pipeline_logreg.joblib']

# **Huấn luyện theo mô hình Support Vector Machine**

**Thử nghiệm chọn ra mô hình tốt nhất**

In [70]:
# Validation-set grid search for a linear-kernel SVM over the penalty
# parameter (C) and the n-gram range of the features.
svm_C = [1, 10, 20, 30, 40, 50, 60, 70, 80]
svm_ngram = [(1,1), (1,2), (1,3), (2,2), (2,3)]

best_C = 0
best_ngram = (0,0)
best_score = 0.0

for C in svm_C:
  for ngram in svm_ngram:
    # Fitting the full pipeline also fits the vectorizer and TF-IDF steps, so
    # no separate fit_transform/transform pass over the data is needed.
    vect = CountVectorizer(ngram_range=ngram)
    pipe = make_pipeline(vect, tfidf)
    svm_model = svm.SVC(C=C, kernel='linear', degree=3, gamma='auto')
    full_pipeline = make_pipeline(pipe, svm_model)
    full_pipeline.fit(X_train, y_train)
    y_pred_class = full_pipeline.predict(X_val)
    score = metrics.accuracy_score(y_val, y_pred_class)
    # Strict '>' keeps the first combination that reaches the best accuracy.
    if score > best_score:
      best_ngram = ngram
      best_C = C
      best_score = score

In [71]:
# Show the selected n-gram range, C and validation accuracy, one per line.
print(best_ngram, best_C, best_score, sep='\n')

(1, 2)
10
0.7891268533772653


**Huấn luyện theo mô hình tốt nhất vừa tìm được**

In [72]:
# Assemble the SVM pipeline with the hyper-parameters selected on the
# validation set; probability=True additionally enables predict_proba on the
# final model. Nothing is fitted here: the whole pipeline is fitted end-to-end
# by full_pipeline.fit, so the former fit_transform/transform calls, whose
# results were discarded, have been dropped.
svm_model = svm.SVC(C=best_C, kernel='linear', degree=3, gamma='auto', probability=True)
vect = CountVectorizer(ngram_range=best_ngram)
pipe = make_pipeline(vect, tfidf)
full_pipeline = make_pipeline(pipe, svm_model)

**Thực hiện dự đoán kết quả của tập Test**

In [73]:
# Fit on the training split, then report accuracy on the held-out TEST partition.
y_pred_class = full_pipeline.fit(X_train, y_train).predict(X_test)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class), sep='\n')


Accuracy: 
0.7941176470588235


In [87]:
# Inspect the final estimator (second step) of the current full_pipeline.
# NOTE(review): this cell's execution count (87) is out of sequence and its saved
# output shows a KNeighborsClassifier, so it appears to have been run after the
# k-NN section; a top-to-bottom run would presumably show the SVC built above.
full_pipeline.steps[1][1]

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

**Dump models**

In [74]:
# Persist the fitted SVM pipeline (feature extraction + classifier) to disk.
joblib.dump(full_pipeline, "pipeline_svm.joblib")

['pipeline_svm.joblib']

# **Huấn luyện theo mô hình k-nearest neighbor**

**Thử nghiệm chọn ra mô hình tốt nhất**

In [75]:
# Validation-set grid search for k-nearest neighbours over the number of
# neighbours and the n-gram range of the features.
knn_neighbors = list(range(1,10))
knn_ngram = [(1,1), (1,2), (1,3), (2,2), (2,3)]

best_n = 0
best_ngram = (0,0)
best_score = 0.0

for n_neighbors in knn_neighbors:
  # Iterate this section's own knn_ngram grid rather than li_ngram, which is
  # only defined by the Logistic Regression section's cell.
  for ngram in knn_ngram:
    # Fitting the full pipeline also fits the vectorizer and TF-IDF steps, so
    # no separate fit_transform/transform pass over the data is needed.
    vect = CountVectorizer(ngram_range=ngram)
    pipe = make_pipeline(vect, tfidf)
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    full_pipeline = make_pipeline(pipe, knn_model)
    full_pipeline.fit(X_train, y_train)
    y_pred_class = full_pipeline.predict(X_val)
    score = metrics.accuracy_score(y_val, y_pred_class)
    # Strict '>' keeps the first combination that reaches the best accuracy.
    if score > best_score:
      best_ngram = ngram
      best_n = n_neighbors
      best_score = score

In [76]:
# Show the selected n-gram range, neighbour count and validation accuracy, one per line.
print(best_ngram, best_n, best_score, sep='\n')

(1, 2)
9
0.6738056013179572


**Huấn luyện theo mô hình tốt nhất vừa tìm được**

In [77]:
# Assemble the k-NN pipeline with the hyper-parameters selected on the
# validation set. best_n is used here; the leftover loop variable n_neighbors
# always holds the last grid value, not the selected one.
# Nothing is fitted here: the whole pipeline is fitted end-to-end by
# full_pipeline.fit, so the former fit_transform/transform calls, whose results
# were discarded, have been dropped.
knn_model = KNeighborsClassifier(n_neighbors=best_n)
vect = CountVectorizer(ngram_range=best_ngram)
pipe = make_pipeline(vect, tfidf)
full_pipeline = make_pipeline(pipe, knn_model)

**Thực hiện dự đoán kết quả của tập Test**

In [78]:
# Fit on the training split, then report accuracy on the held-out TEST partition.
y_pred_class = full_pipeline.fit(X_train, y_train).predict(X_test)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class), sep='\n')


Accuracy: 
0.6561085972850679


**Dump models**

In [79]:
# Persist the fitted k-NN pipeline (feature extraction + classifier) to disk.
joblib.dump(full_pipeline, "pipeline_knn.joblib")

['pipeline_knn.joblib']