# Paraphrasing Sentences Optimize Classification Accuracy

## Programming Language: Python

In [88]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from random import choice
import pandas as pd
import numpy as np
import re
import html
import string
from sklearn.model_selection import cross_val_score

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant

In [89]:
pd.options.mode.chained_assignment = None # deal with SettingWithCopyWarning

nltk.download('wordnet')
nltk.download('stopwords')
stopwords_nltk = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


Data = pd.read_csv("COVID19_Dataset.csv")
Data

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yurui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yurui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
555,0,,BREAKING: Harvard classes will move online sta...
556,0,,Singularity University is hosting a FREE Virtu...
557,0,,Coronavirus: how does it spread and what are t...
558,0,,Stanford just cancelled classes for the rest o...


## Preprocessing tweets 

- Remove useless words from tweets (Ex: url, digitis, punctuations, @__)
- Use <b>lemmatization</b> to returns an actual word of the language.

In [90]:
## preprocessing
Data['clean_tweet'] = Data['Tweet']

clean_text = ""
for i in range(0, Data['Tweet'].shape[0]):
    text = Data['Tweet'].iloc[i].lower() # lowercase
    text = re.sub(r'http\S+','', text) # remove url
    text = html.unescape(text) # convert XML to characters
    text = re.sub(r"[^A-z@]", " ", text) # remove digits,punctuations except @
    text = re.sub(r'@\S+','', text) # remove @__words
    text = ' '.join([token for token in text.split()])
    
   # clean stopwords
    tokenized_text = word_tokenize(text)
    tokenized_text_without_stopwords = [k for k in tokenized_text if not k in stopwords_nltk] 
   
    # lemmatization
    tokenized_text_without_stopwords_lem = [] 
    for j in range(0, len(tokenized_text_without_stopwords)):
        tokenized_text_without_stopwords_lem.append(lemmatizer.lemmatize(tokenized_text_without_stopwords[j]))
    
    tokenized_text_without_stopwords_lem = ' '.join(tokenized_text_without_stopwords_lem)
    clean_text = tokenized_text_without_stopwords_lem
    Data['clean_tweet'].iloc[i] = clean_text

## Use CNN to classify tweets reliable/ unreliable

In [91]:
BASE_DIR = os.getcwd()
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B') # source: https://nlp.stanford.edu/projects/glove/
                                               # using on embedding
MAX_SEQUENCE_LENGTH = 500
MAX_NUM_WORDS = 20000 
EMBEDDING_DIM = 100 
VALIDATION_SPLIT = 0.2 # 20% for testing 80% for training

In [92]:
from sklearn.model_selection import train_test_split

tweets = Data['clean_tweet']
labels = Data['Is_Unreliable']
labels_index = {'fake':1, 'real':0}

train_texts, test_texts, train_labels, test_labels = train_test_split(tweets, labels, train_size= 0.7, random_state=1)

In [93]:
#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer
#Tokenizer is fit on training data only, and that is used to tokenize both train and test data.
tokenizer = Tokenizer(num_words= MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts) # Converting text to a vector of word indices
                                                            # the words indices for each sentence
test_sequences = tokenizer.texts_to_sequences(test_texts)

word_index = tokenizer.word_index # words from all docs
print('Found %s unique tokens.' % len(word_index))

Found 1784 unique tokens.


In [94]:
# initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

trainvalid_labels = to_categorical(train_labels, num_classes = 2, dtype ="int32")
test_labels = to_categorical(test_labels, num_classes = 2, dtype ="int32")

# split the training data into a training set and a validation set
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)

trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])

x_train = trainvalid_data[ :-num_validation_samples] # not include num_validation_samples
y_train = trainvalid_labels[ :-num_validation_samples]

x_val = trainvalid_data[-num_validation_samples: ] # only num_validation_samples
y_val = trainvalid_labels[-num_validation_samples: ]

#This is the data we will use for CNN and RNN training
print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


In [95]:
print('Preparing embedding matrix.')
# first, build index mapping words in the embeddings set to their embedding vector
embeddings_index = {}

with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))
#print(embeddings_index["google"]) # each word has 100 dims

# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 # 13831 + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM)) # 13832(words)x 100(dim)

for word, i in word_index.items(): # key,value in dictionary.items()
    if i > MAX_NUM_WORDS:          # if word corresponding value > MAX_NUM_WORDS(20000) would ignore below function
        continue                   # do next word.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:          # words not found in embedding index will be all-zeros.
       embedding_matrix[i] = embedding_vector # every words would have own vectors.

# load these pre-trained word embeddings into an Embedding layer
# note that we set 'trainable = False' so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,     # 13832
                            EMBEDDING_DIM, # 100 (channel)
                            embeddings_initializer= Constant(embedding_matrix),
                            input_length= MAX_SEQUENCE_LENGTH, # 500 (width)
                            trainable=False) # don't need weight for words
print("Preparing of embedding matrix is done")

Preparing embedding matrix.
Found 400000 word vectors in Glove embeddings.
Preparing of embedding matrix is done


In [96]:
print('Define a 1D CNN model.')

cnnmodel = Sequential()
cnnmodel.add(embedding_layer)                   # input layer: 500x100

cnnmodel.add(Conv1D(128, 5, activation='relu')) # 128 output space; filter size=5; activation function
cnnmodel.add(MaxPooling1D(5))                   # max pool size = 5 (maximum value of window size)
cnnmodel.add(Dropout(0.25))                     # may avoid overfitting

cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Dropout(0.25))

cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(GlobalMaxPooling1D())
cnnmodel.add(Dense(128, activation='relu'))
cnnmodel.add(Dropout(0.25))

cnnmodel.add(Dense(len(labels_index), activation='softmax')) # classify labels 0 or 1, so is 2; 'softmax' get probability

cnnmodel.compile(loss='categorical_crossentropy', # loss function
                 optimizer='Adam',
                 metrics=['acc'])

Define a 1D CNN model.


In [97]:
print(x_train.shape)
print(y_train.shape)
cnn_train = cnnmodel.fit(x_train, y_train, # data, label
                         batch_size= 32,   # each time for 32 samples for training set until all samples for 1 epoch.
                         epochs = 5,  
                         verbose = 1,      # return some informations during training
                         validation_data= (x_val, y_val))

# epoch , train how many time, low: underfitting, high: overfitting

(314, 500)
(314, 2)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [98]:
score_train, acc_train = cnnmodel.evaluate(x_train, y_train)
print('Training accuracy with CNN:', acc_train)

score_test, acc_test = cnnmodel.evaluate(test_data, test_labels)
print('Testing accuracy with CNN:', acc_test)

Training accuracy with CNN: 0.9235668778419495
Testing accuracy with CNN: 0.7321428656578064


## Fit on classification models

- Use <b>TF-IDF</b> to vectorize words.
- Fit on <b>Logistic Regression</b> and <b>SVM</b> to classify tweets.

In [99]:
# step 1.
import sklearn
from sklearn.model_selection import train_test_split

X = Data['clean_tweet']
y = Data['Is_Unreliable']

In [100]:
# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(lowercase = True,
                        stop_words = 'english',
                        ngram_range = (1,1))

X_dtm = tfidf.fit_transform(X)

In [101]:
# tfidf-logistic
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression #import
from sklearn.metrics import classification_report, confusion_matrix

logreg = LogisticRegression(class_weight="balanced") #instantiate a logistic regression model
scores = cross_val_score(logreg, X_dtm, y, cv= 10) # 10 folds validation
print( "tfidf Logistic Regression Accuracy with 10-folds validation: ", round(np.mean(scores),3))

a = ("tfidf Logistic Regression Accuracy: "+ str(round(np.mean(scores),3)))

tfidf Logistic Regression Accuracy with 10-folds validation:  0.812


- Pipeline for SVC
- Automated words vetorization & SVM to get accuracy.

In [102]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

svc = SVC(probability=True)
pipe = Pipeline([
('vectorize', tfidf),
('classify', SVC())
])

In [103]:
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
# set up parameter grid
params = {
'classify__kernel': kernel,
'classify__C': C
}

In [104]:
from sklearn.model_selection import cross_validate, KFold, GridSearchCV

inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 10, shuffle = True, random_state = 1)

# Set up GridSearch for inner loop
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)

In [105]:
scores = cross_validate(grid_SVC,
                        X = X,
                        y = y,
                        cv = outer_cv,
                        scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                        return_estimator = True)

In [106]:
accuracy = scores['test_accuracy']
print(accuracy)
b = ("tfidf SVC Accuracy: "+ str(round(accuracy.mean(),3)))
print("\n"+b)

[0.83928571 0.78571429 0.80357143 0.82142857 0.875      0.78571429
 0.76785714 0.89285714 0.82142857 0.80357143]

tfidf SVC Accuracy: 0.82


## Parapharse tweets

- paraphrase 5 sentences for each train data tweet to expand dataset.
- Use nltk module.
- Check synonyms words and replace original words to create new sentence.

In [111]:
# step 2.
## paraphrase 5 sentences for each train data tweet (560*0.7 + 560*0.7*5)##

X = Data['clean_tweet']
y = Data['Is_Unreliable']

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size= 0.7, random_state=1) # 70%/ 30%

all_train_data = y_train.to_frame().join(X_train.to_frame()) # 392

new_train_data = all_train_data.append(all_train_data) # 392+392 = 784
new_train_data = new_train_data.append(new_train_data) # 784+784 = 1568
new_train_data = new_train_data.append(all_train_data) # 1568+392 = 1960
new_train_data = new_train_data.append(all_train_data) # 1960+392 = 2352

for i in range(0,len(all_train_data)):
     new_train_data.iloc[i*5] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+1] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+2] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+3] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+4] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+5] = all_train_data.iloc[i]

In [112]:
# paraphrase def #

def tag(sentence): # tag each word's part of speech
 words = word_tokenize(sentence)
 words = pos_tag(words)
 return words

def paraphraseable(tag): 
 return tag.startswith('NN') or tag == 'VB' or tag.startswith('JJ')

def pos(tag):
 if tag.startswith('NN'):
  return wn.NOUN
 elif tag.startswith('V'):
  return wn.VERB

def synonyms(word, tag):
    lemma_lists = [ss.lemmas() for ss in wn.synsets(word, pos(tag))]
    lemmas = [lemma.name() for lemma in sum(lemma_lists, [])]
    return set(lemmas)

def synonymIfExists(sentence):
 for (word, t) in tag(sentence):
   if paraphraseable(t):
    syns = synonyms(word, t)
    if syns:
     if len(syns) > 1:
      yield [word, list(syns)]
      continue
   yield [word, []]

def paraphrase(sentence):
 return [x for x in synonymIfExists(sentence)]

def generator(sentence):
    sentence = paraphrase(sentence)
    text = list(range(len(sentence)))
    for i in range(0, len(sentence)):
        if sentence[i][1] == []:
            text[i] = sentence[i][0]
        else:
            text[i] = choice(sentence[i][1])
        
    text = ' '.join(text)
    return text

In [113]:
# add paraphrasing 3 sentences for each tweet 
for i in range(0,len(all_train_data)):
    new_train_data.iloc[i*5+1,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+2,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+3,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+4,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+5,1] = generator(all_train_data.iloc[i,1])

## Try CNN again

In [114]:
new_test_data = y_test.to_frame().join(X_test.to_frame())
all_new_data = new_train_data.append(new_test_data)

tweets = all_new_data['clean_tweet']
labels = all_new_data['Is_Unreliable']
labels_index = {'fake':1, 'real':0}

train_texts, test_texts, train_labels, test_labels = train_test_split(tweets, labels, train_size= 0.7, random_state=1)

In [115]:
#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer
#Tokenizer is fit on training data only, and that is used to tokenize both train and test data.
tokenizer = Tokenizer(num_words= MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts) # Converting text to a vector of word indices
                                                            # the words indices for each sentence
test_sequences = tokenizer.texts_to_sequences(test_texts)

word_index = tokenizer.word_index # words from all docs
print('Found %s unique tokens.' % len(word_index))

Found 4279 unique tokens.


In [116]:
# initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

trainvalid_labels = to_categorical(train_labels, num_classes = 2, dtype ="int32")
test_labels = to_categorical(test_labels, num_classes = 2, dtype ="int32")

# split the training data into a training set and a validation set
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)

trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])

x_train = trainvalid_data[ :-num_validation_samples] # not include num_validation_samples
y_train = trainvalid_labels[ :-num_validation_samples]

x_val = trainvalid_data[-num_validation_samples: ] # only num_validation_samples
y_val = trainvalid_labels[-num_validation_samples: ]

#This is the data we will use for CNN and RNN training
print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


In [117]:
print('Preparing embedding matrix.')
# first, build index mapping words in the embeddings set to their embedding vector
embeddings_index = {}

with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))
#print(embeddings_index["google"]) # each word has 100 dims

# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 # 13831 + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM)) # 13832(words)x 100(dim)

for word, i in word_index.items(): # key,value in dictionary.items()
    if i > MAX_NUM_WORDS:          # if word corresponding value > MAX_NUM_WORDS(20000) would ignore below function
        continue                   # do next word.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:          # words not found in embedding index will be all-zeros.
       embedding_matrix[i] = embedding_vector # every words would have own vectors.

# load these pre-trained word embeddings into an Embedding layer
# note that we set 'trainable = False' so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,     # 13832
                            EMBEDDING_DIM, # 100 (channel)
                            embeddings_initializer= Constant(embedding_matrix),
                            input_length= MAX_SEQUENCE_LENGTH, # 500 (width)
                            trainable=False) # don't need weight for words
print("Preparing of embedding matrix is done")

Preparing embedding matrix.
Found 400000 word vectors in Glove embeddings.
Preparing of embedding matrix is done


In [118]:
print('Define a 1D CNN model.')

cnnmodel = Sequential()
cnnmodel.add(embedding_layer)                   # input layer: 500x100

cnnmodel.add(Conv1D(128, 5, activation='relu')) # 128 output space; filter size=5; activation function
cnnmodel.add(MaxPooling1D(5))                   # max pool size = 5 (maximum value of window size)
cnnmodel.add(Dropout(0.25))                     # may avoid overfitting

cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Dropout(0.25))

cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(GlobalMaxPooling1D())
cnnmodel.add(Dense(128, activation='relu'))
cnnmodel.add(Dropout(0.25))

cnnmodel.add(Dense(len(labels_index), activation='softmax')) # classify labels 0 or 1, so is 2; 'softmax' get probability

cnnmodel.compile(loss='categorical_crossentropy', # loss function
                 optimizer='Adam',
                 metrics=['acc'])

Define a 1D CNN model.


In [119]:
print(x_train.shape)
print(y_train.shape)
cnn_train = cnnmodel.fit(x_train, y_train, # data, label
                         batch_size= 32,   # each time for 32 samples for training set until all samples for 1 epoch.
                         epochs = 5,  
                         verbose = 1,      # return some informations during training
                         validation_data= (x_val, y_val))

# epoch , train how many time, low: underfitting, high: overfitting

(1412, 500)
(1412, 2)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [120]:
score_train, acc_train = cnnmodel.evaluate(x_train, y_train)
print('Training accuracy with CNN:', acc_train)

score_test, acc_test = cnnmodel.evaluate(test_data, test_labels)
print('Testing accuracy with CNN:', acc_test)

Training accuracy with CNN: 0.9674220681190491
Testing accuracy with CNN: 0.7830687761306763


- CNN classification accuracy enhance from 73.2% to 78.3%

## Test model accuracy again

- Use TF-IDF vectorize again
- Fit Logistic and SVM to check accuracy

In [15]:
# step 3.
# use new train data(1568 tweets, including original and paraphrase)

new_test_data = y_test.to_frame().join(X_test.to_frame())
all_new_data = new_train_data.append(new_test_data)

X = all_new_data['clean_tweet']
y = all_new_data['Is_Unreliable']


In [16]:
# tfidf
tfidf = TfidfVectorizer(lowercase = True,
                        stop_words = 'english',
                        ngram_range = (1,1))


X_dtm = tfidf.fit_transform(X)

In [17]:
# tfidf-logistic
logreg = LogisticRegression(class_weight="balanced") #instantiate a logistic regression model
scores = cross_val_score(logreg, X_dtm, y, cv= 10) # 10 folds validation
print( "tfidf Logistic Regression Accuracy with 10-folds validation: ", round(np.mean(scores),3))

a2 = ("After paraphrasing, tfidf Logistic Regression Accuracy: "+ str(round(np.mean(scores),3)))

tfidf Logistic Regression Accuracy with 10-folds validation:  0.839


In [18]:
# SVC
svc = SVC(probability=True)
pipe = Pipeline([
('vectorize', tfidf),
('classify', SVC())
])

In [19]:
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
# set up parameter grid
params = {
'classify__kernel': kernel,
'classify__C': C
}

In [20]:
inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 10, shuffle = True, random_state = 1)

# Set up GridSearch for inner loop
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)

In [21]:
scores = cross_validate(grid_SVC,
                        X = X,
                        y = y,
                        cv = outer_cv,
                        scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                        return_estimator = True)

In [22]:
accuracy = scores['test_accuracy']
print(accuracy)
b2 = ("After paraphrasing, tfidf SVC Accuracy: " +str(round(accuracy.mean(),3)))
print("\n"+ b2)

[0.91269841 0.86904762 0.90079365 0.92460317 0.93253968 0.92063492
 0.91666667 0.92857143 0.92063492 0.88492063]

After paraphrasing, tfidf SVC Accuracy: 0.911


In [23]:
print(a)
print(a2 + "\n")

print(b)
print(b2 + "\n")

tfidf Logistic Regression Accuracy: 0.812
After paraphrasing, tfidf Logistic Regression Accuracy: 0.839

tfidf SVC Accuracy: 0.82
After paraphrasing, tfidf SVC Accuracy: 0.911



## Result

- Indeed, our accuracy get better after we parapharse our tweets to expand dataset. Logistic from 81.2% to 83.9%; SVM from 82% to 91%. However, if we want to get more better accuracy, we probably need to create new sentence with different structure. Because I only change words on my work, the structure is still same. If we change the words and structure to create new sentences, the training model can learn more information and get more improvement.