# Paraphrasing Sentences Optimize Classification Accuracy

## Programming Language: Python

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from random import choice
import pandas as pd
import numpy as np
import re
import html
import string
from sklearn.model_selection import cross_val_score

In [2]:
pd.options.mode.chained_assignment = None # deal with SettingWithCopyWarning

nltk.download('wordnet')
nltk.download('stopwords')
stopwords_nltk = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


Data = pd.read_csv("COVID19_Dataset.csv")
Data

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yurui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yurui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
555,0,,BREAKING: Harvard classes will move online sta...
556,0,,Singularity University is hosting a FREE Virtu...
557,0,,Coronavirus: how does it spread and what are t...
558,0,,Stanford just cancelled classes for the rest o...


## Preprocessing tweets 

- Remove useless words from tweets (Ex: url, digitis, punctuations, @__)
- Use <b>lemmatization</b> to returns an actual word of the language.

In [3]:
## preprocessing
Data['clean_tweet'] = Data['Tweet']

clean_text = ""
for i in range(0, Data['Tweet'].shape[0]):
    text = Data['Tweet'].iloc[i].lower() # lowercase
    text = re.sub(r'http\S+','', text) # remove url
    text = html.unescape(text) # convert XML to characters
    text = re.sub(r"[^A-z@]", " ", text) # remove digits,punctuations except @
    text = re.sub(r'@\S+','', text) # remove @__words
    text = ' '.join([token for token in text.split()])
    
   # clean stopwords
    tokenized_text = word_tokenize(text)
    tokenized_text_without_stopwords = [k for k in tokenized_text if not k in stopwords_nltk] 
   
    # lemmatization
    tokenized_text_without_stopwords_lem = [] 
    for j in range(0, len(tokenized_text_without_stopwords)):
        tokenized_text_without_stopwords_lem.append(lemmatizer.lemmatize(tokenized_text_without_stopwords[j]))
    
    tokenized_text_without_stopwords_lem = ' '.join(tokenized_text_without_stopwords_lem)
    clean_text = tokenized_text_without_stopwords_lem
    Data['clean_tweet'].iloc[i] = clean_text

## Fit on classification models

- Use <b>TF-IDF</b> to vectorize words.
- Fit on <b>Logistic Regression</b> and <b>SVM</b> to classify tweets.

In [4]:
# step 1.
import sklearn
from sklearn.model_selection import train_test_split

X = Data['clean_tweet']
y = Data['Is_Unreliable']

In [5]:
# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(lowercase = True,
                        stop_words = 'english',
                        ngram_range = (1,1))

X_dtm = tfidf.fit_transform(X)

In [6]:
# tfidf-logistic
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression #import
from sklearn.metrics import classification_report, confusion_matrix

logreg = LogisticRegression(class_weight="balanced") #instantiate a logistic regression model
scores = cross_val_score(logreg, X_dtm, y, cv= 10) # 10 folds validation
print( "tfidf Logistic Regression Accuracy with 10-folds validation: ", round(np.mean(scores),3))

a = ("tfidf Logistic Regression Accuracy: "+ str(round(np.mean(scores),3)))

tfidf Logistic Regression Accuracy with 10-folds validation:  0.812


- Pipeline for SVC
- Automated words vetorization & SVM to get accuracy.

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

svc = SVC(probability=True)
pipe = Pipeline([
('vectorize', tfidf),
('classify', SVC())
])

In [8]:
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
# set up parameter grid
params = {
'classify__kernel': kernel,
'classify__C': C
}

In [9]:
from sklearn.model_selection import cross_validate, KFold, GridSearchCV

inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 10, shuffle = True, random_state = 1)

# Set up GridSearch for inner loop
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)

In [10]:
scores = cross_validate(grid_SVC,
                        X = X,
                        y = y,
                        cv = outer_cv,
                        scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                        return_estimator = True)

In [11]:
accuracy = scores['test_accuracy']
print(accuracy)
b = ("tfidf SVC Accuracy: "+ str(round(accuracy.mean(),3)))
print("\n"+b)

[0.83928571 0.78571429 0.80357143 0.82142857 0.875      0.78571429
 0.76785714 0.89285714 0.82142857 0.80357143]

tfidf SVC Accuracy: 0.82


## Parapharse tweets

- paraphrase 5 sentences for each train data tweet to expand dataset.
- Use nltk module.
- Check synonyms words and replace original words to create new sentence.

In [12]:
# step 2.
## paraphrase 5 sentences for each train data tweet (560*0.7 + 560*0.7*5)##

X = Data['clean_tweet']
y = Data['Is_Unreliable']

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size= 0.7, random_state=1) # 70%/ 30%

all_train_data = y_train.to_frame().join(X_train.to_frame()) # 392

new_train_data = all_train_data.append(all_train_data) # 392+392 = 784
new_train_data = new_train_data.append(new_train_data) # 784+784 = 1568
new_train_data = new_train_data.append(all_train_data) # 1568+392 = 1960
new_train_data = new_train_data.append(all_train_data) # 1960+392 = 2352

for i in range(0,len(all_train_data)):
     new_train_data.iloc[i*5] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+1] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+2] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+3] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+4] = all_train_data.iloc[i]
     new_train_data.iloc[i*5+5] = all_train_data.iloc[i]

In [13]:
# paraphrase def #

def tag(sentence): # tag each word's part of speech
 words = word_tokenize(sentence)
 words = pos_tag(words)
 return words

def paraphraseable(tag): 
 return tag.startswith('NN') or tag == 'VB' or tag.startswith('JJ')

def pos(tag):
 if tag.startswith('NN'):
  return wn.NOUN
 elif tag.startswith('V'):
  return wn.VERB

def synonyms(word, tag):
    lemma_lists = [ss.lemmas() for ss in wn.synsets(word, pos(tag))]
    lemmas = [lemma.name() for lemma in sum(lemma_lists, [])]
    return set(lemmas)

def synonymIfExists(sentence):
 for (word, t) in tag(sentence):
   if paraphraseable(t):
    syns = synonyms(word, t)
    if syns:
     if len(syns) > 1:
      yield [word, list(syns)]
      continue
   yield [word, []]

def paraphrase(sentence):
 return [x for x in synonymIfExists(sentence)]

def generator(sentence):
    sentence = paraphrase(sentence)
    text = list(range(len(sentence)))
    for i in range(0, len(sentence)):
        if sentence[i][1] == []:
            text[i] = sentence[i][0]
        else:
            text[i] = choice(sentence[i][1])
        
    text = ' '.join(text)
    return text

In [14]:
# add paraphrasing 3 sentences for each tweet 
for i in range(0,len(all_train_data)):
    new_train_data.iloc[i*5+1,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+2,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+3,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+4,1] = generator(all_train_data.iloc[i,1])
    new_train_data.iloc[i*5+5,1] = generator(all_train_data.iloc[i,1])

## Test model accuracy again

- Use TF-IDF vectorize again
- Fit Logistic and SVM to check accuracy

In [15]:
# step 3.
# use new train data(1568 tweets, including original and paraphrase)

new_test_data = y_test.to_frame().join(X_test.to_frame())
all_new_data = new_train_data.append(new_test_data)

X = all_new_data['clean_tweet']
y = all_new_data['Is_Unreliable']


In [16]:
# tfidf
tfidf = TfidfVectorizer(lowercase = True,
                        stop_words = 'english',
                        ngram_range = (1,1))


X_dtm = tfidf.fit_transform(X)

In [17]:
# tfidf-logistic
logreg = LogisticRegression(class_weight="balanced") #instantiate a logistic regression model
scores = cross_val_score(logreg, X_dtm, y, cv= 10) # 10 folds validation
print( "tfidf Logistic Regression Accuracy with 10-folds validation: ", round(np.mean(scores),3))

a2 = ("After paraphrasing, tfidf Logistic Regression Accuracy: "+ str(round(np.mean(scores),3)))

tfidf Logistic Regression Accuracy with 10-folds validation:  0.839


In [18]:
# SVC
svc = SVC(probability=True)
pipe = Pipeline([
('vectorize', tfidf),
('classify', SVC())
])

In [19]:
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
# set up parameter grid
params = {
'classify__kernel': kernel,
'classify__C': C
}

In [20]:
inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 10, shuffle = True, random_state = 1)

# Set up GridSearch for inner loop
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)

In [21]:
scores = cross_validate(grid_SVC,
                        X = X,
                        y = y,
                        cv = outer_cv,
                        scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                        return_estimator = True)

In [22]:
accuracy = scores['test_accuracy']
print(accuracy)
b2 = ("After paraphrasing, tfidf SVC Accuracy: " +str(round(accuracy.mean(),3)))
print("\n"+ b2)

[0.91269841 0.86904762 0.90079365 0.92460317 0.93253968 0.92063492
 0.91666667 0.92857143 0.92063492 0.88492063]

After paraphrasing, tfidf SVC Accuracy: 0.911


In [23]:
print(a)
print(a2 + "\n")

print(b)
print(b2 + "\n")

tfidf Logistic Regression Accuracy: 0.812
After paraphrasing, tfidf Logistic Regression Accuracy: 0.839

tfidf SVC Accuracy: 0.82
After paraphrasing, tfidf SVC Accuracy: 0.911



## Result

- Indeed, our accuracy get better after we parapharse our tweets to expand dataset. Logistic from 81.2% to 83.9%; SVM from 82% to 91%. However, if we want to get more better accuracy, we probably need to create new sentence with different structure. Because I only change words on my work, the structure is still same. If we change the words and structure to create new sentences, the training model can learn more information and get more improvement.