# Data Augmentation
Creating train data using data augmentation
* (Stacked) Synonym + Embedding

## Install and import required libraries

In [None]:
!pip install nlpaug
!pip install textattack

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textattack
  Downloading textattack-0.3.8-py3-none-any.whl (418 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m418.7/418.7 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting click<8.1.0
  Downloading click-8.0.4-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━

In [None]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
from textattack.augmentation import EmbeddingAugmenter

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Read dataset

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/cz4034 IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


## Original train data

In [None]:
df['manual_label'].value_counts()

pos    1374
neu     517
neg     109
Name: manual_label, dtype: int64

In [None]:
df = df[['clean_text','manual_label']]

df.head()

Unnamed: 0,clean_text,manual_label
0,mutant ape yacht club mayc nft sold eth k,pos
1,mutant ape yacht club bought floor h chg floor...,pos
2,mutant ape yacht club mayc nft sold eth k,pos
3,mutant ape yacht club mayc nft sold eth k,pos
4,mutant ape yacht club sold eth nft collection ...,pos


In [None]:
df.isnull().values.any()

False

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['manual_label'], test_size=0.2,random_state=42)

In [None]:
y_train.value_counts()

pos    1097
neu     415
neg      88
Name: manual_label, dtype: int64

In [None]:
train_init = {'clean_text': X_train, 'manual_label': y_train}

In [None]:
train_init_df = pd.DataFrame(train_init)

In [None]:
train_init_df.head()

Unnamed: 0,clean_text,manual_label
968,meebit bought eth usd blur meebits meebitsnft,pos
240,check tyler nft weekly preview analyzes market...,neu
819,meebits triple floor check opensea listing flo...,neu
692,clonex sold eth previously sold eth nft collec...,pos
420,mutant ape yacht club sold weth previously sol...,pos


In [None]:
train_init_df.to_csv('train_init.csv', index=False)

In [None]:
check_train_init_df = pd.read_csv('/content/drive/MyDrive/cz4034 IR project/classifier/XGBoost/train_init.csv')

check_train_init_df.head()

Unnamed: 0,clean_text,manual_label
0,meebit bought eth usd blur meebits meebitsnft,pos
1,check tyler nft weekly preview analyzes market...,neu
2,meebits triple floor check opensea listing flo...,neu
3,clonex sold eth previously sold eth nft collec...,pos
4,mutant ape yacht club sold weth previously sol...,pos


In [None]:
check_train_init_df.shape

(1600, 2)

## Test data

In [66]:
test_data = {'clean_text': X_test, 'manual_label': y_test}

In [67]:
test_df = pd.DataFrame(test_data)

In [68]:
test_df.head()

Unnamed: 0,clean_text,manual_label
1860,day mapo tofu healthy meal mean tofu got ta co...,neu
353,azuki room magicwin cryptochazman yasirali nft...,neu
1333,nft lending agreement benddao ethereum reserve...,pos
905,meebit bought eth usd blur meebits meebitsnft,pos
1289,sold punksticker new owner thanks enjoy nftcol...,pos


In [72]:
test_df.to_csv('test.csv', index=False)

In [74]:
check_test_df = pd.read_csv('/content/drive/MyDrive/cz4034 IR project/classifier/XGBoost/test.csv')

check_test_df.head()

Unnamed: 0,clean_text,manual_label
0,day mapo tofu healthy meal mean tofu got ta co...,neu
1,azuki room magicwin cryptochazman yasirali nft...,neu
2,nft lending agreement benddao ethereum reserve...,pos
3,meebit bought eth usd blur meebits meebitsnft,pos
4,sold punksticker new owner thanks enjoy nftcol...,pos


In [75]:
check_test_df.shape

(400, 2)

## Augmented train data

In [None]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=3)

In [None]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]=='neg':
    temps1=aug.augment(X_train[i],n=3)
    for sent in temps1:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append('neg')

In [None]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(1864,)
(1864,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [None]:
y_train.value_counts()

pos    1097
neu     415
neg     352
dtype: int64

In [None]:
embed_aug = EmbeddingAugmenter()

In [None]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]=='neg':
    temps3=embed_aug.augment(X_train[i])
    for sent in temps3:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append('neg')

In [None]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(2216,)
(2216,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [None]:
y_train.value_counts()

pos    1097
neg     704
neu     415
dtype: int64

In [None]:
train_aug = {'clean_text': X_train, 'manual_label': y_train}

In [None]:
train_aug_df = pd.DataFrame(train_aug)

In [None]:
train_aug_df.head()

Unnamed: 0,clean_text,manual_label
0,meebit bought eth usd blur meebits meebitsnft,pos
1,check tyler nft weekly preview analyzes market...,neu
2,meebits triple floor check opensea listing flo...,neu
3,clonex sold eth previously sold eth nft collec...,pos
4,mutant ape yacht club sold weth previously sol...,pos


In [None]:
train_aug_df.to_csv('train_aug.csv', index=False)

In [None]:
check_train_aug_df = pd.read_csv('/content/drive/MyDrive/cz4034 IR project/classifier/XGBoost/train_aug.csv')

check_train_aug_df.head()

Unnamed: 0,clean_text,manual_label
0,meebit bought eth usd blur meebits meebitsnft,pos
1,check tyler nft weekly preview analyzes market...,neu
2,meebits triple floor check opensea listing flo...,neu
3,clonex sold eth previously sold eth nft collec...,pos
4,mutant ape yacht club sold weth previously sol...,pos


In [None]:
check_train_aug_df.shape

(2216, 2)