# Data Preprocessing

Evaluating effectiveness of data preprocessing techniques
* before preprocesing
* after preprocessing

## Install and import required libraries

In [1]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
import time
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Before preprocessing

In [3]:
df = pd.read_csv('/content/drive/MyDrive/cz4034 IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


In [4]:
df['manual_label'] = df['manual_label'].replace('neg',0)

In [5]:
df['manual_label'] = df['manual_label'].replace('pos',1)

In [6]:
df['manual_label'] = df['manual_label'].replace('neu',2)

In [7]:
df = df[['Text','manual_label']]

df.head()

Unnamed: 0,Text,manual_label
0,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,1
1,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,1
2,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,1
3,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,1
4,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,1


In [8]:
df.isnull().values.any()

False

In [11]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [12]:
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [13]:
train_vectors = vectorizer.fit_transform(train_df['Text'])
test_vectors = vectorizer.transform(test_df['Text'])

In [14]:
XGB_classifier = XGBClassifier()

t0 = time.time()
XGB_classifier.fit(train_vectors, train_df['manual_label'])
t1 = time.time()

prediction_linear = XGB_classifier.predict(test_vectors)

time_linear_train = t1-t0

In [15]:
print("Training time: %fs" % (time_linear_train))

Training time: 9.467592s


In [16]:
acc_score = accuracy_score(test_df['manual_label'], prediction_linear)

print("accuracy: ", str(acc_score))

accuracy:  0.8225


In [17]:
print(classification_report(test_df['manual_label'], prediction_linear))

              precision    recall  f1-score   support

           0       0.67      0.19      0.30        21
           1       0.88      0.87      0.88       269
           2       0.70      0.82      0.76       110

    accuracy                           0.82       400
   macro avg       0.75      0.63      0.64       400
weighted avg       0.82      0.82      0.81       400



## After preprocessing

In [20]:
df = pd.read_csv('/content/drive/MyDrive/cz4034 IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


In [21]:
df['manual_label'] = df['manual_label'].replace('neg',0)

In [22]:
df['manual_label'] = df['manual_label'].replace('pos',1)

In [23]:
df['manual_label'] = df['manual_label'].replace('neu',2)

In [24]:
df = df[['clean_text','manual_label']]

df.head()

Unnamed: 0,clean_text,manual_label
0,mutant ape yacht club mayc nft sold eth k,1
1,mutant ape yacht club bought floor h chg floor...,1
2,mutant ape yacht club mayc nft sold eth k,1
3,mutant ape yacht club mayc nft sold eth k,1
4,mutant ape yacht club sold eth nft collection ...,1


In [25]:
df.isnull().values.any()

False

In [28]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [29]:
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [30]:
train_vectors = vectorizer.fit_transform(train_df['clean_text'])
test_vectors = vectorizer.transform(test_df['clean_text'])

In [31]:
XGB_classifier = XGBClassifier()

t0 = time.time()
XGB_classifier.fit(train_vectors, train_df['manual_label'])
t1 = time.time()

prediction_linear = XGB_classifier.predict(test_vectors)

time_linear_train = t1-t0

In [32]:
print("Training time: %fs" % (time_linear_train))

Training time: 1.733507s


In [33]:
acc_score = accuracy_score(test_df['manual_label'], prediction_linear)

print("accuracy: ", str(acc_score))

accuracy:  0.835


In [34]:
print(classification_report(test_df['manual_label'], prediction_linear))

              precision    recall  f1-score   support

           0       0.62      0.29      0.40        17
           1       0.91      0.89      0.90       291
           2       0.66      0.77      0.71        92

    accuracy                           0.83       400
   macro avg       0.73      0.65      0.67       400
weighted avg       0.84      0.83      0.83       400

