## Imports and downloads

In [1]:
!pip install pandarallel

Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25ldone
[?25h  Created wheel for pandarallel: filename=pandarallel-1.6.5-py3-none-any.whl size=16672 sha256=fb00082dc4fe1d2a2a8d34eb7123db7b7f5e97a385c8ef3ec33cf7f84735a864
  Stored in directory: /root/.cache/pip/wheels/50/4f/1e/34e057bb868842209f1623f195b74fd7eda229308a7352d47f
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.5


In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score,roc_curve
import nltk
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC, SVC
from pandarallel import pandarallel
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')

import subprocess

# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')
    

print('Imports done.')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /kaggle/working/corpora/wordnet.zip
Imports done.


replace /kaggle/working/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [3]:
electronics_dataset = pd.read_csv('../input/amazon-reviews-2018-electronics/labeled_electronics_dataset.csv')

electronics_dataset.head()

Unnamed: 0,overall,vote,reviewTime,reviewText,summary,Label
0,2,0,2010-02-10,Tech support is the worst,1265760000,NEGATIVE
1,2,0,2016-10-24,Screws were missing from the bracket and beaut...,Spend a little more and get much better.,NEGATIVE
2,1,0,2017-07-10,Trouble connecting and staying connected via b...,1499644800,NEGATIVE
3,4,5,2013-05-02,I purchased this unit for our RV to replace an...,Receiver Offers a Lot of Flexibility & Complexity,POSITIVE
4,3,0,2013-01-04,It works. Nuff said but the review requires 1...,It's a cable,NEUTRAL


## Data pre-processing

In [4]:
# Check for NaN values
print("NaN (before cleanup) ?: \n", electronics_dataset.isnull().sum())

electronics_dataset['reviewText'] = electronics_dataset['reviewText'].fillna('')

print("NaN (after cleanup) ?: \n", electronics_dataset.isnull().sum())

NaN (before cleanup) ?: 
 overall       0
vote          0
reviewTime    0
reviewText    1
summary       0
Label         0
dtype: int64
NaN (after cleanup) ?: 
 overall       0
vote          0
reviewTime    0
reviewText    0
summary       0
Label         0
dtype: int64


In [7]:
# Text preprocessing for reviewText column
# Lower all text

electronics_dataset['reviewText'] = electronics_dataset['reviewText'].str.lower()

# Initialize pandarallel
# I used pandarallel because it applies the functions much faster than a normal pandas apply.
pandarallel.initialize(nb_workers=4,progress_bar=True)

# Remove all special characters
def remove_special_chars(text):
    return ''.join(x if x.isalnum() else ' ' for x in text)

electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(remove_special_chars)

# get stopwords.
stop_words = set(stopwords.words('english'))

# Remove stop_words
def remove_stopwords(text):
    words = word_tokenize(text)
    return [x for x in words if x not in stop_words]


electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(remove_stopwords)

# Lemmatization
def lemmatize_word(text):
    wordnet = WordNetLemmatizer()
    return " ".join([wordnet.lemmatize(word) for word in text])

electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(lemmatize_word)

# Remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(remove_numbers)

print('Example of preprocessing train: ')
print(electronics_dataset['reviewText'][0])

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4953), Label(value='0 / 4953'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4953), Label(value='0 / 4953'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4953), Label(value='0 / 4953'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4953), Label(value='0 / 4953'))), …

Example of preprocessing train: 
tech support worst


In [8]:
X = electronics_dataset['reviewText']
y = electronics_dataset['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 80% training data, 20% testing data

hashing_vectorizer = HashingVectorizer()
X_train_vectorized = hashing_vectorizer.fit_transform(X_train)
X_test_vectorized = hashing_vectorizer.transform(X_test)

print('Preprocessing complete.')

Preprocessing complete.


## Model creation and evaluation

In [9]:
# Support Vector Machine (SVM)

svm_model = LinearSVC(random_state = 42, max_iter=1000)

# Train the SVM model
svm_model.fit(X_train_vectorized, y_train)

# Obtain predictions
y_pred = svm_model.predict(X_test_vectorized)

# Print classification report for each label
print("====================== SVM Classification Report ======================")
print("\n")
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

    NEGATIVE       0.68      0.75      0.72      1575
     NEUTRAL       0.45      0.23      0.30       808
    POSITIVE       0.68      0.78      0.73      1579

    accuracy                           0.66      3962
   macro avg       0.61      0.59      0.58      3962
weighted avg       0.63      0.66      0.64      3962

