In [288]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [289]:
import pandas as pd
df = pd.read_csv('https://github.com/FadlyHaikal/ReviewDataset/blob/main/Reviews.csv?raw=true')
df.head()

Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


In [290]:
label = df['Score'].copy()
label.replace({1:0,2:0,3:1,4:1,5:1}, inplace=True)
df["Target"] = label
df.tail(10)
df.drop(['Score', 'Text'], axis=1, inplace=True)

In [291]:
df.isna().sum()

Summary    0
Target     0
dtype: int64

#Data Cleaning

In [292]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create List of punctuation marks
punctuations = string.punctuation

# Create out list of Stopwords
nlp = spacy.blank('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#load English Tokenizer
parser = English()

# Creating Tokenizer Func
def spacy_tokenizer(sentence):
  #Create token object
  mytokens = parser(sentence)

  #Lemmatizing each token and covert to lowercase
  mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

  #Removing Stop words
  mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations ]

  return mytokens

In [293]:
nlp

<spacy.lang.en.English at 0x7ff881691fd0>

In [294]:
from nltk.tokenize import word_tokenize

def nltk_tokenizer(text):
  tokens = word_tokenize(text)
  # convert to lower case
  tokens = [w.lower() for w in tokens]
  # remove punctuation from each word
  import string
  table = str.maketrans('', '', string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  # remove remaining tokens that are not alphabetic
  words = [word for word in stripped if word.isalpha()]
  # filter out stop words
  from nltk.corpus import stopwords
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if not w in stop_words]
  
  return words

In [321]:
sen = "I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most."

In [322]:
nltk_tokenizer(sen)

['bought',
 'several',
 'vitality',
 'canned',
 'dog',
 'food',
 'products',
 'found',
 'good',
 'quality',
 'product',
 'looks',
 'like',
 'stew',
 'processed',
 'meat',
 'smells',
 'better',
 'labrador',
 'finicky',
 'appreciates',
 'product',
 'better']

#Defining a Transformer


In [295]:
from sklearn.base import TransformerMixin

# This function will clean the text 
def clean_text(text):
  return text.strip().lower()

#Custom transformer using Python standard library (you could use spacy as well) 
class predictors(TransformerMixin):

  def transform(self, x, **transform_params):
    return [clean_text(text) for text in X]

  def fit(self, x, y=None, **fit_params) :
    return self
    
  def get_params (self, deep=True):
    return {}

In [323]:
clean_text(sen)

'i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.'

#Feature Engineering

###Bag of Words

In [296]:
from sklearn.feature_extraction.text import CountVectorizer
#vectorizer
#We create our bag of words (bow) using our tokenltk_tokenizer, nizer and defining an ngram range 
bow = CountVectorizer(tokenizer =nltk_tokenizer, ngram_range=(1,1))

###TF-IDF

In [297]:
from sklearn.feature_extraction.text import TfidfVectorizer

#tfidf
tfvectorizer = TfidfVectorizer(tokenizer = nltk_tokenizer)

#Train and Test Split

In [298]:
X = df['Summary']
ylabels = df['Target']

In [301]:
print(X)

0                    Good Quality Dog Food
1                        Not as Advertised
2                    "Delight" says it all
3                           Cough Medicine
4                              Great taffy
                       ...                
6437                   Will not do without
6438                          disappointed
6439              Perfect for our maltipoo
6440    Favorite Training and reward treat
6441                           Great Honey
Name: Summary, Length: 6442, dtype: object


In [300]:
print(ylabels)

0       1
1       0
2       1
3       0
4       1
       ..
6437    1
6438    0
6439    1
6440    1
6441    1
Name: Target, Length: 6442, dtype: int64


In [302]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [314]:
X_train.shape

(5153,)

In [313]:
y_train.shape

(5153,)

#Classifier

In [309]:
from sklearn.neural_network import MLPClassifier
#classifier
classifier_MLP = MLPClassifier(max_iter=400, hidden_layer_sizes=(100,2), verbose = True)

#Create Pipeline

In [315]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('cleaner', predictors()), 
                 ('vectorizer', tfvectorizer), 
                 ('classifier', classifier_MLP)], verbose=True)

In [316]:
#fit data
pipe.fit(X_train, y_train)

[Pipeline] ........... (step 1 of 3) Processing cleaner, total=   0.0s
[Pipeline] ........ (step 2 of 3) Processing vectorizer, total=   3.0s


ValueError: ignored

#Evaluate

In [None]:
sample_prediction = pipe.predict(X_test)

In [None]:
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, sample_prediction)
precision = metrics.precision_score(y_test, sample_prediction)
recall = metrics.recall_score(y_test, sample_prediction)

In [None]:
!pip install -q gradio
!pip install -q transformers

In [None]:
import numpy as np
import pandas as pd

In [None]:
import gradio as gr
from transformers import pipeline

sentiment=pipeline("sentiment-analysis")

def get_sentiment(text):
  txt_split = text.split(";")
  data = {'text': [], 'label': [], 'score': []}
  for t in txt_split:
    res = sentiment(t)
    data["text"].append(t)
    data["label"].append(res[0]['label'])
    data["score"].append(res[0]['score'])

  df = pd.DataFrame(data)
  res = df.groupby('label')['text'].count() / df['label'].count()
  return (res.to_dict(), df)

In [None]:
iface = gr.Interface(fn=get_sentiment,
                    inputs="text",
                    allow_flagging="never",
                    outputs = ["label", "dataframe"],
                    title="Sentiment Analysis",
                    description='Give the sentiment analysis')

In [None]:
iface.launch(debug=True)

That's all! Go ahead and open that share link in a new tab. Check out our [getting started](https://gradio.app/getting_started.html) page for more complicated demos.