<a href="https://colab.research.google.com/github/surajparui/2022_problem/blob/main/AI%20spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import spacy
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

# Load the small English spaCy model once all imports are in place
nlp = spacy.load('en_core_web_sm')

WORD VECTOR REPRESENTATION

In [None]:
# Run the single word "love" through the pipeline and inspect its vector
love = nlp('love')
print(love.vector.shape)
print(love.vector)

(96,)
[ 0.96288884  1.2634276   0.6250355  -0.77991855 -0.904034    1.219759
 -0.04371536 -0.1276063   0.24655849 -1.4692374  -0.63719076 -0.6365906
  0.09235445 -0.40780962 -0.71920305 -0.6152446  -0.49530357 -0.14899471
 -1.1089895  -0.16828349  2.1601627  -0.36894426  0.53562397  2.0515513
  1.3365848   0.81874454 -0.22138533  0.6860134   1.4870692  -0.39636394
 -0.16569182 -0.08968636 -1.3460407  -0.15887657 -0.76484084  1.6649683
 -0.2838282   0.4913479  -0.2769425   0.4877649  -0.1346015   2.509163
 -0.5847128   0.5567566  -0.91189206 -1.1060163   0.12433991  1.3289213
  0.18935159 -1.0610229  -0.02874377 -0.6999437  -1.3394148   0.24719048
 -0.7577665  -0.6644245  -2.0409827  -1.5102508   0.10508268 -0.12347171
 -0.48157108 -0.9322603  -0.74039656  0.01847911 -0.31598616  0.43642935
 -0.14901352  0.8811443  -0.15638724 -0.88410425  1.1666583  -0.423424
  0.13437171 -0.3457961  -0.10581809  0.48710424  1.375402   -0.30086493
 -0.5932542   1.3211955   0.13724011 -0.55722886  1.221

POS TAGGING

In [None]:
docs = nlp("I love it! Learning knew things with it eveyday! Still figuring out how everything works.")

# Print every token alongside its coarse part-of-speech tag
for token in docs:
    print(token.text, token.pos_)

I PRON
love VERB
it PRON
! PUNCT
Learning PROPN
knew VERB
things NOUN
with ADP
it PRON
eveyday NOUN
! PUNCT
Still ADV
figuring VERB
out ADP
how SCONJ
everything PRON
works VERB
. PUNCT


In [None]:
# Read the tab-separated review data into a DataFrame
df_amazon = pd.read_csv("/content/amazon_alexa.tsv", delimiter="\t")

print(f'Shape of data: {df_amazon.shape}')
# Preview the first five rows
df_amazon.head(5)

Shape of data: (3150, 5)


Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


Data Information

In [None]:
# Summarize each column's dtype and non-null count
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


Feedback Value Class Distribution

In [None]:
# Number of rows per feedback class
df_amazon['feedback'].value_counts()

1    2893
0     257
Name: feedback, dtype: int64

Tokenizing the Text

In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize

# Download the NLTK data packages used by this notebook
for resource in ('punkt', "wordnet", 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# spaCy pipeline used for text processing
nlp = spacy.load("en_core_web_sm")

# ASCII punctuation characters (string.punctuation)
punctuations = string.punctuation

# spaCy's English stop-word collection
stop_words = STOP_WORDS

# English language object from spacy.lang.en
# NOTE(review): unlike `nlp`, this is presumably a blank pipeline without trained components — confirm
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Split one document into a list of normalized word tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token is lemmatized and lowercased; punctuation, whitespace-only tokens
    and stop words are dropped.

    Args:
        sentence: Raw text of a single document (e.g. one review).

    Returns:
        list[str]: The kept tokens, in document order. A list is required
        because CountVectorizer / TfidfVectorizer iterate over whatever the
        tokenizer returns; iterating a plain string would yield single
        characters as features.
    """
    tokens = []
    for token in nlp(sentence):
        if token.is_punct or token.is_space:
            continue
        lemma = token.lemma_.lower().strip()
        # Fall back to the surface form when no usable lemma is available
        # (older spaCy models emit the "-PRON-" placeholder for pronouns).
        if not lemma or lemma == "-pron-":
            lemma = token.lower_.strip()
        # `punctuations` also catches single-character symbols that spaCy
        # does not flag as punctuation (e.g. "$", "+").
        if lemma and lemma not in stop_words and lemma not in punctuations:
            tokens.append(lemma)
    return tokens

Data Cleaning

In [None]:
# Custom transformer that cleans raw text with clean_text (no spaCy involved)
class predictors(TransformerMixin):
    """Stateless pipeline step that applies ``clean_text`` to every document."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in X."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """This transformer has no parameters, so report an empty dict."""
        return dict()

# Basic function to clean the text
def clean_text(text):
    """Trim surrounding whitespace and lowercase the text.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing value) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The text with leading/trailing whitespace removed, lowercased.
        Interior whitespace is left unchanged.
    """
    # `text != text` is only true for NaN
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).strip().lower()

Feature Engineering

Vectorization

In [None]:
# Bag-of-words counts over unigrams, tokenized by spacy_tokenizer
bow_vector = CountVectorizer(ngram_range=(1, 1), tokenizer=spacy_tokenizer)

TF-IDF

In [None]:
# TF-IDF vectorizer that tokenizes with spacy_tokenizer
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
)

Create Train and Test Datasets

In [None]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews']  # review text used as the model input
ylabels = df_amazon['feedback']  # feedback label to predict

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size = 0.3, random_state = 1)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
# This line reports the test labels, so it is labelled y_test
print(f'y_test dimension: {y_test.shape}')

X_train dimension: (2205,)
y_train dimension: (2205,)
X_test dimension: (945,)
y_train dimension: (945,)


Creating a Pipeline and Generating the Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression model with a raised iteration cap
classifier = LogisticRegression(max_iter=1000)

# Chain text cleaning, bag-of-words vectorization and the classifier
pipe = Pipeline(
    steps=[
        ("cleaner", predictors()),
        ("vectorizer", bow_vector),
        ("classifier", classifier),
    ]
)

# Train the whole pipeline on the training split
pipe.fit(X_train, y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f154d0bb4d0>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x7f154e5fba70>)),
                ('classifier', LogisticRegression(max_iter=1000))])

In [None]:
# Only a cell's last expression is rendered automatically, so display both explicitly
display(X_train, y_train)

1433    1
2833    1
1807    1
1447    0
1328    1
       ..
2763    1
905     1
1096    1
235     1
1061    1
Name: feedback, Length: 2205, dtype: int64

In [None]:
from sklearn import metrics

# Predict labels for the held-out test set
predicted = pipe.predict(X_test)

# Report accuracy, precision and recall against the true test labels
for metric_name, metric_fn in (
    ("Accuracy", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
):
    print(f'Logistic Regression {metric_name}: {metric_fn(y_test, predicted)}')

Logistic Regression Accuracy: 0.9195767195767196
Logistic Regression Precision: 0.9298056155507559
Logistic Regression Recall: 0.9873853211009175
