In [81]:
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install flask
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.11.1-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting cryptography>=36.0.0 (from scrapy)
  Downloading cryptography-42.0.5-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.1.0-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting pyOpenSSL>=21.0.0 (from scrapy)
  Downloading pyOpenSSL-24.1.0-py3-none-any.whl.metadata (12 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.6.2-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting w3lib>=1.17

In [1]:
import numpy as np 
import pandas as pd 
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib


In [2]:
# Load the review dataset from the CSV file in the working directory.
dataframe = pd.read_csv('fakeReviewData.csv')
# Preview the first rows (last expression of the cell, so it is displayed).
dataframe.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [3]:
# Drop rows containing any missing value and add the character length of each
# review. The cleaned frame is built as a new object instead of mutating the
# loaded frame in place.
dataframe = dataframe.dropna().assign(
    length=lambda frame: frame['text_'].apply(len)
)

# Display the text of the longest review whose label is 'OR'.
or_reviews = dataframe.loc[dataframe['label'] == 'OR', ['text_', 'length']]
or_reviews.sort_values(by='length', ascending=False).iloc[0].text_

'WEAK ON CURRENT SCIENCE.\nAfter seeing it twice, I agree with much (but not all) of the positive five star reviews. Out of respect for those who READ reviews, I\'ll not repeat everything that I like about the presentation. I found the goofy oversize earrings, hairdo, and facial hair arrangement of Daniel Vitalis, (described as a "Wild Food Expert") distracting. UGH. Ditto for David Wolfe, who had an extremely goofy wild hairdo. On the other hand, Jon Gabriel, described as an "author and weight loss expert" was nicely groomed and a good presenter. His story of personal transformation of a fellow of over 400 pounds (whew) to becoming a jock of normal weight was inspiring. Christiane Northrup preserves her rank as one of America\'s cutest doctors. A really nice looking woman! Presentations by Dr. Mercola, Jason Vale, Kris Carr, Alejandro Junger were fine. It was disappointing to have Jamie Oliver (so popular in the UK) give Baby Cow Growth Fluid a pass with unscientific but popular ideas

In [4]:
def convertmyTxt(rv):
    """Tokenise a review: strip punctuation, split on whitespace, drop stopwords.

    Args:
        rv: Review text; iterated character by character.

    Returns:
        List of the remaining words in their original casing. A word is
        dropped when its lowercase form is in NLTK's English stopword list.
    """
    # Local names deliberately avoid ``np`` so the numpy alias is not shadowed.
    no_punct = ''.join(ch for ch in rv if ch not in string.punctuation)
    tokens = no_punct.split()
    if not tokens:
        # Nothing to filter, so the stopword corpus is not touched at all.
        return []
    # Fetch the stopword list once per call (the original called
    # stopwords.words('english') for every token) and use a set for O(1)
    # membership tests.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in english_stopwords]

In [5]:
# Hold out 25% of the reviews for evaluation. The fixed seed makes the split
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    dataframe['text_'],
    dataframe['label'],
    test_size=0.25,
    random_state=42,
)

In [6]:
# Text-classification pipeline: custom tokenizer -> token counts -> TF-IDF
# weights -> random forest. The forest is seeded so that repeated fits on the
# same data give the same model.
pip = Pipeline([
    ('bow', CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [17]:
# The tokenizer needs NLTK's English stopword corpus. Download it only when it
# is not already installed, so the cell is safe to run on every kernel start.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [7]:
# Fit the pipeline on the training split.
pip.fit(x_train,y_train)
# Persist the fitted pipeline; joblib.dump's return value (the list of written
# file names) is displayed as the cell output.
# NOTE(review): the pipeline holds a reference to convertmyTxt, so that function
# presumably has to be defined wherever the file is loaded - confirm.
filename = 'random_forest_model.pkl'
joblib.dump(pip, filename)

['random_forest_model.pkl']

In [8]:
# Predict labels for the held-out reviews. Despite its name, this variable
# holds the array of predictions, not a classifier object.
randomForestClassifier = pip.predict(x_test)
print(randomForestClassifier)

['OR' 'OR' 'OR' ... 'CG' 'OR' 'CG']


In [9]:
# Share of held-out reviews whose predicted label equals the true label,
# expressed as a percentage rounded to two decimals.
rf_accuracy_pct = np.round(accuracy_score(y_test, randomForestClassifier) * 100, 2)
print('Accuracy of the model: ', str(rf_accuracy_pct) + '%')

Accuracy of the model:  85.65%


In [21]:
# Same tokenizer -> counts -> TF-IDF preprocessing, now feeding a
# support-vector classifier with default settings. Rebinds `pip`.
pip = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),
])

In [22]:
# Fit the support-vector pipeline on the training split.
pip.fit(x_train,y_train)
# Persist the fitted pipeline; the list of written file names is displayed.
filename = 'support_vector_classifier.pkl'
joblib.dump(pip, filename)

['support_vector_classifier.pkl']

In [23]:
# Predict labels for the held-out reviews. The variable holds the predictions,
# not the classifier itself.
supportVectorClassifier = pip.predict(x_test)
supportVectorClassifier

array(['OR', 'CG', 'OR', ..., 'CG', 'CG', 'OR'], dtype=object)

In [24]:
# Percentage of held-out reviews the support-vector pipeline labelled
# correctly, rounded to two decimals.
svc_accuracy_pct = np.round(accuracy_score(y_test, supportVectorClassifier) * 100, 2)
print('accuracy of the model:', str(svc_accuracy_pct) + '%')

accuracy of the model: 90.34%


In [26]:
# Same tokenizer -> counts -> TF-IDF preprocessing, now feeding a
# logistic-regression classifier with default settings. Rebinds `pip`.
pip = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression()),
])

In [27]:
# Fit the logistic-regression pipeline on the training split.
pip.fit(x_train,y_train)
# Persist the fitted pipeline; the list of written file names is displayed.
filename = 'logisticRegression.pkl'
joblib.dump(pip, filename)

['logisticRegression.pkl']

In [28]:
# Predict labels for the held-out reviews with the logistic-regression
# pipeline. The variable holds the predictions, not the classifier itself.
logisticRegression = pip.predict(x_test)
logisticRegression

array(['OR', 'CG', 'OR', ..., 'CG', 'CG', 'OR'], dtype=object)

In [29]:
# Report the accuracy of the logistic-regression predictions on the test split,
# as a percentage rounded to two decimals.
print('accuracy of the model:',str(np.round(accuracy_score(y_test,logisticRegression)*100,2)) + '%')

accuracy of the model: 88.97%


In [64]:
# Classify a single hand-written review with whatever pipeline `pip` currently
# refers to.
# NOTE(review): in top-to-bottom order that is the logistic-regression pipeline
# fitted above, but the result depends on cell execution order - confirm.
ans = pip.predict(["This is an awsome product"])

In [65]:
# Display the predicted label(s) from the previous cell.
ans

array(['OR'], dtype=object)

In [91]:
# Reload the random-forest pipeline from the file this notebook saved earlier,
# to check that the model round-trips from disk. The original pointed at a
# Windows-only '.\\models\\...' path that the notebook never writes.
# joblib.load unpickles arbitrary objects: only load files you trust.
loaded_model = joblib.load('random_forest_model.pkl')

In [73]:
# Classify a sample review with the pipeline currently bound to `loaded_model`
# and display the predicted label.
ans = loaded_model.predict(["This is an awsome product"])
ans

array(['OR'], dtype=object)

In [92]:
# Reload the support-vector pipeline from the file this notebook saved earlier.
# The original pointed at a Windows-only '.\\models\\...' path that the
# notebook never writes.
# joblib.load unpickles arbitrary objects: only load files you trust.
loaded_model = joblib.load('support_vector_classifier.pkl')

In [101]:
!pip3 install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting webencodings (from html5lib)
  Downloading webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
   ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
   ---------- ----------------------------- 30.7/112.2 kB 1.3 MB/s eta 0:00:01
   --------------------- ----------------- 61.4/112.2 kB 812.7 kB/s eta 0:00:01
   ---------------------------------- --- 102.4/112.2 kB 837.8 kB/s eta 0:00:01
   -------------------------------------- 112.2/112.2 kB 815.5 kB/s eta 0:00:00
Downloading webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
Installing collected packages: webencodings, html5lib
Successfully installed html5lib-1.1 webencodings-0.5.1


In [75]:
# Classify the same sample review with the pipeline currently bound to
# `loaded_model` and display the predicted label.
ans = loaded_model.predict(["This is an awsome product"])
ans

array(['OR'], dtype=object)

In [93]:
# Reload the logistic-regression pipeline from the file this notebook saved
# earlier. The original pointed at a Windows-only '.\\models\\...' path that
# the notebook never writes.
# joblib.load unpickles arbitrary objects: only load files you trust.
loaded_model = joblib.load('logisticRegression.pkl')

In [99]:
# Classify two sample reviews (the second contains non-English text and an
# emoji) with the pipeline currently bound to `loaded_model`, then display the
# predicted labels.
ans = loaded_model.predict(["This is an awsome product","Pesa vasul 👌"])
ans

array(['OR', 'OR'], dtype=object)

In [82]:
import scrapy
from urllib.parse import urljoin

class AmazonReviewsSpider(scrapy.Spider):
    """Spider that scrapes reviews from Amazon product-review pages.

    The ASINs to crawl can be supplied as a spider argument, e.g.
    ``-a asin_list=ASIN1,ASIN2``. Without it, ``default_asin_list`` is used.
    """

    name = "amazon_reviews"

    # ASINs crawled when no ``asin_list`` spider argument is given.
    default_asin_list = ['B09G9FPHY6']

    def start_requests(self):
        """Yield one product-reviews request per ASIN, passing the ASIN in ``meta``."""
        # Scrapy stores ``-a key=value`` arguments as string attributes on the spider.
        asin_list = getattr(self, 'asin_list', None) or self.default_asin_list
        if isinstance(asin_list, str):
            asin_list = [asin.strip() for asin in asin_list.split(',') if asin.strip()]
        for asin in asin_list:
            amazon_reviews_url = f'https://www.amazon.com/product-reviews/{asin}/'
            yield scrapy.Request(url=amazon_reviews_url, callback=self.parse_reviews, meta={'asin': asin})

    def parse_reviews(self, response):
        """Yield one dict per review element found in the response.

        Each dict has the keys ``asin``, ``text``, ``title``,
        ``location_and_date``, ``verified`` and ``rating``. ``rating`` is the
        number captured before " out" in the star-rating text, or ``None``
        when the review has no matching rating text.
        """
        asin = response.meta['asin']

        ## Parse Product Reviews
        review_elements = response.css("#cm_cr-review_list div.review")
        for review_element in review_elements:
            yield {
                "asin": asin,
                "text": "".join(review_element.css("span[data-hook=review-body] ::text").getall()).strip(),
                "title": review_element.css("*[data-hook=review-title]>span::text").get(),
                "location_and_date": review_element.css("span[data-hook=review-date] ::text").get(),
                "verified": bool(review_element.css("span[data-hook=avp-badge] ::text").get()),
                # re_first returns None instead of raising IndexError when a
                # review carries no star-rating text.
                "rating": review_element.css("*[data-hook*=review-star-rating] ::text").re_first(r"(\d+\.*\d*) out"),
            }


In [86]:
!scrapy runspider amazon_reviews 

2024-04-05 17:54:08 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-04-05 17:54:08 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.0, w3lib 2.1.2, Twisted 24.3.0, Python 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Windows-11-10.0.22621-SP0
Usage
=====
  scrapy runspider [options] <spider_file>
runspider: error: File not found: amazon_reviews



In [90]:
# Run the spider file amazonScrapper.py; `-a asin_list=...` is handed to the
# spider as a spider argument.
!scrapy runspider amazonScrapper.py -a asin_list="B0744NSM3N"

2024-04-05 18:44:21 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-04-05 18:44:22 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.0, w3lib 2.1.2, Twisted 24.3.0, Python 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Windows-11-10.0.22621-SP0
2024-04-05 18:44:22 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-04-05 18:44:22 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-04-05 18:44:22 [scrapy.extensions.telnet] INFO: Telnet Password: 02d4df9568474026
2024-04-05 18:44:22 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.log

In [95]:
!pip install "lxml>=4.6.0" "requests_html>=0.10.0"

