In [None]:
from urllib.parse import urljoin

import numpy as np
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector


class RatingSpider(scrapy.Spider):
    """Scrape review texts from review pages, following "load more" pagination.

    Every yielded item is a dict with the page ``title`` and ``text``, the list
    of text fragments found directly inside one review block.
    """

    name = "rate"

    def __init__(self, urls, **kwargs):
        """Store the review-page URLs to crawl.

        Args:
            urls: Iterable of review-page URLs, used as ``start_urls``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        self.start_urls = urls
        super().__init__(**kwargs)

    def start_requests(self):
        """Yield one request per start URL, carrying the URL along in ``meta``.

        The original URL travels as ``orig_url`` so that pagination requests
        can be built relative to it in ``parse``.
        """
        for url in self.start_urls:
            yield scrapy.Request(url, meta={'orig_url': url})

    def parse(self, response):
        """Yield one item per review on the page, then request the next batch.

        Yields:
            dict: ``{"title": ..., "text": [...]}`` for every review block.
            scrapy.Request: request for the next batch of reviews, only when
                the page exposes a pagination key.
        """
        # Prefer the page's own <meta name="title"> tag. When it is absent,
        # reuse the title forwarded through the request meta by the previous
        # page; `.get` keeps a first page without the tag from raising KeyError.
        title = response.xpath("//meta[@name='title']/@content").get()
        if title is None:
            title = response.meta.get('title')
        self.logger.debug("Parsing %s (title=%r)", response.url, title)

        # A relative XPath reads the direct text nodes of each review block
        # without serialising the node and parsing it a second time.
        for review in response.xpath("//div[@class='text show-more__control']"):
            yield {
                "title": title,
                "text": review.xpath("./text()").getall(),
            }

        key = response.css("div.load-more-data::attr(data-key)").get()
        if key:
            # Build the pagination URL only when there is a key to follow.
            orig_url = response.meta.get('orig_url', response.url)
            next_url = urljoin(orig_url, "reviews/_ajax?paginationKey={}".format(key))
            yield scrapy.Request(next_url, meta={'orig_url': orig_url, "title": title})

In [3]:
# Crawl configuration: Scrapy logging is switched off and every scraped item
# is exported to items.json in JSON format.
crawler_settings = {
    'LOG_ENABLED': False,
    "FEEDS": {"items.json": {"format": "json"}},
}
process = CrawlerProcess(settings=crawler_settings)

# Schedule the spider for a single title's review page.
process.crawl(
    RatingSpider,
    urls=["https://www.imdb.com/title/tt0038166/reviews"],
)

# Blocks until the crawl has finished.
# NOTE(review): Scrapy's documentation says the Twisted reactor cannot be
# restarted, so re-running this cell presumably needs a fresh kernel - confirm.
process.start()

In [8]:
import json

In [9]:
# Load the scraped reviews, stored as {movie title: [review, ...]}.
# The encoding is passed explicitly so that reading does not depend on the
# platform's default locale encoding.
with open("../data/lord_reviews.json", "r", encoding="utf-8") as f:
    result = json.load(f)

In [10]:
# `result` is keyed by movie title, so this counts the titles in the file.
len(result)

1

In [11]:
# Show which movie titles the review file contains.
result.keys()

dict_keys(['The Lord of the Rings: The Fellowship of the Ring'])

In [13]:
# Check which container type holds this movie's reviews.
type(result['The Lord of the Rings: The Fellowship of the Ring'])

list

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [53]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
count = CountVectorizer(stop_words='english')

In [54]:
# Learn the vocabulary from this movie's reviews.
count.fit(result['The Lord of the Rings: The Fellowship of the Ring'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [55]:
# Encode the same reviews with the fitted vocabulary (term counts per review).
response = count.transform(result['The Lord of the Rings: The Fellowship of the Ring'])

In [82]:
def get_top_n_words(corpus, n=None):
    """
    List the n most frequent words of a text corpus, most frequent first.

    Documents are tokenized by ``CountVectorizer(stop_words="english")``:
    text is lower-cased, single-character tokens are dropped, and words on the
    built-in English stop-word list are not counted. Words with the same
    count are returned in alphabetical order so the result is reproducible.

    Args:
        corpus: Iterable of document strings. It is materialised once, so a
            one-shot iterator is accepted.
        n: Maximum number of (word, count) pairs to return; ``None`` returns
            every word in the vocabulary.

    Returns:
        List of ``(word, count)`` tuples with ``count`` as a plain ``int``.
        An empty corpus yields an empty list.

    Raises:
        ValueError: If ``corpus`` is a single string instead of an iterable
            of documents, or (raised by ``CountVectorizer``) if the documents
            contain nothing but stop words.

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('love', 2),
     ('python', 2),
     ('world', 2),
     ('hello', 1),
     ('language', 1),
     ('programming', 1)]
    """
    if isinstance(corpus, str):
        # Iterating a bare string would treat every character as a document.
        raise ValueError("corpus must be an iterable of documents, not a single string")

    documents = list(corpus)
    if not documents:
        return []

    vectorizer = CountVectorizer(stop_words="english")
    # fit_transform tokenizes the corpus once instead of once for fit and
    # again for transform.
    bag_of_words = vectorizer.fit_transform(documents)
    totals = bag_of_words.sum(axis=0)  # one row: total occurrences per vocabulary column
    words_freq = [(word, int(totals[0, idx])) for word, idx in vectorizer.vocabulary_.items()]
    # Highest count first; alphabetical order breaks ties deterministically.
    words_freq.sort(key=lambda pair: (-pair[1], pair[0]))
    return words_freq[:n]

In [61]:
# The 100 most frequent non-stop-word terms in this movie's reviews.
# NOTE(review): this rebinds `count`, which earlier held the CountVectorizer
# instance; re-running the earlier `count.fit(...)` cell afterwards would fail.
count = get_top_n_words(result['The Lord of the Rings: The Fellowship of the Ring'], n=100)

In [64]:
# Load the reviews of the second movie; same {title: [...]} layout is assumed.
# The encoding is passed explicitly so that reading does not depend on the
# platform's default locale encoding.
with open("../data/spyreviews.json", encoding="utf-8") as f:
    movie_reviews = json.load(f)


In [67]:
# Number of entries stored under the 'My Spy' title.
len(movie_reviews['My Spy'])

267

In [83]:
# Use the first title found in the review file as the lookup key, then count
# the 2000 most frequent words across that title's reviews.
title = list(movie_reviews)[0]
count = get_top_n_words(corpus=movie_reviews[title], n=2000)


In [87]:
# Confirm the frequencies are plain Python ints before they are written out as JSON.
type(count[0][1])

int

In [74]:
# Print every (word, count) entry whose word is longer than 15 characters.
for i in count:
    word = i[0]
    if len(word) <= 15:
        continue
    print(i)

In [75]:
# Wrap the frequency list in a {title: [...]} mapping for export.
# NOTE(review): this cell is not idempotent - each re-run nests `count` one
# dict level deeper because it reads and rebinds the same name.
count = {title: count}

In [88]:
# Persist the word frequencies. The path is a plain literal (the f-string
# prefix had no placeholders) and the encoding is stated explicitly.
with open('../data/spy_count.json', 'w', encoding='utf-8') as f:
    json.dump(count, f)

In [42]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

In [43]:
# Learn the vocabulary and IDF weights from this movie's reviews.
tfidf.fit(result['The Lord of the Rings: The Fellowship of the Ring'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [44]:
# TF-IDF encoding of the same reviews.
# NOTE(review): rebinds `response`, which earlier held the CountVectorizer output.
response = tfidf.transform(result['The Lord of the Rings: The Fellowship of the Ring'])

In [45]:
# Vocabulary terms, positioned by their column index in the TF-IDF matrix.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); switch when upgrading.
feature_array = tfidf.get_feature_names()

In [35]:
# Column index of the term with the lowest IDF weight.
np.argmin(tfidf.idf_)

13954

In [49]:
# Look up the term at the index printed by the np.argmin cell.
# NOTE(review): 13954 is hard-coded from that output and goes stale whenever
# the fitted vocabulary changes.
feature_array[13954]

'monotonous'

In [50]:
# Rank every stored (column, value) entry of the TF-IDF matrix by value.
# NOTE(review): sort_coo is defined in the following cell; on a fresh kernel
# that definition has to run before this line or it raises NameError.
sorted_items=sort_coo(response.tocoo())

In [51]:
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Pairs are read from ``coo_matrix.col`` and ``coo_matrix.data``. Entries
    with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Reversing a (column, value) pair gives the (value, column) sort key.
    pairs.sort(key=lambda pair: pair[::-1], reverse=True)
    return pairs

In [52]:
# Print the terms behind the 20 highest-scoring entries. The same term can
# appear more than once because each entry is ranked separately.
top_entries = sorted_items[:20]
for i, j in top_entries:
    term = feature_array[i]
    print(term)

sam
walk
rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
awesome
10
gratest
chidhood
walk
10
100
definitly
10
figh
3hour
greta
dead
00s
favourites
engaging
verion


In [None]:
feature_array = np.array(tfidf.get_feature_names())

# Rank terms by their total TF-IDF weight over all rows of `response`.
# Sorting the flattened per-row argsort only reflected the last row when
# `response` held several documents, and required densifying the whole matrix.
# For a single-row `response` the ranking is unchanged.
term_scores = np.asarray(response.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(term_scores)[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]

In [None]:

t = """Two Travellers, walking in the noonday sun, sought the shade of a widespreading tree to rest. As they lay looking up among the pleasant leaves, they saw that it was a Plane Tree.

"How useless is the Plane!" said one of them. "It bears no fruit whatever, and only serves to litter the ground with leaves."

"Ungrateful creatures!" said a voice from the Plane Tree. "You lie here in my cooling shade, and yet you say I am useless! Thus ungratefully, O Jupiter, do men receive their blessings!"

Our best blessings are often the least appreciated."""

# NOTE(review): splitting on single spaces turns every space-delimited chunk
# into its own "document" - confirm that this is the intended granularity.
# This also refits the shared `tfidf` vectorizer, replacing the vocabulary
# learned from the movie reviews.
tfs = tfidf.fit_transform(t.split(" "))

# Named `query_text` rather than `str` so the builtin `str` stays usable in
# every cell that runs afterwards.
query_text = 'tree cat travellers fruit jupiter'
response = tfidf.transform([query_text])
feature_names = tfidf.get_feature_names()