# Large Movie Review Dataset

The Large Movie Review Dataset is a corpus of 50,000 movie reviews from IMDB that have been classified as either positive or negative. More information about the dataset can be found at https://ai.stanford.edu/~amaas/data/sentiment/. 

I am going to create a tokenizer on sentiment score based on the words used in the movie reviews, so we can analysis whether the reviews are positive or negative.

In [None]:
import urllib.request, json 
imdb_corpus = []
with urllib.request.urlopen("https://storage.googleapis.com/wd13/IMDBReviewSent.txt") as url:
  for line in url.readlines():
    imdb_corpus.append(line.decode().split('\t'))

In [None]:
# print the text and label of document 16
docid = 16
print(imdb_corpus[docid])

['positive', "Some films just simply should not be remade. This is one of them. In and of itself it is not a bad film. But it fails to capture the flavor and the terror of the 1963 film of the same title. Liam Neeson was excellent as he always is, and most of the cast holds up, with the exception of Owen Wilson, who just did not bring the right feel to the character of Luke. But the major fault with this version is that it strayed too far from the Shirley Jackson story in it's attempts to be grandiose and lost some of the thrill of the earlier film in a trade off for snazzier special effects. Again I will say that in and of itself it is not a bad film. But you will enjoy the friction of terror in the older version much more.\n"]


In [None]:
# print the label of document 16
docid = 16
print(imdb_corpus[docid][0])

positive


In [None]:
# print the text of document 16
docid = 16
print(imdb_corpus[docid][1])

Some films just simply should not be remade. This is one of them. In and of itself it is not a bad film. But it fails to capture the flavor and the terror of the 1963 film of the same title. Liam Neeson was excellent as he always is, and most of the cast holds up, with the exception of Owen Wilson, who just did not bring the right feel to the character of Luke. But the major fault with this version is that it strayed too far from the Shirley Jackson story in it's attempts to be grandiose and lost some of the thrill of the earlier film in a trade off for snazzier special effects. Again I will say that in and of itself it is not a bad film. But you will enjoy the friction of terror in the older version much more.



# Create a tokenizer

In [None]:
import re
def tokenize(doc):
  emoti_list = [':)','(:',':(','):',':D','D:',':P','P:',':V','V:',':/','/:',':\\','\\:',':|','|:',
                ';)','(;',';(',');',';D','D;',';P','P;',';V','V;',';/','/;',';\\','\\;',';|','|;',
                ':-)','(-:',':-(',')-:',':-D','D-:',':-P','P-:',':-V','V-:',':-/','/-:',':-\\',
                '\\-:',':-|','|-:',';-)','(-;',';-(',')-;',';-D','D-;',';-P','P-;',';-V','V-;',
                ';-/','/-;',';-\\','\\-;',';-|','|-;']
  tokenizer_pattern = re.compile('|'.join([
      '|'.join([re.escape(e) for e in emoti_list]),
      "[A-Za-z]+(?:['-_\.][A-Za-z]+)?",
      '\.\.+'
      ]))
  tokens = tokenizer_pattern.findall(doc)
  for i in range(0,len(tokens)):
    if re.match('\.\.+',tokens[i]):
      tokens[i] = '..+'
    else:
      tokens[i] = tokens[i].lower()
  return(tokens)

In [None]:
# Import log function
from math import log
log(1)

0.0

# Create a lexicon

In [None]:
#Calculate sentiment scores for every token in the corpus
positive_count = 0
negative_count = 0

token_positive_count = {}
token_negative_count = {}

unique_tokens = set()

for doc in imdb_corpus:

  label = doc[0]
  tokens = tokenize(doc[1])

  if label=='positive':
    positive_count += 1
  else:
    negative_count += 1

  for token in set(tokens):
    unique_tokens.add(token)
    if label=='positive':
      if token not in token_positive_count:
        token_positive_count[token] = 0
      token_positive_count[token] += 1
    else:
      if token not in token_negative_count:
        token_negative_count[token] = 0
      token_negative_count[token] += 1


lexicon = {}

for token in lexicon:
  if token not in token_positive_count or token not in token_negative_count:
    continue
  lexicon[token] = log((token_positive_count[token]/positive_count)/(token_negative_count[token]/negative_count))

In [None]:
 #Create a score message function
 def score_message(doc):
  score = 0
  for token in set(tokenize(doc)):
    if token in lexicon:
      score += lexicon[token]
  return(score)

# Live Twitter Data Dataset



> I have downloaded the live tweets data by Twitter API bearer token through Twitter official. I am going to extract the recent 1000 tweets related to Netflix & Disney plus, create a tokenizer to calculate the sentiment score of the tweets, in order to generate the summary statistics.



In [2]:
from google.colab import files
uploaded_files = files.upload()
twitter_bearer_token = uploaded_files['twitter_bearer_token.txt'].decode()

Saving twitter_bearer_token.txt to twitter_bearer_token.txt


In [17]:
#query your search query
#bearer_token your Twitter API bearer token
#number_of_tweets the number of tweets you want to return
import requests
def get_tweets(query,bearer_token,number_of_tweets):
  tweets = []
  next_token = None
  while len(tweets)<number_of_tweets:
    response = requests.get(
        url = 'https://api.twitter.com/2/tweets/search/recent',
        params = {
          'query':query,
          'next_token':next_token},
        headers = {'authorization' : 'bearer '+bearer_token} 
        )
    response_json = response.json()
    for tweet in response_json['data']:
      tweets.append(tweet)
    if 'next_token' not in response_json['meta']:
      break
    next_token = response_json['meta']['next_token']
  return(tweets)

In [18]:
tweets = get_tweets('NETFLIX OR DISNEY+',twitter_bearer_token,100)

KeyError: ignored

In [None]:
tweets

In [None]:
import json
with open('tweets.json','w') as f:
  f.write(json.dumps(tweets))
files.download('tweets.json')

In [None]:
import json
from google.colab import files
uploaded_files = files.upload()
tweets = json.loads(uploaded_files['tweets.json'])

In [None]:
tweets[0]

{'edit_history_tweet_ids': ['1623404827190476801'],
 'id': '1623404827190476801',
 'text': "RT @ToughPigs: Today on ToughPigs, we're sharing the incredible Muppet fan art of @KOMakesThings! Featuring Miss Piggy as a bunch of Disney…"}

# Analyze Twitter Data

In [None]:
#install lexicon

from google.colab import files
uploaded_files = files.upload()
lexicon_file = uploaded_files['lexicon.txt'].decode()
lexicon = {}
for line in lexicon_file.split('\n'): 
  split_line = line.split('\t')
  token = split_line[0]
  score = float(split_line[1])
  lexicon[token] = score 


Saving lexicon.txt to lexicon.txt


In [None]:
#sentiment score 
def sentiment_score(tweets):
  score = 0
  for token in set(tokenize(tweets)):
    if token in lexicon:
      score += lexicon[token]
  return(score)

In [None]:
Tweet_live_text = [tweet['text'] for tweet in tweets]

In [None]:
Tweet_live_text[0]

"RT @ToughPigs: Today on ToughPigs, we're sharing the incredible Muppet fan art of @KOMakesThings! Featuring Miss Piggy as a bunch of Disney…"

In [None]:
netflix_total_count = 0
netflix_positive_count = 0
netflix_negative_count = 0

disneyplus_total_count = 0
disneyplus_positive_count = 0
disneyplus_negative_count = 0
for tweet in tweets:
  tweet_sentiment_score = sentiment_score(tweet['text'])
  tweet_tokens = set(tokenize(tweet['text']))
  if {'netflix'}.intersection(tweet_tokens):
    netflix_total_count += 1
    if tweet_sentiment_score>=0:
      netflix_positive_count += 1
    else:
      netflix_negative_count += 1
  # print(tweet_tokens)
  if {'disney'}.intersection(tweet_tokens):
    disneyplus_total_count += 1
    if tweet_sentiment_score>=0:
      disneyplus_positive_count += 1
    else:
      disneyplus_negative_count += 1

In [None]:
#Calculate the summary statistics
Netflix_summary = {
    'NTweets' : netflix_total_count,
    'ShareofVoice' : 100*netflix_total_count/(netflix_total_count+disneyplus_total_count),
    'PositivePct' : 100*netflix_positive_count/netflix_total_count,
    'NegativePct' : 100*netflix_negative_count/netflix_total_count,
    'NetPositivePct' : 100*(netflix_positive_count-netflix_negative_count)/netflix_total_count
}
Disneyplus_summary = {
    'NTweets' : disneyplus_total_count,
    'ShareofVoice' : 100*disneyplus_total_count/(netflix_total_count+disneyplus_total_count),
    'PositivePct' : 100*disneyplus_positive_count/disneyplus_total_count,
    'NegativePct' : 100*disneyplus_negative_count/disneyplus_total_count,
    'NetPositivePct' : 100*(disneyplus_positive_count-disneyplus_negative_count)/disneyplus_total_count
}


print("")
print(' | '+
      'Channels          | '+
      '# Tweets       | '+
      'Share of Voice | '+
      'Positive %     | '+
      'Negative %     | '+
      'Net Positive % | ')
print("")
print((" | "+
      "Netflix         | "+
      "{NTweets:5.0f}          | "+
      "{ShareofVoice:5.2f}%         | "+
      "{PositivePct:2.2f}%         | "+
      "{NegativePct:2.2f}%         | "+
      "{NetPositivePct:2.2f}%         | ").format(**Netflix_summary))
print((" | "+
      "Disneyplus  | "+
      "{NTweets:5.0f}          | "+
      "{ShareofVoice:5.2f}%         | "+
      "{PositivePct:2.2f}%         | "+
      "{NegativePct:2.2f}%         | "+
      "{NetPositivePct:2.2f}%         | ").format(**Disneyplus_summary))
print("")


 | Channels          | # Tweets       | Share of Voice | Positive %     | Negative %     | Net Positive % | 

 | Netflix         |    63          | 73.26%         | 52.38%         | 47.62%         | 4.76%         | 
 | Disneyplus  |    23          | 26.74%         | 56.52%         | 43.48%         | 13.04%         | 

