In [1]:
!pip install --user -U nltk

Collecting nltk
  Using cached nltk-3.6.5-py3-none-any.whl (1.5 MB)
  Using cached nltk-3.6.3-py3-none-any.whl (1.5 MB)


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.conf import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import col

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.corpus import subjectivity
from nltk.corpus import stopwords
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
spark_conf = SparkConf()

spark_conf.setAll([
    ('spark.master', 'spark://spark-master:7077'),
    ('spark.app.name', 'spark-engsoft37'),
])

spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/30 23:48:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark_context = spark.sparkContext

In [5]:
# Obtendo dataset para criar modelo de análise de sentimento
from nltk.corpus import twitter_samples

# positive_tweets = twitter_samples.strings('positive_tweets.json')
# negative_tweets = twitter_samples.strings('negative_tweets.json')
# text = twitter_samples.strings('tweets.20150430-223406.json')
# tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [6]:
import re, string

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_token = spark_context.parallelize(twitter_samples.tokenized('positive_tweets.json'))
negative_token = spark_context.parallelize(twitter_samples.tokenized('negative_tweets.json'))

# positive_cleaned_tokens_list = []
# negative_cleaned_tokens_list = []

# for tokens in positive_tweet_tokens:
#     positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

# for tokens in negative_tweet_tokens:
#     negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

positive_cleaned_tokens_list = positive_token.map(lambda r: remove_noise(r, stop_words))
negative_cleaned_tokens_list = negative_token.map(lambda r: remove_noise(r, stop_words)) 

In [9]:
positive_cleaned_tokens_list.take(1)

                                                                                

[['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']]

In [10]:
negative_cleaned_tokens_list.take(1)

                                                                                

[['hopeless', 'tmr', ':(']]

In [11]:
# Determining Word Density

# def get_all_words(cleaned_tokens_list):
#     for tokens in cleaned_tokens_list:
#         for token in tokens:
#             yield token

# all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [12]:
# from nltk import FreqDist

# freq_dist_pos = FreqDist(all_pos_words)
# print(freq_dist_pos.most_common(30))

In [13]:
# Preparing Data for the Model
### Converting Tokens to a Dictionary

def get_tweets_for_model(cleaned_tokens_list):
    #for tweet_tokens in cleaned_tokens_list:
    return dict([token, True] for token in cleaned_tokens_list)

positive_tokens_for_model = positive_cleaned_tokens_list.map(get_tweets_for_model)
negative_tokens_for_model = negative_cleaned_tokens_list.map(get_tweets_for_model)

In [14]:
positive_tokens_for_model.take(1)

                                                                                

[{'#followfriday': True,
  'top': True,
  'engage': True,
  'member': True,
  'community': True,
  'week': True,
  ':)': True}]

In [15]:
negative_tokens_for_model.take(1)

                                                                                

[{'hopeless': True, 'tmr': True, ':(': True}]

In [16]:
### Splitting the Dataset for Training and Testing the Model
import random

def create_dataset(tokens_list: list, sentiment: str):
    # (token_dict, sentiment) for token_dict in tokens_list
    return (tokens_list, sentiment)

# positive_dataset = [(tweet_dict, "Positive")
#                      for tweet_dict in positive_tokens_for_model]

# negative_dataset = [(tweet_dict, "Negative")
#                      for tweet_dict in negative_tokens_for_model]

positive_dataset = positive_tokens_for_model.map(lambda r: create_dataset(r, "Positive"))
negative_dataset = negative_tokens_for_model.map(lambda r: create_dataset(r, "Negative"))

# dataset = positive_dataset + negative_dataset

# random.shuffle(dataset)

# train_data = dataset[:7000]
# test_data = dataset[7000:]

In [17]:
positive_dataset.take(1)

                                                                                

[({'#followfriday': True,
   'top': True,
   'engage': True,
   'member': True,
   'community': True,
   'week': True,
   ':)': True},
  'Positive')]

In [18]:
negative_dataset.take(1)

                                                                                

[({'hopeless': True, 'tmr': True, ':(': True}, 'Negative')]

In [19]:
dataset = positive_dataset.union(negative_dataset).take(10000)

                                                                                

In [20]:
random.shuffle(dataset)

In [21]:
train_data = dataset[:7000]
test_data = dataset[7000:]

In [22]:
train_data[6999]

({'yisss': True,
  '#bindingofissac': True,
  'wait': True,
  '#xboxone': True,
  'release': True,
  ':)': True,
  'well': True,
  'console': True},
 'Positive')

In [23]:
test_data[0]

({'aw': True,
  ':/': True,
  'angeke': True,
  'sbali': True,
  'euuuwwww': True,
  'lunch': True,
  'like': True,
  'real': True,
  'construction': True,
  'worker': True,
  ':-(': True},
 'Negative')

In [24]:
# Building and Testing the Model

from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9973333333333333
Most Informative Features
                      :) = True           Positi : Negati =    973.9 : 1.0
                     sad = True           Negati : Positi =     58.9 : 1.0
                follower = True           Positi : Negati =     36.6 : 1.0
                     bam = True           Positi : Negati =     20.8 : 1.0
                     x15 = True           Negati : Positi =     15.8 : 1.0
              appreciate = True           Positi : Negati =     14.2 : 1.0
               community = True           Positi : Negati =     14.2 : 1.0
                followed = True           Negati : Positi =     13.5 : 1.0
                    sick = True           Negati : Positi =     13.5 : 1.0
                 welcome = True           Positi : Negati =     12.0 : 1.0
None


In [25]:
# Aplicando o modelo em comentário de posts do reddit

# Lendo o arquivo csv
schema = StructType([
        StructField("type", StringType(), True),
        StructField("id", StringType(), True),
        StructField("subreddit.id", StringType(), True),
        StructField("subreddit.name", StringType(), True),
        StructField("subreddit.nsfw", StringType(), True),
        StructField("created_utc", IntegerType(), True),
        StructField("permalink", StringType(), True),
        StructField("body", StringType(), True),
        StructField("sentiment", FloatType(), True),
        StructField("score", IntegerType(), True),
    ])

dataframe = spark\
        .read\
        .schema(schema) \
        .option("header", True) \
        .option("inferSchema", True) \
        .option("delimiter", ",") \
        .csv("the-reddit-covid-dataset-comments.csv")

In [26]:
df_body = dataframe.select("body").where(col("body").isNotNull())
df_comments = df_body.withColumnRenamed("body", "comment")
df_comments.take(1)

                                                                                

[Row(comment='When you scheduled your booster with CVS does it just give you the option of Vaccines: COVID-19 (Vaccine brand) or does it specifically say booster')]

In [27]:
def tokenize_string(row):
    # words = tokenize.word_tokenize(row.comment) # gera uma lista de palavras
    # new_words = [word for word in words if word.isalnum()] # remove caracteres especiais
    return tokenize.word_tokenize(row.comment)

In [28]:
comments_tokenized = df_comments.rdd.map(tokenize_string)

In [29]:
comments_tokenized.take(1)

                                                                                

[['When',
  'you',
  'scheduled',
  'your',
  'booster',
  'with',
  'CVS',
  'does',
  'it',
  'just',
  'give',
  'you',
  'the',
  'option',
  'of',
  'Vaccines',
  ':',
  'COVID-19',
  '(',
  'Vaccine',
  'brand',
  ')',
  'or',
  'does',
  'it',
  'specifically',
  'say',
  'booster']]

In [30]:
comments_removed_noises = comments_tokenized.map(remove_noise)

In [31]:
comments_removed_noises.take(1)

                                                                                

[['when',
  'you',
  'schedule',
  'your',
  'booster',
  'with',
  'cvs',
  'do',
  'it',
  'just',
  'give',
  'you',
  'the',
  'option',
  'of',
  'vaccines',
  'covid-19',
  'vaccine',
  'brand',
  'or',
  'do',
  'it',
  'specifically',
  'say',
  'booster']]

In [32]:
def classify_comment(comment):
    token = dict([token, True] for token in comment)
    sentiment = classifier.classify(token)
    return (comment, sentiment)
    
comments_sentiments = comments_removed_noises.map(classify_comment)

In [33]:
comments_sentiments.take(2)

                                                                                

[(['when',
   'you',
   'schedule',
   'your',
   'booster',
   'with',
   'cvs',
   'do',
   'it',
   'just',
   'give',
   'you',
   'the',
   'option',
   'of',
   'vaccines',
   'covid-19',
   'vaccine',
   'brand',
   'or',
   'do',
   'it',
   'specifically',
   'say',
   'booster'],
  'Negative'),
 (['did',
   "n't",
   'stop',
   'price',
   'there',
   'though',
   'new',
   'zealand',
   'and',
   'canada',
   'grow',
   'at',
   'about',
   'the',
   'same',
   'rate',
   'through',
   'covid'],
  'Negative')]