In [1]:
!pip install --user -U nltk

Collecting nltk
  Using cached nltk-3.6.5-py3-none-any.whl (1.5 MB)
  Using cached nltk-3.6.3-py3-none-any.whl (1.5 MB)


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.conf import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import col

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.corpus import subjectivity
from nltk.corpus import stopwords
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the NLTK resources used further down in this notebook.
# 'punkt': tokenizer models needed by tokenize.word_tokenize.
nltk.download('punkt')
# 'wordnet': lexical database behind WordNetLemmatizer.
nltk.download('wordnet')
# 'stopwords': word lists read through stopwords.words('english').
nltk.download('stopwords')
# 'averaged_perceptron_tagger': model used by pos_tag.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Spark settings: the master to connect to and the name this application runs under.
spark_conf = (
    SparkConf()
    .set('spark.master', 'spark://spark-master:7077')
    .set('spark.app.name', 'spark-engsoft37')
)

# Obtain the session for this notebook, created with the configuration above if none exists yet.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/30 22:46:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the CSV file

# Explicit schema for the comments file; every column is declared nullable.
schema = StructType([
    StructField("type", StringType(), True),
    StructField("id", StringType(), True),
    StructField("subreddit.id", StringType(), True),
    StructField("subreddit.name", StringType(), True),
    StructField("subreddit.nsfw", StringType(), True),
    StructField("created_utc", IntegerType(), True),
    StructField("permalink", StringType(), True),
    StructField("body", StringType(), True),
    StructField("sentiment", FloatType(), True),
    StructField("score", IntegerType(), True),
])

# The schema above is passed explicitly, so schema inference is not requested as well:
# combining .schema(...) with the "inferSchema" option is redundant.
# NOTE(review): free-text "body" values may contain embedded newlines/quotes; if rows come
# out misaligned, the reader's "multiLine" and "escape" options are the place to look.
# Confirm against the actual file before relying on the parsed columns.
dataframe = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("delimiter", ",")
    .csv("the-reddit-covid-dataset-comments.csv")
)

In [5]:
# Keep only the comment text, drop rows that have no body, and expose it as "comment".
df_body = dataframe.select("body").filter(col("body").isNotNull())
df_comment = df_body.select(col("body").alias("comment"))
# Show one row as a sanity check.
df_comment.take(1)

                                                                                

[Row(comment='When you scheduled your booster with CVS does it just give you the option of Vaccines: COVID-19 (Vaccine brand) or does it specifically say booster')]

In [6]:
# https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

# Build a list of words from a text

def tokenize_string(row):
    """Split the comment text of ``row`` into tokens.

    Args:
        row: object exposing the text to tokenize as its ``comment`` attribute.

    Returns:
        The result of ``tokenize.word_tokenize`` for that text. No filtering is
        applied here, so tokens made of special characters are kept.
    """
    comment_text = row.comment
    return tokenize.word_tokenize(comment_text)

In [7]:
# Tokenize every comment row; each element of the resulting RDD is one comment's token list.
transform_1 = df_comment.rdd.map(tokenize_string)

In [None]:
# Preview the first tokenized comment.
transform_1.take(1)

In [None]:
# Normalizar os dados

# def lemmatize_sentence(tokens):
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_sentence = []
#     for word, tag in pos_tag(tokens):
#         if tag.startswith('NN'):
#             pos = 'n'
#         elif tag.startswith('VB'):
#             pos = 'v'
#         else:
#             pos = 'a'
#         lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
#     return lemmatized_sentence 

In [None]:
# transform_2 = transform_1.map(lemmatize_sentence)

In [None]:
# transform_2.take(1)

In [8]:
# Normalize the data and remove unnecessary words
import re
import string

# URL matcher. The pattern is character-for-character the same as before; it is only written
# with raw strings so that "\(" and "\)" are no longer invalid string escape sequences.
_URL_PATTERN = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# "@name"-style mentions.
_MENTION_PATTERN = r"(@[A-Za-z0-9_]+)"


def remove_noise(tokens, stop_words = ()):
    """Clean and normalize the tokens of one comment.

    For each token: strip URLs and "@name" mentions, lemmatize it using a WordNet
    part of speech derived from its POS tag, and lowercase it. Tokens that end up
    empty, that consist only of punctuation characters, or whose lowercase form is
    a stop word are dropped.

    Args:
        tokens: list of string tokens belonging to a single comment.
        stop_words: iterable of lowercase words to discard (default: none).

    Returns:
        List of cleaned, lowercased tokens in their original order.
    """
    # Set lookups instead of scanning the caller's list for every token.
    stop_words = frozenset(stop_words)
    punctuation = frozenset(string.punctuation)
    # One lemmatizer per call rather than one per token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        token = re.sub(_URL_PATTERN, '', token)
        token = re.sub(_MENTION_PATTERN, '', token)

        # Map the tagger's tag onto the part-of-speech codes passed to lemmatize():
        # nouns -> 'n', verbs -> 'v', everything else is treated as an adjective.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # Drop empty tokens and tokens made up solely of punctuation characters.
        # A plain `token in string.punctuation` is a substring test and lets tokens
        # such as "..." or "''" through.
        if not token or punctuation.issuperset(token):
            continue

        lowered = token.lower()
        if lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [9]:
from nltk.corpus import stopwords
# English stop-word list, handed to remove_noise so that those words are dropped.
stop_words = stopwords.words('english')

# Clean every tokenized comment; each `row` here is one comment's token list from transform_1.
transform_2 = transform_1.map(lambda row: remove_noise(row, stop_words))

In [None]:
# Preview the first cleaned comment.
transform_2.take(1)

In [None]:
# Obtendo a frequência em que as palavras aparecem

# def get_all_words(cleaned_tokens_list):
#     for tokens in cleaned_tokens_list:
#         for token in tokens:
#             yield token

In [10]:
# Take the first 10000 cleaned comments; everything below works on this sample only.
cleaned_words = transform_2.take(10000)

                                                                                

In [None]:
# all_pos_words = get_all_words(cleaned_words)

In [None]:
# from nltk import FreqDist

# freq_dist_pos = FreqDist(all_pos_words)

In [None]:
# print(freq_dist_pos.most_common(30))

In [11]:
# Convert the tokens into a dictionary

def get_comments_for_model(cleaned_tokens_list):
    """Lazily turn token lists into feature dictionaries.

    Args:
        cleaned_tokens_list: iterable in which each item is the token list of one comment.

    Yields:
        One dict per comment, mapping each distinct token to ``True``.
    """
    for comment_tokens in cleaned_tokens_list:
        yield dict.fromkeys(comment_tokens, True)

In [12]:
# NOTE: get_comments_for_model is a generator function, so comments_model can be
# iterated only once; re-run this cell before consuming it again.
comments_model = get_comments_for_model(cleaned_words)

In [13]:
sentim_analyzer = SentimentAnalyzer()
# Apply mark_negation to every cleaned comment, then collect the words of all the
# resulting documents through the analyzer.
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in cleaned_words])

In [None]:
# all_words_neg

In [16]:
# Build the unigram (n-gram) features and extract them

# min_freq filters out rare unigrams (see unigram_word_feats for the exact threshold rule).
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq = 4)
# NOTE(review): this registers an extractor on the analyzer each time the cell runs;
# presumably re-running it without recreating sentim_analyzer stacks duplicates — confirm.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigram_feats)
# cleaned_words holds plain token lists with no labels attached. Left to auto-detect,
# NLTK's apply_features treats list/tuple items as (tokens, label) pairs, which would
# misread each comment's first two tokens, so the unlabeled case is stated explicitly.
training_set = sentim_analyzer.apply_features(cleaned_words, labeled=False)

In [20]:
# Create the training and test datasets

# NOTE(review): every example receives the same empty-string label, so a classifier trained
# on this data has a single class to predict and its accuracy on the held-out slice is 1.0
# by construction. Real labels are needed before the evaluation means anything.
#
# The feature dicts are rebuilt from cleaned_words on every run: a generator created in an
# earlier cell is exhausted after its first use, which would leave this dataset empty when
# the cell is executed again.
dataset = [(comment_dict, "") for comment_dict in get_comments_for_model(cleaned_words)]

# 70/30 split. With 10000 collected comments this is the same cut at 7000 as before, and it
# still leaves a non-empty test slice when fewer comments are available.
split_at = len(dataset) * 7 // 10
train_data = dataset[:split_at]
test_data = dataset[split_at:]

In [21]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Fit a Naive Bayes classifier on the (feature dict, label) pairs in train_data.
classifier = NaiveBayesClassifier.train(train_data)

In [22]:
# Accuracy of the classifier on the held-out examples.
# NOTE(review): every example in the dataset carries the label "", so a result of 1.0 here
# is trivial and says nothing about sentiment quality.
classify.accuracy(classifier, test_data)

1.0

In [None]:
# Print the 10 features the classifier ranks as most informative.
classifier.show_most_informative_features(10)