In [51]:
# trovare le parole più frequenti per ogni sentimento
import json
import re
from typing import List, Dict
import numpy as np

import nltk

from src.Tweet import TweetInfo, Tweet, lex_resources_directory, tweets_directory

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
# Se ne abbiamo voglia possiamo mettere le emoticons e gli emoji su file e per poi leggerli

import os
from typing import List, Dict, Set

import pymongo
import mariadb
import sys

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Class used to store information about each tweet: its sentiment e how many words of which sentiment are in that tweet

In [54]:
class TweetInfo:
    sentiment: str
    sentiment_occurrences: Dict[str, int]

    def __init__(self, sentiment: str, sentiments_for_words: List[str]):
        self.sentiment = sentiment

        self.sentiment_occurrences = {}
        for sentiment_for_words in sentiments_for_words:
            # initiate every word sentiment occurrence to 0
            self.sentiment_occurrences[sentiment_for_words] = 0

    def increase_sentiment_counter(self, sentiment: str):
        self.sentiment_occurrences[sentiment] = self.sentiment_occurrences.get(sentiment) + 1

    def print_tweet_info(self):
        print("tweet sentiment: ", self.sentiment)
        for word_sentiment in self.sentiment_occurrences:
            print("\t word sentiment: ", word_sentiment)
            print("\t occurrences: ", self.sentiment_occurrences[word_sentiment])

    def get_sentiment(self) -> str:
        return sentiment


In [55]:
class LexicalResource:
    filename: str
    sentiment: str
    word_list: List[str]

    def __init__(self, filename: str, sentiment: str):
        self.filename = filename
        self.sentiment = sentiment
        self.word_list = []

    def __str__(self):
        lex_res_string = "LexicalResource: " + self.filename + \
                         "\n\t sentiment: " + self.sentiment + \
                         "\n\t wordlist: " + self.word_list.__str__()
        return lex_res_string

    def add_word(self, word: str):
        if not '_' in word:
            self.word_list.append(word)

    def add_word_list(self, word_list: List[str]):
        for word in word_list:
            if not '_' in word:
                self.word_list.append(word)

    def get_number_of_words(self) -> int:
        return len(self.word_list)


## Pipeline

In [56]:
class Tweet:
    index: int
    text: str
    hashtags: List[str]
    emojis: List[str]
    emoticons: List[str]
    tokens: List[str]
    words: List[str]
    pos_tags: Dict[str, str]
    tweet_stem_count: TweetInfo
    word_frequency: Dict[str, int] = {}
    sentiment: str

    def __init__(self, tweet_raw: str, index: int, sentiment: str):
        self.index = index
        self.text = tweet_raw
        self.anonymize()
        self.read_hashtags()
        self.read_emojis()
        self.read_emoticons()
        self.to_lower()
        self.tokenize()
        self.process_slangs()
        self.pos_tagging()
        self.remove_punctuation()
        self.lemming()
        self.remove_stop_words()
        self.count_words_frequency()
        self.sentiment = sentiment

    # TODO fai il to string al posto del print tweet
    def __str__(self):
        tweet_string = "Tweet\n"
        tweet_string = tweet_string + "\ttweet raw: " + self.text
        tweet_string = tweet_string + "\tpos tags: " + json.dumps(self.pos_tags)
        tweet_string = tweet_string + "\n"
        return tweet_string

    def read_hashtags(self) -> None:
        self.hashtags = re.findall(r"#(\w+)", self.text)

    def read_emojis(self) -> None:
        self.emojis = get_elems_from_text_if_in_list(self.text, EMOJIS)

    def read_emoticons(self) -> None:
        self.emoticons = get_elems_from_text_if_in_list(self.text, EMOTICONS)

    def anonymize(self) -> None:
        self.text = self.text.replace("USERNAME", "").replace("URL", "")

    def to_lower(self) -> None:
        self.text = self.text.lower()

    def tokenize(self) -> None:
        # Questa funzione mi sa che non andava bene, poi vediamo
        self.tokens = nltk.word_tokenize(self.text)
        # self.tokens = sent_tokenize(self.text)

    def pos_tagging(self) -> None:
        pos_tag_list = nltk.pos_tag(self.tokens)

        # get keys of pos tag list
        pos_tag_keys = [pos_tag[0] for pos_tag in pos_tag_list]
        # get values of pos tag list
        pos_tag_values = [pos_tag[1] for pos_tag in pos_tag_list]
        # create dictionary
        pos_tag_dict: Dict[str, str] = {pos_tag_keys[i]: pos_tag_values[i] for i in range(len(pos_tag_keys))}

        self.pos_tags = pos_tag_dict

    def remove_punctuation(self) -> None: # rimuove anche emoticons, giusto?
        # Removes every character besides lower and uppercase letters, numbers and spaces
        # self.text = re.sub(r'[^a-zA-Z0-9 ]', '', self.text)
        for tag_key in list(self.pos_tags.keys()):
            if tag_key in PUNCTUATION_MARKS:
                # tag key is a punctuation mark, so remove from pos tagging list
                del self.pos_tags[tag_key]

    def print_tweet(self) -> None:
        print("tweet raw: ", self.text)
        print("pos tagging: ", self.pos_tags)
        # print("\n\ttokens ", self.tokens)
        # print("\n\thashtag_list ", self.hashtags)
        # print("\n\temoji_list ", self.emojis)
        # print("\n\temoticon_list ", self.emoticons)
        # print("\n\twords_list ", self.get_words())

    def lemming(self) -> None:
        lemmatizer = WordNetLemmatizer()
        tweet_words_lemmatized: List[str] = []

        tweet_words = self.get_words()
        for word in tweet_words:
            tweet_words_lemmatized.append(lemmatizer.lemmatize(word))

    def remove_stop_words(self) -> None:

        # TODO implement this
        stop_words = set(stopwords.words('english'))

        tokens = self.tokens
        for w in tokens:
            if w in stop_words:
                tokens.remove(w)
        self.words = tokens

        pos_tags = self.pos_tags # removing stop words from pos tags
        for w in pos_tags:
            if w in stop_words:
                self.pos_tags = removekey(self.pos_tags, w)


    def process_slangs(self) -> None:
        for slang in SLANGS:
            # replaces the slang with the extension for every slang in the text
            self.text = self.text.replace(slang, SLANGS[slang])

    def count_words_frequency(self):
        words = self.words
        for word in words:
            self.word_frequency[word] = words.count(word)

    # Support functions

    def get_words(self) -> List[str]:
        return self.text.split()

def removekey(d, key):
    r = dict(d)
    del r[key]
    return r

In [57]:
def get_elems_from_text_if_in_list(text: str, list: List[str]) -> List[str]:
    """
    Return list of substrings of text that appear in list
    """
    matches = []
    words = text.split()
    for word in words:
        if word in list:
            matches.append(word)

    return matches

## (function) Read files in directory
General function to read text files from a directory and merge them

In [58]:
lex_resources_list: List[LexicalResource] = []

def read_texts_in_directory(directory_path: str, sentiment: str) -> List[str]:
    files_text_list: List[str] = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            file = open(file=file_path, encoding="utf8")
            file_text = file.read().split() # list of words for a single lex resource of a sentiment
            files_text_list = files_text_list + file_text

            lex_res: LexicalResource = LexicalResource(filename, sentiment)
            lex_res.add_word_list(file_text)
            global lex_resources_list
            lex_resources_list.append(lex_res)

    # print(len(lex_resources_list))
    # [print(i) for i in lex_resources_list]
    return files_text_list

## (function) Read lexical resources for a sentiment
Function which reads all the lexical resources for a sentiment
The directory containing all lexical resources for that sentiment is passed as parameter
Returns a set of the words in all the lexical resources of a sentiment
### forse creare per ogni lex res di OGNI sentimento un dizionario diverso? Bisogna vedere come caricare i dati su db, bisogna caricare ogni lex res diversa di ogni sentimento sul db

In [59]:
def read_lex_resources_sentiment(sentiment_lex_resources_directory: str, sentiment: str) -> Set[str]:
    resource_words: Set[str] = set()
    resources_text: List[str] = read_texts_in_directory(sentiment_lex_resources_directory, sentiment)
    for word in resources_text:
        if not '_' in word:
            resource_words.add(word)
    #print(sentiment, "\n", resource_words, "\n\n")
    return resource_words

### Read all lexical resources

Reads all the lexical resources and returns a dictionary of word to sentiment

In [60]:
sentiment_lex_resources: Dict[str, str] = {}

for resources_path, sentiments, _ in os.walk(lex_resources_directory):
    # The folders inside the lexical resources folder are named after a sentiment (Ex. Anger, Joy), each of them contain some files and each of them is a list of words that are associated with that sentiment
    """WARN Ad ogni ciclo ci sarebbe per forza un singolo sentiment dato che cicla sulla directory delle directory di lex res?"""
    for sentiment in sentiments:
        # iterate each folder (one for sentiment)
        resources_sentiment_path = os.path.join(resources_path, sentiment)
        sentiment_words_set: Set[str] = read_lex_resources_sentiment(resources_sentiment_path, sentiment)

        # read the files containing lists of words, and return a set of all the words in those files
        for sentiment_word in sentiment_words_set:
            # associate each word of the set to the corresponding sentiment
            sentiment_lex_resources[sentiment_word] = sentiment

lex_word_to_sentiment = sentiment_lex_resources
print(lex_word_to_sentiment)

{'pensa': 'Joy', 'angry': 'Anger', 'bubu': 'Anger', 'bu': 'Anger', 'ya': 'Anger', 'bbu': 'Anger', 'no': 'Anger', 'privacy': 'Joy', 'boella': 'Joy', 'yes': 'Joy'}


## Tweet reading

## (function) Reads a file and converts the text to tweets


In [61]:
def read_tweet_file(file_path_string: str, sentiment: str) -> List[Tweet]:
    """
    Reads a file and converts the text to tweets
    :param file_path_string: string of the path to the file
    """

    # tweets read from file
    tweets_read: List[Tweet] = []

    tweets_file = open(file=file_path_string, encoding="utf8")
    tweets_text: List[str] = tweets_file.readlines()

    # For each tweet text create a Tweet object
    i = 0
    for tweet_text in tweets_text:
        i = i + 1
        new_tweet = Tweet(tweet_text, i, sentiment)
        tweets_read.append(new_tweet)

    return tweets_read

Get list of sentiments

In [62]:
sentiments: List[str] = [sentiment for sentiment in os.listdir(lex_resources_directory)]

## Read tweets folder and load Tweet Info for stem counting
The tweets folder contains for each sentiment a file containing tweets of that sentiment. Each file is scanned and for each tweet a TweetInfo object is created in order to maintain the count of how many word of which sentiments are in it

# Ma associare il TweetInfo al tweet senza fare un altro dict?

In [63]:
def get_tweet_sentiment_from_file_name(file_name: str):
    extension_removed = file_name.split(".")[0]
    sentiment = extension_removed.split("_")[-2]
    return sentiment

In [64]:
tweets_to_info: Dict[Tweet, TweetInfo] = {}
for tweets_sentiments_directory, _, tweets_sentiments_filenames in os.walk(tweets_directory):
    # print(tweets_sentiments_directory)
    # print(tweets_sentiments_files)

    for tweets_sentiment_filename in tweets_sentiments_filenames:
        # print(tweets_sentiment_file)
        tweets_sentiment_filepath = os.path.join(tweets_sentiments_directory, tweets_sentiment_filename)
        sentiment = get_tweet_sentiment_from_file_name(tweets_sentiment_filename)
        tweets_for_sentiment: List[Tweet] = read_tweet_file(tweets_sentiment_filepath, sentiment)
        #print("Tweets for sentiment: ", sentiment, "\n")
        for tweet in tweets_for_sentiment:
            tweet_info: TweetInfo = TweetInfo(sentiment, sentiments)
            tweet.tweet_stem_count = TweetInfo
            tweets_to_info[tweet] = tweet_info

## Stem counting
For each tweet and each word of them is checked the sentiment and increased the counter for that sentiment in the TweetInfo object associated

In [65]:
for tweet in tweets_to_info:
    tweet_info = tweets_to_info[tweet]
    tweet_words: List[str] = tweet.get_words()

    for word in tweet_words:
        if word in lex_word_to_sentiment:
            # get the sentiment for the word and increase sentiment counter by 1
            sentiment = lex_word_to_sentiment[word]
            tweet_info.increase_sentiment_counter(sentiment)

### Test print

In [66]:
def print_tweets():
    for tweet in tweets_to_info.keys():
        info = tweets_to_info[tweet]
        print(tweet)
        print("sentiment: " + info.sentiment)
        print("sentiment occurrences: ")
        print(info.sentiment_occurrences)
        print("---")

In [67]:
print_tweets()

Tweet
	tweet raw:  yes the cillo is very chill smoking on the balcony ;)
	pos tags: {"yes": "RB", "cillo": "NN", "chill": "JJ", "smoking": "VBG", "balcony": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 1}
---
Tweet
	tweet raw: wow i'm having fun doing the smoking
	pos tags: {"wow": "NN", "'m": "VBP", "fun": "NN", "smoking": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 0}
---
Tweet
	tweet raw: yea  yea
	pos tags: {"yea": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 0}
---
Tweet
	tweet raw: boss	pos tags: {"boss": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 0}
---
Tweet
	tweet raw: angry pensa is angry sad banana no
	pos tags: {"angry": "JJ", "pensa": "NN", "sad": "JJ", "banana": "NN"}

sentiment: pensa
sentiment occurrences: 
{'Anger': 3, 'Joy': 1}
---
Tweet
	tweet raw: angry boella no pensa kill ;( yoyou ah rip bu
	pos tags: {"angry": "JJ", "boella": "NN", "pensa": "NN", "kill": "NN", "ah": "VB

## Connection to MongoDB

In [68]:
already_connected = False
if not already_connected:
    mongo_client = pymongo.MongoClient("mongodb+srv://Peppino:wHzr79JxnRUgK52@cluster0.zkagq.mongodb.net/?retryWrites=true&w=majority")
    mydb = mongo_client["maadb_tweets"]
    print(mydb.list_collection_names())

coll_list = mydb.list_collection_names()


[print(i) for i in lex_resources_list]

['Tweets', 'LexResourcesWords', 'LexResources']
LexicalResource: angry
	 sentiment: Anger
	 wordlist: ['no', 'pensa', 'angry', 'bu']
LexicalResource: pensa_angry.txt
	 sentiment: Anger
	 wordlist: ['bbu', 'bubu', 'ya', 'bu']
LexicalResource: happy
	 sentiment: Joy
	 wordlist: ['yes', 'boella', 'privacy', 'pensa']


[None, None, None]

### Check in which resources each word is contained
Creates a dictionary <word, lex_res_list> to map each word with the lexical resources which contains the word

In [69]:
map_word_lex_res: Dict[str, List[str]] = {}

for word in lex_word_to_sentiment:
    for lex_res in lex_resources_list:
        if word in lex_res.word_list:
            if map_word_lex_res.get(word) is None:
                map_word_lex_res[word] = [lex_res.filename]
            else:
                map_word_lex_res[word].append(lex_res.filename)

print(map_word_lex_res)

{'bu': ['angry', 'pensa_angry.txt'], 'no': ['angry'], 'bubu': ['pensa_angry.txt'], 'pensa': ['angry', 'happy'], 'angry': ['angry'], 'bbu': ['pensa_angry.txt'], 'ya': ['pensa_angry.txt'], 'boella': ['happy'], 'privacy': ['happy'], 'yes': ['happy']}


### Flags to manage queries

In [70]:
delete_lex_res = True
insert_lex_res = False

delete_lex_res_words = True
insert_lex_res_words = False

delete_tweets = True
insert_tweets = False

### Insert/delete Lexical Resources

In [71]:
db_lex_res_collection = mydb["LexResources"]

if delete_lex_res:
    db_lex_res_collection.delete_many({})

if insert_lex_res:
    for lex_res in lex_resources_list:
        to_upload = {"_id" : lex_res.filename,
                     "sentiment": lex_res.sentiment,
                     "totNumberWords" : lex_res.get_number_of_words()}
        inserted_lex_res = db_lex_res_collection.insert_one(to_upload)
        print(inserted_lex_res.inserted_id)
        # until here inserted lexical resources basic information in LexRes collection

### Insert/delete words of lexical resources

In [72]:
map_lex_word_db_id: Dict[str, int] = {}
db_lex_res_words_collection = mydb["LexResourcesWords"]

if delete_lex_res_words:
    db_lex_res_words_collection.delete_many({})

if insert_lex_res_words:
    # for each word in all the lexical resources insert in LexResWords the word and a
    # list of pairs <$ref, $id> to track in which LexRes the word is contained
    for word in lex_word_to_sentiment:
        list_lex_res = map_word_lex_res[word] # list of lexical resources in which the word is contained
        resources = [] # list of pairs to insert in LexResWords

        for res in list_lex_res: # populate list adding, one at a time, the lexical resources in which the word is contained
            resources.append({"$ref": "LexResources", "$id": res})

        word_to_upload = {"lemma" : word,
                          "resources" : resources}
        inserted_lex_res_word = db_lex_res_words_collection.insert_one(word_to_upload)
        map_lex_word_db_id[word] = inserted_lex_res_word.inserted_id # save object id to use it later to reference resources words from tweet words
        print(word_to_upload)

print("\n\n", map_lex_word_db_id)



 {}


### Insert/delete tweets

In [73]:
db_tweets_collection = mydb["Tweets"]

if delete_tweets:
    db_tweets_collection.delete_many({})

if insert_tweets:
    for tweet in tweets_to_info:
        tweet_words_upload = []
        for word in tweet.pos_tags:
            if map_lex_word_db_id.get(word) is None:
                # Decide what to do with words that do not have a lexical resource associated, we could think about associating it to a resource or some other strategy.
                tweet_words_upload.append({
                    "lemma": word,
                    "POS": tweet.pos_tags[word],
                    "freq": tweet.word_frequency[word],
                    "in_lex_resources" : "None"})
            else:
                tweet_words_upload.append({
                    "lemma": word,
                    "POS": tweet.pos_tags[word],
                    "freq": tweet.word_frequency[word],
                    "in_lex_resources" : {"$ref": "LexResourcesWords", "$id": map_lex_word_db_id[word]}})

        tweet_to_upload = {
            "sentiment": tweet.sentiment,
            "index": tweet.index,
            "words" : tweet_words_upload,
            "hashtags" : tweet.hashtags,
            "emojis" : tweet.emojis,
            "emoticons" : tweet.emoticons}

        inserted_tweets = db_tweets_collection.insert_one(tweet_to_upload)

        print(tweet)
        print(tweet_to_upload, "\n\n")

In [74]:
db_tweets_collection = mydb["Tweets"]

if delete_tweets:
    db_tweets_collection.delete_many({})

if insert_tweets:
    for tweet in tweets_to_info:
        tweet_words_upload = []
        for word in tweet.pos_tags:
            if map_lex_word_db_id.get(word) is None:
                # Decide what to do with words that do not have a lexical resource associated, we could think about associating it to a resource or some other strategy.
                tweet_words_upload.append({
                    "lemma": word,
                    "POS": tweet.pos_tags[word],
                    "freq": tweet.word_frequency[word],
                    "in_lex_resources" : "None"})
            else:
                tweet_words_upload.append({
                    "lemma": word,
                    "POS": tweet.pos_tags[word],
                    "freq": tweet.word_frequency[word],
                    "in_lex_resources" : {"$ref": "LexResourcesWords", "$id": map_lex_word_db_id[word]}})

        tweet_to_upload = {
            "sentiment": tweet.sentiment,
            "index": tweet.index,
            "words" : tweet_words_upload,
            "hashtags" : tweet.hashtags,
            "emojis" : tweet.emojis,
            "emoticons" : tweet.emoticons}

        inserted_tweets = db_tweets_collection.insert_one(tweet_to_upload)

        print(tweet)
        print(tweet_to_upload, "\n\n")

## Connection to MariaDB

In [50]:
# already_connected_mariadb = False
# # Connect to MariaDB Platform
# if not already_connected_mariadb:
#     try:
#         conn = mariadb.connect(
#             user="root",
#             password="armando12",
#             host="localhost",
#             port=3306,
#             database="maadb_tweets"
#         )
#     except mariadb.Error as e:
#         print(f"Error connecting to MariaDB Platform: {e}")
#         sys.exit(1)
#
#     # Get Cursor
#     cur = conn.cursor()
#
# cur.execute("SHOW TABLES")
#
# for (table_name,) in cur:
#     print(table_name)

In [75]:
db_tweets_collection = mydb["Tweets"]

pipeline = [
     # First Stage
    {
        "$group":
            {
                "_id" : "$sentiment",
                "sum_of_something_example": { "$sum": { "$multiply": [ "$price", "$quantity" ] } }
            }
     },
     # Second Stage
    {
        "$match": { "sum_of_something_example": { "$gte": 100 } }
    }
]

mydb.db_tweets_collection.aggregate(pipeline)

SyntaxError: invalid syntax (1427145301.py, line 6)