In [1]:
from src.LexicalResource import LexicalResource
from src.Tweet import TweetInfo, Tweet, lex_resources_directory, tweets_directory
# Se ne abbiamo voglia possiamo mettere le emoticons e gli emoji su file e per poi leggerli

import os
from typing import List, Dict, Set

import pymongo
import mariadb
import sys

## Pipeline

## (function) Read files in directory
General function to read text files from a directory and merge them

In [2]:
lex_resources_list: List[LexicalResource] = []

def read_texts_in_directory(directory_path: str, sentiment: str) -> List[str]:
    files_text_list: List[str] = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            file = open(file=file_path, encoding="utf8")
            file_text = file.read().split() # list of words for a single lex resource of a sentiment
            files_text_list = files_text_list + file_text

            lex_res: LexicalResource = LexicalResource(filename, sentiment)
            lex_res.add_word_list(file_text)
            global lex_resources_list
            lex_resources_list.append(lex_res)

    # print(len(lex_resources_list))
    # [print(i) for i in lex_resources_list]
    return files_text_list

## (function) Read lexical resources for a sentiment
Function which reads all the lexical resources for a sentiment
The directory containing all lexical resources for that sentiment is passed as parameter
Returns a set of the words in all the lexical resources of a sentiment
### forse creare per ogni lex res di OGNI sentimento un dizionario diverso? Bisogna vedere come caricare i dati su db, bisogna caricare ogni lex res diversa di ogni sentimento sul db

In [3]:
def read_lex_resources_sentiment(sentiment_lex_resources_directory: str, sentiment: str) -> Set[str]:
    resource_words: Set[str] = set()
    resources_text: List[str] = read_texts_in_directory(sentiment_lex_resources_directory, sentiment)
    for word in resources_text:
        if not '_' in word:
            resource_words.add(word)
    #print(sentiment, "\n", resource_words, "\n\n")
    return resource_words

### Read all lexical resources

Reads all the lexical resources and returns a dictionary of word to sentiment

In [4]:
sentiment_lex_resources: Dict[str, str] = {}

for resources_path, sentiments, _ in os.walk(lex_resources_directory):
    # The folders inside the lexical resources folder are named after a sentiment (Ex. Anger, Joy), each of them contain some files and each of them is a list of words that are associated with that sentiment
    for sentiment in sentiments:
        # iterate each folder (one for sentiment)
        resources_sentiment_path = os.path.join(resources_path, sentiment)
        sentiment_words_set: Set[str] = read_lex_resources_sentiment(resources_sentiment_path, sentiment)

        # read the files containing lists of words, and return a set of all the words in those files
        for sentiment_word in sentiment_words_set:
            # associate each word of the set to the corresponding sentiment
            sentiment_lex_resources[sentiment_word] = sentiment

lex_word_to_sentiment = sentiment_lex_resources
print(lex_word_to_sentiment)

{'bu': 'Anger', 'ya': 'Anger', 'pensa': 'Joy', 'no': 'Anger', 'angry': 'Anger', 'bbu': 'Anger', 'bubu': 'Anger', 'yes': 'Joy', 'privacy': 'Joy', 'boella': 'Joy'}


## Tweet reading

## (function) Reads a file and converts the text to tweets


In [5]:
def read_tweet_file(file_path_string: str, sentiment: str) -> List[Tweet]:
    """
    Reads a file and converts the text to tweets
    :param file_path_string: string of the path to the file
    """

    # tweets read from file
    tweets_read: List[Tweet] = []

    tweets_file = open(file=file_path_string, encoding="utf8")
    tweets_text: List[str] = tweets_file.readlines()

    # For each tweet text create a Tweet object
    i = 0
    for tweet_text in tweets_text:
        i = i + 1
        new_tweet = Tweet(tweet_text, i, sentiment)
        tweets_read.append(new_tweet)

    return tweets_read

Get list of sentiments

In [6]:
sentiments: List[str] = [sentiment for sentiment in os.listdir(lex_resources_directory)]

## Read tweets folder and load Tweet Info for stem counting
The tweets folder contains for each sentiment a file containing tweets of that sentiment. Each file is scanned and for each tweet a TweetInfo object is created in order to maintain the count of how many word of which sentiments are in it

# Ma associare il TweetInfo al tweet senza fare un altro dict?

In [7]:
def get_tweet_sentiment_from_file_name(file_name: str):
    extension_removed = file_name.split(".")[0]
    sentiment = extension_removed.split("_")[-2]
    return sentiment

In [8]:
tweets_to_info: Dict[Tweet, TweetInfo] = {}
for tweets_sentiments_directory, _, tweets_sentiments_filenames in os.walk(tweets_directory):
    # print(tweets_sentiments_directory)
    # print(tweets_sentiments_files)

    for tweets_sentiment_filename in tweets_sentiments_filenames:
        # print(tweets_sentiment_file)
        tweets_sentiment_filepath = os.path.join(tweets_sentiments_directory, tweets_sentiment_filename)
        sentiment = get_tweet_sentiment_from_file_name(tweets_sentiment_filename)
        tweets_for_sentiment: List[Tweet] = read_tweet_file(tweets_sentiment_filepath, sentiment)
        #print("Tweets for sentiment: ", sentiment, "\n")
        for tweet in tweets_for_sentiment:
            tweet_info: TweetInfo = TweetInfo(sentiment, sentiments)
            tweet.tweet_stem_count = TweetInfo
            tweets_to_info[tweet] = tweet_info

## Stem counting
For each tweet and each word of them is checked the sentiment and increased the counter for that sentiment in the TweetInfo object associated

In [9]:
for tweet in tweets_to_info:
    tweet_info = tweets_to_info[tweet]
    tweet_words: List[str] = tweet.get_words()

    for word in tweet_words:
        if word in lex_word_to_sentiment:
            # get the sentiment for the word and increase sentiment counter by 1
            sentiment = lex_word_to_sentiment[word]
            tweet_info.increase_sentiment_counter(sentiment)

### Test print

In [10]:
def print_tweets():
    for tweet in tweets_to_info.keys():
        info = tweets_to_info[tweet]
        print(tweet)
        print("sentiment: " + info.sentiment)
        print("sentiment occurrences: ")
        print(info.sentiment_occurrences)
        print("---")

In [11]:
print_tweets()

Tweet
	tweet raw: yes the cillo is very chill smoking on the balcony	pos tags: {"yes": "RB", "cillo": "NN", "chill": "JJ", "smoking": "VBG", "balcony": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 1}
---
Tweet
	tweet raw: wow i'm having fun doing the smoking	pos tags: {"wow": "NN", "'m": "VBP", "fun": "NN", "smoking": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 0}
---
Tweet
	tweet raw: yea yea	pos tags: {"yea": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 0}
---
Tweet
	tweet raw: boss	pos tags: {"boss": "NN"}

sentiment: cillo
sentiment occurrences: 
{'Anger': 0, 'Joy': 0}
---
Tweet
	tweet raw: angry pensa is angry sad banana no	pos tags: {"angry": "JJ", "pensa": "NN", "sad": "JJ", "banana": "NN"}

sentiment: pensa
sentiment occurrences: 
{'Anger': 3, 'Joy': 1}
---
Tweet
	tweet raw: angry boella no pensa kill yoyou ah rip bu	pos tags: {"angry": "JJ", "boella": "NN", "pensa": "NN", "kill": "VB", "ah": "JJ", "rip": "NN

## Connection to MongoDB

In [12]:
already_connected = False
if not already_connected:
    mongo_client = pymongo.MongoClient("mongodb+srv://Peppino:wHzr79JxnRUgK52@cluster0.zkagq.mongodb.net/?retryWrites=true&w=majority")
    mydb = mongo_client["maadb_tweets"]
    print(mydb.list_collection_names())

coll_list = mydb.list_collection_names()


[print(i) for i in lex_resources_list]

['Tweets', 'LexResourcesWords', 'LexResources']
LexicalResource: angry
	 sentiment: Anger
	 wordlist: ['no', 'pensa', 'angry', 'bu']
LexicalResource: pensa_angry.txt
	 sentiment: Anger
	 wordlist: ['bbu', 'bubu', 'ya', 'bu']
LexicalResource: happy
	 sentiment: Joy
	 wordlist: ['yes', 'boella', 'privacy', 'pensa']


[None, None, None]

### Check in which resources each word is contained
Creates a dictionary <word, lex_res_list> to map each word with the lexical resources which contains the word

In [13]:
map_word_lex_res: Dict[str, List[str]] = {}

for word in lex_word_to_sentiment:
    for lex_res in lex_resources_list:
        if word in lex_res.word_list:
            if map_word_lex_res.get(word) is None:
                map_word_lex_res[word] = [lex_res.filename]
            else:
                map_word_lex_res[word].append(lex_res.filename)

print(map_word_lex_res)

{'bu': ['angry', 'pensa_angry.txt'], 'ya': ['pensa_angry.txt'], 'pensa': ['angry', 'happy'], 'no': ['angry'], 'angry': ['angry'], 'bbu': ['pensa_angry.txt'], 'bubu': ['pensa_angry.txt'], 'yes': ['happy'], 'privacy': ['happy'], 'boella': ['happy']}


### Flags to manage queries

In [14]:
delete_lex_res = True
insert_lex_res = True

delete_lex_res_words = True
insert_lex_res_words = True

delete_tweets = True
insert_tweets = True

### Insert/delete Lexical Resources

In [15]:
db_lex_res_collection = mydb["LexResources"]

if delete_lex_res:
    db_lex_res_collection.delete_many({})

if insert_lex_res:
    for lex_res in lex_resources_list:
        to_upload = {"_id" : lex_res.filename,
                     "sentiment": lex_res.sentiment,
                     "totNumberWords" : lex_res.get_number_of_words()}
        inserted_lex_res = db_lex_res_collection.insert_one(to_upload)
        print(inserted_lex_res.inserted_id)
        # until here inserted lexical resources basic information in LexRes collection

angry
pensa_angry.txt
happy


### Insert/delete words of lexical resources

In [16]:
map_lex_word_db_id: Dict[str, int] = {}
db_lex_res_words_collection = mydb["LexResourcesWords"]

if delete_lex_res_words:
    db_lex_res_words_collection.delete_many({})

if insert_lex_res_words:
    # for each word in all the lexical resources insert in LexResWords the word and a
    # list of pairs <$ref, $id> to track in which LexRes the word is contained
    for word in lex_word_to_sentiment:
        list_lex_res = map_word_lex_res[word] # list of lexical resources in which the word is contained
        resources = [] # list of pairs to insert in LexResWords

        for res in list_lex_res: # populate list adding, one at a time, the lexical resources in which the word is contained
            resources.append({"$ref": "LexResources", "$id": res})

        word_to_upload = {"lemma" : word,
                          "resources" : resources}
        inserted_lex_res_word = db_lex_res_words_collection.insert_one(word_to_upload)
        map_lex_word_db_id[word] = inserted_lex_res_word.inserted_id # save object id to use it later to reference resources words from tweet words
        print(word_to_upload)

print("\n\n", map_lex_word_db_id)

{'lemma': 'bu', 'resources': [{'$ref': 'LexResources', '$id': 'angry'}, {'$ref': 'LexResources', '$id': 'pensa_angry.txt'}], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0a8')}
{'lemma': 'ya', 'resources': [{'$ref': 'LexResources', '$id': 'pensa_angry.txt'}], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0a9')}
{'lemma': 'pensa', 'resources': [{'$ref': 'LexResources', '$id': 'angry'}, {'$ref': 'LexResources', '$id': 'happy'}], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0aa')}
{'lemma': 'no', 'resources': [{'$ref': 'LexResources', '$id': 'angry'}], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0ab')}
{'lemma': 'angry', 'resources': [{'$ref': 'LexResources', '$id': 'angry'}], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0ac')}
{'lemma': 'bbu', 'resources': [{'$ref': 'LexResources', '$id': 'pensa_angry.txt'}], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0ad')}
{'lemma': 'bubu', 'resources': [{'$ref': 'LexResources', '$id': 'pensa_angry.txt'}], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0ae')}
{'lemma': 'yes', 'resources': [{'$ref': '

### Insert/delete tweets

In [17]:
db_tweets_collection = mydb["Tweets"]

if delete_tweets:
    db_tweets_collection.delete_many({})

if insert_tweets:
    for tweet in tweets_to_info:
        tweet_words_upload = []
        for word in tweet.pos_tags:
            if map_lex_word_db_id.get(word) is None:
                # Decide what to do with words that do not have a lexical resource associated, we could think about associating it to a resource or some other strategy.
                tweet_words_upload.append({
                    "lemma": word,
                    "POS": tweet.pos_tags[word],
                    "freq": tweet.word_frequency[word],
                    "in_lex_resources" : "None"})
            else:
                tweet_words_upload.append({
                    "lemma": word,
                    "POS": tweet.pos_tags[word],
                    "freq": tweet.word_frequency[word],
                    "in_lex_resources" : {"$ref": "LexResourcesWords", "$id": map_lex_word_db_id[word]}})

        tweet_to_upload = {
            "sentiment": tweet.sentiment,
            "index": tweet.index,
            "words" : tweet_words_upload,
            "hashtags" : tweet.hashtags,
            "emojis" : tweet.emojis,
            "emoticons" : tweet.emoticons}

        inserted_tweets = db_tweets_collection.insert_one(tweet_to_upload)

        print(tweet)
        print(tweet_to_upload, "\n\n")

Tweet
	tweet raw: yes the cillo is very chill smoking on the balcony	pos tags: {"yes": "RB", "cillo": "NN", "chill": "JJ", "smoking": "VBG", "balcony": "NN"}

{'sentiment': 'cillo', 'index': 1, 'words': [{'lemma': 'yes', 'POS': 'RB', 'freq': 1, 'in_lex_resources': {'$ref': 'LexResourcesWords', '$id': ObjectId('62b4ca11df0ce1d4bc0fb0af')}}, {'lemma': 'cillo', 'POS': 'NN', 'freq': 1, 'in_lex_resources': 'None'}, {'lemma': 'chill', 'POS': 'JJ', 'freq': 1, 'in_lex_resources': 'None'}, {'lemma': 'smoking', 'POS': 'VBG', 'freq': 1, 'in_lex_resources': 'None'}, {'lemma': 'balcony', 'POS': 'NN', 'freq': 1, 'in_lex_resources': 'None'}], 'hashtags': [], 'emojis': [], 'emoticons': [":')", ':-)', ':)', ';)'], '_id': ObjectId('62b4ca11df0ce1d4bc0fb0b2')} 


Tweet
	tweet raw: wow i'm having fun doing the smoking	pos tags: {"wow": "NN", "'m": "VBP", "fun": "NN", "smoking": "NN"}

{'sentiment': 'cillo', 'index': 2, 'words': [{'lemma': 'wow', 'POS': 'NN', 'freq': 1, 'in_lex_resources': 'None'}, {'lemma

In [18]:
# db_tweets_collection = mydb["Tweets"]
#
# if delete_tweets:
#     db_tweets_collection.delete_many({})
#
# if insert_tweets:
#     for tweet in tweets_to_info:
#         tweet_words_upload = []
#         for word in tweet.pos_tags:
#             if map_lex_word_db_id.get(word) is None:
#                 # Decide what to do with words that do not have a lexical resource associated, we could think about associating it to a resource or some other strategy.
#                 tweet_words_upload.append({
#                     "lemma": word,
#                     "POS": tweet.pos_tags[word],
#                     "freq": tweet.word_frequency[word],
#                     "in_lex_resources" : "None"})
#             else:
#                 tweet_words_upload.append({
#                     "lemma": word,
#                     "POS": tweet.pos_tags[word],
#                     "freq": tweet.word_frequency[word],
#                     "in_lex_resources" : {"$ref": "LexResourcesWords", "$id": map_lex_word_db_id[word]}})
#
#         tweet_to_upload = {
#             "sentiment": tweet.sentiment,
#             "index": tweet.index,
#             "words" : tweet_words_upload,
#             "hashtags" : tweet.hashtags,
#             "emojis" : tweet.emojis,
#             "emoticons" : tweet.emoticons}
#
#         inserted_tweets = db_tweets_collection.insert_one(tweet_to_upload)
#
#         print(tweet)
#         print(tweet_to_upload, "\n\n")

## Connection to MariaDB

In [19]:
already_connected_mariadb = False
# Connect to MariaDB Platform
if not already_connected_mariadb:
    try:
        conn = mariadb.connect(
            user="root",
            password="armando12",
            host="localhost",
            port=3306,
            database="maadb_tweets"
        )
    except mariadb.Error as e:
        print(f"Error connecting to MariaDB Platform: {e}")
        sys.exit(1)

    # Get Cursor
    cur = conn.cursor()

cur.execute("SHOW TABLES")

for (table_name,) in cur:
    print(table_name)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Error connecting to MariaDB Platform: Access denied for user 'root'@'localhost' (using password: YES)
Traceback (most recent call last):
  File "C:\Users\amato\AppData\Local\Temp/ipykernel_10576/985685160.py", line 5, in <module>
    conn = mariadb.connect(
mariadb.OperationalError: Access denied for user 'root'@'localhost' (using password: YES)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\amato\appdata\local\programs\python\python39\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\amato\AppData\Local\Temp/ipykernel_10576/985685160.py", line 14, in <module>
    sys.exit(1)
SystemExit: 1

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\amato\appdata\local\programs\python\python39\lib\site-packages\IPython\core\ultratb.py", line 1101, in get_re

TypeError: object of type 'NoneType' has no len()

In [None]:
# db_tweets_collection = mydb["Tweets"]
#
# pipeline = [
#     # First Stage
#     {
#         "$group" :
#             {
#                 "_id" : "$item",
#                 "sum_of_something": { "$sum": { "$multiply": [ "$price", "$quantity" ] } }
#             }
#     },
#
#     # Second Stage
#     {
#        "$match": { "sum_of_something": { "$gte": 100 } }
#     }
# ]
#
# mydb.db_tweets_collection.aggregate(pipeline)