In [1]:
from src.LexicalResource import LexicalResource
from src.Tweet import TweetInfo, Tweet
# Optional: the emoticons and emojis could be stored in files and read from there

import os
from typing import List, Dict, Set


from time import perf_counter
from datetime import timedelta

# When True, the cells below print the structures they build (dictionaries, tweets).
ALLOW_PRINT = True
# Maximum number of tweets read from each tweets file.
N_TWEET_TO_READ = 10
# Directory with one sub-folder of lexical resource files per sentiment.
LEX_RESOURCES_DIRECTORY_LOCAL = "../resources/lex_res/"
# Directory with the tweets files.
TWEETS_DIRECTORY_LOCAL = "../resources/tweets/"

## Pipeline

## (function) Read files in directory
General function to read text files from a directory and merge them

In [2]:
# Every LexicalResource loaded so far; read_texts_in_directory appends one entry per file it reads.
lex_resources_list: List[LexicalResource] = []

def read_texts_in_directory(directory_path: str, sentiment: str) -> List[str]:
    """
    Read every regular file in a directory and merge their words into one list.

    As a side effect, one LexicalResource (named after the file and tagged with
    ``sentiment``) is created for each file and appended to the module-level
    ``lex_resources_list``.

    :param directory_path: directory whose files are read (sub-directories are skipped)
    :param sentiment: sentiment label given to every LexicalResource created here
    :return: the whitespace-separated words of all the files, concatenated in the
        order in which ``os.listdir`` yields the files
    """
    files_text_list: List[str] = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue

        # The context manager closes the handle even if reading fails.
        with open(file=file_path, encoding="utf8") as resource_file:
            file_text = resource_file.read().split()  # words of a single lexical resource

        # extend() grows the list in place instead of copying it for every file.
        files_text_list.extend(file_text)

        lex_res = LexicalResource(filename, sentiment)
        lex_res.add_word_list(file_text)
        # The list is only mutated, never rebound, so no `global` statement is needed.
        lex_resources_list.append(lex_res)

    return files_text_list

## (function) Read lexical resources for a sentiment
Function that reads all the lexical resources of a sentiment.
The directory containing all the lexical resources of that sentiment is passed as a parameter.
Returns the set of words found across all the lexical resources of the sentiment (words containing `_` are skipped).
### TODO: maybe create a separate dictionary for each lexical resource of EACH sentiment? We need to decide how to load the data into the DB: every distinct lexical resource of every sentiment has to be loaded into the DB

In [3]:
def read_lex_resources_sentiment(sentiment_lex_resources_directory: str, sentiment: str) -> Set[str]:
    """
    Collect the distinct words of all the lexical resources of one sentiment.

    :param sentiment_lex_resources_directory: directory with the lexical resource files of the sentiment
    :param sentiment: name of the sentiment, forwarded to read_texts_in_directory
    :return: set of the words read, excluding every word that contains an underscore
    """
    all_words = read_texts_in_directory(sentiment_lex_resources_directory, sentiment)
    # Words containing '_' are left out of the result.
    return {candidate for candidate in all_words if '_' not in candidate}

### Read all lexical resources

Reads all the lexical resources and builds a dictionary that maps each word to its sentiment

In [4]:
# Start the timer for the lexical resources loading stage.
time_lex_res_start = perf_counter()

In [5]:
# read_texts_in_directory appends to lex_resources_list on every call, so empty it
# first: re-running this cell then rebuilds the list instead of duplicating entries.
lex_resources_list.clear()

# word -> sentiment
sentiment_lex_resources: Dict[str, str] = {}

for resources_path, sentiments, _ in os.walk(LEX_RESOURCES_DIRECTORY_LOCAL):
    # The folders inside the lexical resources folder are named after a sentiment (e.g. Anger, Joy); each of them contains some files, and each file is a list of words associated with that sentiment
    for sentiment in sentiments:
        # iterate over each folder (one per sentiment)
        resources_sentiment_path = os.path.join(resources_path, sentiment)
        # read the files containing lists of words, and get the set of all the words in those files
        sentiment_words_set: Set[str] = read_lex_resources_sentiment(resources_sentiment_path, sentiment)

        for sentiment_word in sentiment_words_set:
            # associate each word of the set with the corresponding sentiment;
            # a word listed under several sentiments keeps the one processed last
            sentiment_lex_resources[sentiment_word] = sentiment

lex_word_to_sentiment = sentiment_lex_resources
if ALLOW_PRINT:
    print(lex_word_to_sentiment)



In [6]:
# Stop the timer and report how long loading the lexical resources took.
time_lex_res_end = perf_counter()
time_lex_res = time_lex_res_end - time_lex_res_start
print("Elapsed time for loading lexical resources: ", str(timedelta(seconds=time_lex_res)))

Elapsed time for loading lexical resources:  0:00:00.063091


## Tweet reading

## (function) Reads a file and converts the text to tweets


In [7]:
def read_tweet_file(file_path_string: str, sentiment: str, n_tweet_to_read: int) -> List[Tweet]:
    """
    Reads a file and converts its first lines to tweets, one tweet per line.

    :param file_path_string: string of the path to the file
    :param sentiment: sentiment passed to every Tweet created from this file
    :param n_tweet_to_read: maximum number of tweets to read; if the file has fewer
        lines, all of them are read, and a value of zero or less yields no tweets
    :return: list of the Tweet objects created, in file order, numbered from 1
    """

    # tweets read from file
    tweets_read: List[Tweet] = []

    # The context manager closes the file; iterating it lazily avoids loading
    # lines beyond the requested amount.
    with open(file=file_path_string, encoding="utf8") as tweets_file:
        for tweet_number, tweet_text in enumerate(tweets_file, start=1):
            if tweet_number > n_tweet_to_read:
                break
            # Each line (trailing newline included) becomes one Tweet
            tweets_read.append(Tweet(tweet_text, tweet_number, sentiment))

    return tweets_read

Get list of sentiments

In [8]:
# Sentiments are the sub-folders of the lexical resources directory. Stray files
# (e.g. OS metadata files) are not sentiments, and os.listdir returns entries in
# arbitrary order, so keep directories only and sort them for a reproducible list.
sentiments: List[str] = sorted(
    entry
    for entry in os.listdir(LEX_RESOURCES_DIRECTORY_LOCAL)
    if os.path.isdir(os.path.join(LEX_RESOURCES_DIRECTORY_LOCAL, entry))
)

## Read tweets folder and load Tweet Info for stem counting
The tweets folder contains, for each sentiment, a file with tweets of that sentiment. Each file is scanned and a TweetInfo object is created for each tweet, to keep count of how many words of each sentiment the tweet contains

In [9]:
def get_tweet_sentiment_from_file_name(file_name: str):
    """
    Extract the sentiment from the name of a tweets file.

    The text before the first '.' is split on '_' and the second-to-last piece
    is returned, e.g. "dataset_dt_anger_60k.txt" gives "anger".

    :param file_name: name of the tweets file
    :return: the second-to-last '_'-separated piece of the name without extension
    :raises IndexError: if the name without extension contains no '_'
    """
    stem, _, _ = file_name.partition(".")
    # Only the last two separators matter, so split from the right.
    return stem.rsplit("_", 2)[-2]

In [10]:
# Start the timer for the tweets loading stage.
time_tweets_start = perf_counter()

In [11]:
# Maps every tweet read to the TweetInfo created for it.
tweets_to_info: Dict[Tweet, TweetInfo] = {}
for tweets_sentiments_directory, _, tweets_sentiments_filenames in os.walk(TWEETS_DIRECTORY_LOCAL):

    for tweets_sentiment_filename in tweets_sentiments_filenames:
        tweets_sentiment_filepath = os.path.join(tweets_sentiments_directory, tweets_sentiment_filename)
        # The sentiment of the tweets in a file is encoded in the file name
        sentiment = get_tweet_sentiment_from_file_name(tweets_sentiment_filename)
        tweets_for_sentiment: List[Tweet] = read_tweet_file(tweets_sentiment_filepath, sentiment, N_TWEET_TO_READ)
        for tweet in tweets_for_sentiment:
            tweet_info: TweetInfo = TweetInfo(sentiment, sentiments)
            # Attach this tweet's own TweetInfo instance, not the TweetInfo class
            tweet.tweet_stem_count = tweet_info
            tweets_to_info[tweet] = tweet_info

In [12]:
# Stop the timer and report how long loading the tweets took.
time_tweets_end = perf_counter()
time_tweets = time_tweets_end - time_tweets_start
print("Elapsed time for loading tweets: ", str(timedelta(seconds=time_tweets)))

Elapsed time for loading tweets:  0:00:01.705947


## Stem counting
For each word of each tweet, the sentiment of the word is looked up and the counter for that sentiment is increased in the TweetInfo object associated with the tweet

In [13]:
# NOTE(review): the counters are only ever increased here, so running this cell
# twice without reloading the tweets presumably counts every word twice.
for tweet, tweet_info in tweets_to_info.items():
    for word in tweet.get_words():
        if word not in lex_word_to_sentiment:
            # the word belongs to no lexical resource: nothing to count
            continue
        # increase by 1 the counter of the sentiment the word is mapped to
        tweet_info.increase_sentiment_counter(lex_word_to_sentiment[word])

### Test print

In [14]:
def print_tweets():
    """Print every loaded tweet followed by its sentiment and its sentiment occurrences."""
    for tweet, info in tweets_to_info.items():
        # One print call per tweet; each item ends up on its own line
        print(
            tweet,
            "sentiment: " + info.sentiment,
            "sentiment occurrences: ",
            info.sentiment_occurrences,
            "---",
            sep="\n",
        )

In [15]:
# Dump the loaded tweets only when printing is enabled.
if ALLOW_PRINT:
    print_tweets()

Tweet
	tweet raw: fyouwhat that spell ? fired up ? noo haha
	pos tags: {"fyouwhat": "RB", "spell": "NN", "fire": "VBD", "noo": "NNS", "haha": "NN"}
	emojis: []
	emoticons: []
	hashtags: ['fuckyou']
	tokens: ['fyouwhat', 'spell', 'fired', 'noo', 'haha']
	words frequency: {'fyouwhat': 1, 'that': 0, 'spell': 1, 'fire': 1, 'up': 0, 'noo': 1, 'haha': 1}

sentiment: anger
sentiment occurrences: 
{'Anger': 0, 'Anticipation': 0, 'Disgust': 0, 'Fear': 0, 'Joy': 1, 'Sadness': 0, 'Surprise': 0, 'Trust': 0}
---
Tweet
	tweet raw: now all of you roll tide bandwagon fans will hop off alabama di seek . one wor d: overrated .
	pos tags: {"roll": "VBP", "tide": "RB", "bandwagon": "NN", "fan": "NNS", "hop": "VB", "alabama": "NN", "di": "NN", "seek": "NN", "one": "CD", "wor": "NN", "overrate": "VBN"}
	emojis: []
	emoticons: []
	hashtags: []
	tokens: ['all', 'you', 'roll', 'tide', 'bandwagon', 'fans', 'hop', 'alabama', 'di', 'seek', 'one', 'wor', 'overrated']
	words frequency: {'now': 0, 'all': 1, 'of': 0,

### Check in which resources each word is contained
Creates a dictionary <word, lex_res_list> that maps each word to the list of lexical resources that contain it

In [16]:
# word -> file names of the lexical resources that contain the word
map_word_lex_res: Dict[str, List[str]] = {}

for word in lex_word_to_sentiment:
    for lex_res in lex_resources_list:
        if word not in lex_res.words:
            continue
        # setdefault creates the list the first time a word is matched
        map_word_lex_res.setdefault(word, []).append(lex_res.filename)

if ALLOW_PRINT:
    print(map_word_lex_res)



### Flags to manage queries

In [17]:
# Switches read by the SQL cells below; every database write is skipped while they are False.

# Delete / insert the lexical resources.
delete_lex_res = False
insert_lex_res = False

# NOTE(review): these two flags are not read by any cell visible in this notebook.
delete_lex_res_words = False
insert_lex_res_words = False

# Delete / insert the tweets.
delete_tweets = False
insert_tweets = False

# SQL

In [18]:
from src.LexicalResource import LexicalResource
from src.MySql import DBConnection

### Connection

In [19]:
# Single connection object used by all the SQL cells below.
db_connection = DBConnection()
db_connection.connect_to_db()

## Insert lexical resources

In [20]:
# Deletion runs before insertion, so enabling both flags replaces the stored lexical resources.
if delete_lex_res:
    db_connection.delete_lex_res()
if insert_lex_res:
    db_connection.insert_lexical_resources(lex_resources_list)

## Insert tweets

In [21]:
if delete_tweets:
    db_connection.delete_tweets()
    # Empty the token table as well; foreign key checks are switched off around the TRUNCATE
    for query in (
        "SET FOREIGN_KEY_CHECKS = 0;",
        "TRUNCATE token;",
        "SET FOREIGN_KEY_CHECKS = 1;",
    ):
        db_connection.launch_query(query)

if insert_tweets:
    # The tweets to insert are the keys of tweets_to_info
    tweets_list = list(tweets_to_info)
    db_connection.insert_tweets(tweets_list)

In [22]:
# Smoke test: run pipeline2 on the first lexical resource loaded and print what it returns.
# Indexing [0] raises IndexError when no lexical resource has been loaded.
pipeline2_test = db_connection.pipeline2(lex_resources_list[0])
print(pipeline2_test)

354
0
0.0


In [23]:
# from matplotlib import pyplot as plt
# from wordcloud import WordCloud
# import string
#
# # the regex used to detect words is a combination of normal words, ascii art, and emojis
# # 2+ consecutive letters (also include apostrophes), e.x It's
# normal_word = r"(?:\w[\w']+)"
# # 2+ consecutive punctuations, e.x. :)
# ascii_art = r"(?:[{punctuation}][{punctuation}]+)".format(punctuation=string.punctuation)
# # a single character that is not alpha_numeric or other ascii printable
# emoji = r"(?:[^\s])(?<![\w{ascii_printable}])".format(ascii_printable=string.printable)
# font_path = 'resources/Symbola.otf'
# regexp = r"{normal_word}|{ascii_art}|{emoji}".format(normal_word=normal_word, ascii_art=ascii_art,
#                                                      emoji=emoji)
#
#
#
# for sentiment in sentiments:
#     result = db_connection.pipeline1(10, sentiment.lower(), "word")
#     tot_words = 0
#     words_to_check = {}
#
#     for pair in result:
#         tot_words = tot_words + pair[1]
#
#     for pair in result:
#         words_to_check[pair[0]] = pair[1]/tot_words
#
#     print(words_to_check)
#     wordcloud = WordCloud(width=500, height=500,
#                           background_color='white',
#                           min_font_size=10,
#                           font_path=font_path,
#                           regexp=regexp).generate_from_frequencies(words_to_check)
#
#     # plot the WordCloud image
#     plt.figure(figsize=(5, 5), facecolor=None)
#     plt.imshow(wordcloud)
#     plt.axis("off")
#     plt.tight_layout(pad=0)
#     plt.title("word cloud for sentiment: " + sentiment)
#     plt.show()

In [24]:
# for sentiment in sentiments:
#     result = db_connection.pipeline1(10, sentiment.lower())
#     print(result)
#     tot_words = 0
#     words_to_check = {}
#
#     for pair in result:
#         tot_words = tot_words + pair[1]
#
#     for pair in result:
#         words_to_check[pair[0]] = pair[1]/tot_words
#
#     print(words_to_check)
#     wordcloud = WordCloud(width=500, height=500,
#                           background_color='white',
#                           min_font_size=10,
#                           font_path=font_path,
#                           regexp=regexp).generate_from_frequencies(words_to_check)
#
#     # plot the WordCloud image
#     plt.figure(figsize=(5, 5), facecolor=None)
#     plt.imshow(wordcloud)
#     plt.axis("off")
#     plt.tight_layout(pad=0)
#     plt.title("word cloud for sentiment: " + sentiment)
#     plt.show()

In [25]:
# db_connection.delete_tweets()
#
# foreign_key_query1 = "SET FOREIGN_KEY_CHECKS = 0;"
# delete_tokens = "TRUNCATE token;"
# foreign_key_query2 = "SET FOREIGN_KEY_CHECKS = 1;"
# db_connection.launch_query(foreign_key_query1)
# db_connection.launch_query(delete_tokens)
# db_connection.launch_query(foreign_key_query2)
#
# tweet1: Tweet = Tweet("USERNAME know what she ain't 🙅👌 don't lol even need to say it he likes it !", 0, "anger")
# tweet2: Tweet = Tweet("angry Boella no 😒 Pensa kill lol #armando you ah rip bu it was better 😂", 0, "anger")
# tweet3: Tweet = Tweet("angry Pensa is imho imho imho imho imho imho ;( imho imho imho imho imho angry pensa sad #gervaso banana no", 0, "anger")
#
# db_connection.insert_tweets([tweet1, tweet2, tweet3])



In [26]:
# for sentiment in sentiments:
#     result = db_connection.pipeline1(10, sentiment.lower())
#     print(result)
#     tot_words = 0
#     words_to_check = {}
#
#     for pair in result:
#         tot_words = tot_words + pair[1]
#
#     for pair in result:
#         words_to_check[pair[0]] = pair[1]/tot_words
#
#     print(words_to_check)
#     wordcloud = WordCloud(width=500, height=500,
#                           background_color='white',
#                           min_font_size=10,
#                           font_path=font_path,
#                           regexp=regexp).generate_from_frequencies(words_to_check)
#
#     # plot the WordCloud image
#     plt.figure(figsize=(5, 5), facecolor=None)
#     plt.imshow(wordcloud)
#     plt.axis("off")
#     plt.tight_layout(pad=0)
#     plt.title("word cloud for sentiment: " + sentiment)
#     plt.show()