# Data Modelling

**Author:** [Giuseppe Tripodi](https://www.linkedin.com/in/giuseppe-tripodi-unical/)<br>
**Date created:** 2022/11/12<br>
**Description:** This script loads the electoral programs, the tweets and the speeches, preprocesses them, and builds a data structure suitable for fine-tuning the models.


# Setup

## Install package

In [1]:
!pip install datasets transformers
!pip install sentencepiece
!pip install sacremoses
!pip install nltk
!pip install transformers
!pip install deepmultilingualpunctuation==1.0.0

[0mCollecting deepmultilingualpunctuation==1.0.0
  Downloading deepmultilingualpunctuation-1.0.0-py3-none-any.whl (5.3 kB)
Installing collected packages: deepmultilingualpunctuation
Successfully installed deepmultilingualpunctuation-1.0.0
[0m

## Import Libraries

In [2]:
import json
import os
import csv
import re
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from transformers import pipeline
from nltk.stem import WordNetLemmatizer
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq
from deepmultilingualpunctuation import PunctuationModel


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


## Support Functions

In [35]:
def remove_unwanted_char_function(text: str) -> str:
    """
    Cleans a raw text: drops links, '@' signs and non-ASCII characters,
    collapses whitespace, strips a leading "b " prefix and lowercases the result
    :param text: raw text to clean
    :return: text after preprocessing
    """
    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (r'http\S+', ''),       # links; "https..." starts with "http", so it is covered too
        (r"@", ""),             # the '@' of a mention (the user name itself is kept)
        (r'[^\x00-\x7f]', ''),  # every non-ASCII character (accented letters, emoji, ...)
        (r'\s+', ' '),          # any run of whitespace becomes a single space
        (r'^b\s+', ''),         # a leading lowercase "b " prefix
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # lowercasing happens last, so an uppercase "B " prefix is NOT stripped above
    return text.lower()


In [36]:
# Quick manual check of remove_unwanted_char_function on a sample tweet
text = (
    "qui aula di giustizia del carcere di palermo. "
    "il processo voluto dalla sinistra e dai tifosi dellimmigrazione clandestina comincia: "
    "quanto coster ai cittadini italiani? "
    "https://t.co/lkuujlsofd"
)
remove_unwanted_char_function(text)

'qui aula di giustizia del carcere di palermo. il processo voluto dalla sinistra e dai tifosi dellimmigrazione clandestina comincia: quanto coster ai cittadini italiani? '

In [37]:
def lemmatizer(text: str) -> str:
    """
    Tokenizes the text and replaces every token with the lemma returned by
    the WordNet lemmatizer
    :param text: text to lemmatize
    :return: string with the lemmatized tokens joined by a single space
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in word_tokenize(text))


In [38]:
def stemmer(text: str) -> str:
    """
    Tokenizes the text and replaces every token with the stem produced by
    the English Snowball stemmer
    :param text: text to stem
    :return: string with the stemmed tokens joined by a single space
    """
    english_stemmer = SnowballStemmer(language='english')
    stems = [english_stemmer.stem(token) for token in word_tokenize(text)]
    return ' '.join(stems)



In [39]:
model_checkpoint = "Helsinki-NLP/opus-mt-it-en"
it_en_translator = pipeline("translation", model=model_checkpoint)
def translate_to_en(text: str) -> str:
    """
    takes a text in italian and returns the text in english

    The text is translated chunk by chunk, because the model cannot take an
    arbitrarily long input (the original note mentions a 512 limit): whole
    sentences are accumulated until the chunk holds at least 300 words.
    A chunk that fails to translate is reported on stdout and skipped.
    :param text: string, italian text
    :return: english text, the translated chunks joined by a single space
    """
    max_words = 300

    # Cut after every '.', keeping the dot attached to its sentence: joining the
    # pieces back gives the original text, so the model still sees the sentence
    # boundaries (a plain split(".") would throw them away).
    phrases = [p for p in re.findall(r"[^.]*\.|[^.]+", text) if p.strip()]

    translated_chunks = []
    i = 0  # phrases index
    while i < len(phrases):
        chunk = ""
        while len(chunk.split()) < max_words and i < len(phrases):
            chunk = chunk + phrases[i]
            i += 1
        # the chunk has the right number of words, we can translate it
        try:
            translated_chunks.append(it_en_translator(chunk)[0]["translation_text"].strip())
        except Exception as e:
            # best effort: report the failing chunk and go on with the next one
            print(f"Error while translating chunk: {chunk}")
            print(e)
    # a space keeps the last word of a chunk apart from the first word of the next
    return " ".join(translated_chunks)

In [40]:
newsum = pipeline("summarization", model='it5/it5-base-news-summarization')
def summarize(text: str) -> str:
    """
    gets a text and returns its abstractive summarization

    The text is first summarized chunk by chunk (whole sentences are
    accumulated until a chunk holds at least 300 words), then the chunk
    summaries are summarized once more as a whole. Failures are reported on
    stdout; if the final pass fails, the joined chunk summaries are returned.
    :param text: text to summarize
    :return: the summary (possibly an empty string if nothing could be summarized)
    """
    max_words = 300

    # Cut after every '.', keeping the dot attached to its sentence so the
    # model still sees the sentence boundaries.
    phrases = [p for p in re.findall(r"[^.]*\.|[^.]+", text) if p.strip()]

    chunk_summaries = []
    i = 0  # phrases index
    while i < len(phrases):
        chunk = ""
        while len(chunk.split()) < max_words and i < len(phrases):
            chunk = chunk + phrases[i]
            i += 1
        # the chunk has the right number of words, we can summarize it
        try:
            chunk_summaries.append(newsum(chunk)[0]["summary_text"].strip())
        except Exception as e:
            # best effort: report the failing chunk and go on with the next one
            print(f"Error while summarizing chunk: {chunk}")
            print(e)
    summarized_text = " ".join(chunk_summaries)

    # all the phrases have been summarized, now summarize the whole text
    try:
        return newsum(summarized_text, min_length=120, max_length=240)[0]["summary_text"].strip()
    except Exception as e:
        print(f"Error while summarizing text: {summarized_text}")
        print(e)
        # without this fallback the function would hit an unbound local variable
        return summarized_text
    

In [41]:
def extractive_summarization(text: str, top_n: int = 10, max_sentence_words: int = 30) -> str:
    """
    Gets a text and returns an extractive summarization of the text: the
    sentences with the highest sum of normalized word frequencies.

    code from: https://stackabuse.com/text-summarization-with-nltk-in-python/
    :param text: italian text to summarize
    :param top_n: maximum number of sentences kept in the summary
    :param max_sentence_words: only sentences with fewer words than this are scored
    :return: the selected sentences, best score first, joined by a space
             (empty string when the text contains no scorable word)
    """
    # preprocessing
    # Removing Square Brackets and Extra Spaces
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Keep letters only. The pattern is Unicode-aware so accented Italian
    # letters survive, and the text is lowercased because the sentence tokens
    # are lowercased before being looked up in the frequency table below.
    formatted_article_text = re.sub(r'[\W\d_]+', ' ', text).lower()

    # converting text to sentences
    sentence_list = sent_tokenize(text, language="italian")

    # Find Weighted Frequency of Occurrence
    italian_stopwords = set(nltk.corpus.stopwords.words('italian'))

    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_article_text):
        if word not in italian_stopwords:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # nothing to score (empty text or stopwords only): max() below would raise
    if not word_frequencies:
        return ""

    # divide the number of occurrences of each word by the frequency of the most occurring word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Calculating Sentence Scores
    sentence_scores = {}
    for sent in sentence_list:
        if len(sent.split(' ')) >= max_sentence_words:
            continue
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies:
                # identical sentences share one entry and accumulate their scores
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    # retrieves the top_n sentences and returns them
    summary_sentences = heapq.nlargest(top_n, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)

In [42]:
def splitter(n, s):
    """
    code from: https://stackoverflow.com/questions/3861674/split-string-by-number-of-words-with-python

    Cuts a long string s into consecutive blocks of n whitespace-separated
    words (the last block may be shorter)
    :param n: number of words per block
    :param s: string to cut
    :return: generator yielding the blocks, each with its words joined by one space
    """
    words = s.split()
    block_starts = range(0, len(words), n)
    return (" ".join(words[start:start + n]) for start in block_starts)

def splitter_by_dot(s):
    """
    takes a long string and splits it by the dots without splitting the decimal numbers

    Links are removed and whitespace is collapsed BEFORE splitting, so the dots
    inside a URL never produce spurious phrases. Each phrase then loses a
    leading "b " prefix and is lowercased; empty phrases are dropped.
    :param s: text to split
    :return: list of cleaned, lowercased phrases (without the closing dot)
    """
    # remove the link inside the text
    s = re.sub(r'http\S+', '', s)

    # Substituting multiple spaces with single space
    s = re.sub(r'\s+', ' ', s)

    ret = []
    # A dot splits unless a digit follows it (decimal numbers stay whole); the
    # optional whitespace after the dot is consumed, nothing else is.
    for phrase in re.split(r'\.(?!\d)\s?', s):
        # Removing prefixed 'b'
        phrase = re.sub(r'^b\s+', '', phrase)

        if phrase.strip():
            # Converting to Lowercase
            ret.append(phrase.lower())
    return ret

In [43]:
def preprocess_text(text: str, summarize_text=True, translate_text=True, remove_unwanted_char=True, lemmatization=True) -> str:
    """
    Preprocess a text (electoral program, tweet or speech). The enabled steps
    run in this order: extractive summarization (on the italian text),
    translation to english, removal of unwanted characters, lemmatization.
    :param text: text to preprocess
    :param summarize_text: if True the text is reduced with the extractive summarization
    :param translate_text: if True the text is translated to english
    :param remove_unwanted_char: if True links, '@', non-ASCII chars are removed and the text is lowercased
    :param lemmatization: if True the text is lemmatized
    :return: text after preprocessing, or the string "None" if the input is
             not a string or any step fails
    """
    # Empty CSV cells arrive as NaN (a float): report them the same way as a
    # failed step instead of crashing on the slice below.
    if not isinstance(text, str):
        print(f"Error in text: {text}")
        return "None"

    print(f"preprocess text: {text[:10]}...")
    try:
        # summarize italian text
        if summarize_text:
            #text = summarize(text) # abstractive summarization
            text = extractive_summarization(text) # extractive summarization

        # Translate text to en
        if translate_text:
            text = translate_to_en(text)

        # Remove unwanted characters
        if remove_unwanted_char:
            text = remove_unwanted_char_function(text)

        # lemmatizing text
        if lemmatization:
            text = lemmatizer(text)
        return text
    except Exception as e:
        # best effort: one bad text must not stop a whole DataFrame.apply, but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        print(f"Error in text: {text}")
        print(e)
        return "None"

In [44]:
def decompose_dataframe_by_text(df: pd.DataFrame, text_field_name: str, phrase_length: int) -> pd.DataFrame:
    """
    code from: https://stackoverflow.com/questions/3861674/split-string-by-number-of-words-with-python

    Takes a dataframe and returns a new dataframe decomposed by text: the text
    of every row is divided into (italian) sentences and a new row is created
    for each sentence, copying all the other fields.
    Sentences longer than 500 words are cut into blocks of 250 words, sentences
    shorter than 10 words are discarded.
    :param df: dataframe to decompose
    :param text_field_name: name of the column holding the text
    :param phrase_length: currently unused, the thresholds above are fixed;
                          kept so that existing calls keep working
    :return: new dataframe with the same columns and one row per phrase
    """
    max_words = 500    # above this a sentence is cut into blocks
    block_words = 250  # size of those blocks
    min_words = 10     # shorter sentences are dropped

    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    new_rows = []
    for _, row in df.iterrows():
        base = row.to_dict()

        # split the phrase by the number of words
        #texts = splitter(phrase_length, row[text_field_name])

        # split the phrase by dots
        #texts = splitter_by_dot(row[text_field_name])
        texts = nltk.sent_tokenize(row[text_field_name], "italian")
        for text in texts:
            n_words = len(text.split())
            if n_words > max_words:
                # splits the phrase too long
                pieces = splitter(block_words, text)
            elif n_words >= min_words:
                pieces = [text]
            else:
                continue
            for piece in pieces:
                # a fresh dict per phrase, so the source row is never mutated
                new_rows.append({**base, text_field_name: piece})

    return pd.DataFrame(new_rows, columns=df.columns)

In [None]:
# Loaded on first use and then shared: building the model on every call
# (e.g. once per row of a DataFrame.apply) would reload it each time.
_punctuation_model = None


def add_punctualization(text: str):
    """
    Restores the punctuation of a text (used for the speech transcripts) and
    then removes every "[...]" segment from the result
    :param text: text without punctuation
    :return: text with the restored punctuation
    """
    global _punctuation_model
    if _punctuation_model is None:
        _punctuation_model = PunctuationModel("oliverguhr/fullstop-punctuation-multilang-large")
    text = _punctuation_model.restore_punctuation(text.lstrip())
    # drop bracketed segments (non-greedy, so each "[...]" is removed on its own)
    text = re.sub(r"\[.*?\]", "", text)
    return text

# Electoral Programs Modelling

In [45]:
def load_programs_json(path: str) -> dict:
    """
    Load the json file from path, and return it. Used to load the electoral
    programs from json file after data extraction
    :param path: path of the json file
    :return: content of the file as parsed by json.load
    """
    # explicit encoding: the programs contain accented characters and the
    # platform default encoding is not UTF-8 everywhere
    with open(path, encoding="utf-8") as file:
        return json.load(file)

In [46]:
def save(output_path: str, rows: list, fieldnames: list):
    """
    Writes the rows to a csv file, with a header line first
    NOTE: a later cell of this notebook defines another `save(output_path, dataframe)`;
    once that cell has run, the name `save` refers to that function.
    :param output_path: path of the csv file to (over)write
    :param rows: dictionaries, one per csv line, keyed by the field names
    :param fieldnames: column names, in the order they are written
    """
    with open(output_path, "w", encoding="UTF8", newline="") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        for record in rows:
            csv_writer.writerow(record)


In [47]:
def make_structures_topic_similarity(PATH: str, screen_names: list, summarize_text=True, translate_text=True, remove_unwanted_char=True, lemmatization=True):
    """
    For every file indicated by the screen names: loads it as a csv, applies
    preprocess_text to every cell of every column and saves the result as a
    csv with the same file name in the current working directory.
    :param PATH: general path where all the programs are located
    :param screen_names: array with the names of the programs files
    :param summarize_text: forwarded to preprocess_text
    :param translate_text: forwarded to preprocess_text
    :param remove_unwanted_char: forwarded to preprocess_text
    :param lemmatization: forwarded to preprocess_text
    :return: None, the results are written to disk
    """
    def preprocess_cell(cell):
        return preprocess_text(cell, summarize_text=summarize_text, translate_text=translate_text, remove_unwanted_char=remove_unwanted_char, lemmatization=lemmatization)

    for screen_name in screen_names:
        print(f"Retrieving content from {screen_name} file\n")

        # load the content, the content could be either the speeches program or the tweets
        content = pd.read_csv(os.path.join(PATH, screen_name))

        # preprocessing the columns
        for col in content.columns:
            content[col] = content[col].apply(preprocess_cell)

        # info() prints its report by itself and returns None: formatting its
        # result into a string would only add a literal "None" to the output
        content.info()
        print()
        # save the dataframe
        content.to_csv(f"./{screen_name}", index=False)

In [48]:
def make_structures_by_category_text_class_2(PATH: str, screen_names: list, output_path, summarize_text, translate_text, remove_unwanted_char, lemmatization):
    """
    Takes the files indicated by the screen names and puts all the contents in a
    single dataframe with one row per (text, label, category): the label is
    the file name up to its first dot, the category is the csv column the text
    comes from. The texts are then divided in phrases and preprocessed.
    Generates the test set for the text classification 2 and saves it as
    "programs_by_index_by_nltk.csv" inside output_path.
    :param PATH: general path where all the programs are located
    :param screen_names: array with the names of the programs files
    :param output_path: directory where the resulting csv is written
    :param summarize_text: forwarded to preprocess_text
    :param translate_text: forwarded to preprocess_text
    :param remove_unwanted_char: forwarded to preprocess_text
    :param lemmatization: forwarded to preprocess_text
    :return: None, the result is written to disk
    """
    frames = []
    for screen_name in screen_names:
        print(f"Retrieving content from {screen_name} file\n")
        content = pd.read_csv(os.path.join(PATH, screen_name), index_col=None,  header=0)
        # take the class name
        class_name = screen_name[:screen_name.index(".")]
        # Label every row of the file, so a file with more (or fewer) than one
        # row no longer breaks the label assignment
        content.index = [class_name] * len(content)
        frames.append(content)
    df = pd.concat(frames, axis=0)

    # Transpose the dataset: one record per (column, row) pair. Rows are walked
    # with iterrows so that two rows sharing a label still yield scalar texts.
    labelled_rows = list(df.iterrows())
    tmp = [
        {"text": row[column], "label": label, "category": column}
        for column in df.columns
        for label, row in labelled_rows
    ]

    df = pd.DataFrame(tmp)

    # divide the text in more phrases
    df = decompose_dataframe_by_text(df, "text", 50)

    # preprocessing text column
    df["text"] = df["text"].apply(lambda x : preprocess_text(x, summarize_text=summarize_text, translate_text=translate_text, remove_unwanted_char=remove_unwanted_char, lemmatization=lemmatization))

    # delete the null rows
    # NOTE: preprocess_text reports a failure with the string "None", which is
    # not a null value, so failed texts are NOT removed here
    df = df.dropna(axis=0, subset=["text"])
    # save the dataframe
    df.to_csv(f"{output_path}/programs_by_index_by_nltk.csv", index=False)

## main

In [49]:
"""
path = "../input/programs/program_by_index_version_2"
output_path = "./"
screen_names = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
make_structures_by_category_text_class_2(path, screen_names, output_path,  summarize_text=False, translate_text=False, remove_unwanted_char=True, lemmatization=False)
#make_structures_topic_similarity(path, screen_names, summarize_text=True, translate_text=False, remove_unwanted_char=False, lemmatization=False)
"""


'\npath = "../input/programs/program_by_index_version_2"\noutput_path = "./"\nscreen_names = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]\nmake_structures_by_category_text_class_2(path, screen_names, output_path,  summarize_text=False, translate_text=False, remove_unwanted_char=True, lemmatization=False)\n#make_structures_topic_similarity(path, screen_names, summarize_text=True, translate_text=False, remove_unwanted_char=False, lemmatization=False)\n'

# Speeches and tweets Modelling

## Common Functions

In [52]:
def make_structure(PATH: str, screen_names: list, text_field_name: str, summarize_text=True, translate_text=True, remove_unwanted_char=True, lemmatization=True) -> pd.DataFrame:
    """
    Takes the files indicated by the screen names, preprocesses the text column
    of each one, adds a "label" column holding the file name up to its first
    dot and puts all the contents in a single dataframe.
    :param PATH: general path where all the files are located
    :param screen_names: array with the names of the files (speeches or tweets)
    :param text_field_name: name of the field of the text in the csv files
    :param summarize_text: forwarded to preprocess_text
    :param translate_text: forwarded to preprocess_text
    :param remove_unwanted_char: forwarded to preprocess_text
    :param lemmatization: forwarded to preprocess_text
    :return: dataframe with the rows of all the files (each file keeps its own
             row index); an empty dataframe if screen_names is empty
    """
    frames = []
    for pol_name in screen_names:
        print(f"Retrieving content from {pol_name} file\n")
        # take the class name
        class_name = pol_name[:pol_name.index(".")]
        # load the content, the content could be either the speeches program or the tweets
        content = pd.read_csv(os.path.join(PATH, pol_name))

        # add punctualization divide the transcript in more phrases (Only for speeches)
        #content[text_field_name] = content[text_field_name].apply(lambda x: add_punctualization(x))
        #content = decompose_dataframe_by_text(content, text_field_name, phrase_length=50)

        # preprocessing text column
        content[text_field_name] = content[text_field_name].apply(lambda x : preprocess_text(x, summarize_text=summarize_text, translate_text=translate_text, remove_unwanted_char=remove_unwanted_char, lemmatization=lemmatization))

        # add label column
        content.insert(len(content.columns), "label", class_name)
        # info() prints its report by itself and returns None: formatting its
        # result into a string would only add a literal "None" to the output
        content.info()
        print()
        frames.append(content)

    # pd.concat raises on an empty list
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [53]:
def save(output_path: str, dataframe: pd.DataFrame):
    """
    Writes the dataframe, row index included, as a csv file at output_path
    (an existing file is overwritten)
    :param output_path: path of the csv file to write
    :param dataframe: dataframe to save
    :return: None
    """
    csv_file = open(output_path, "w", encoding="UTF8", newline="")
    with csv_file:
        dataframe.to_csv(csv_file)


## Speeches Modelling Main

In [54]:
"""
path_speech = f"../input/political-speeches"
screen_names_speech = [f for f in os.listdir(path_speech) if os.path.isfile(os.path.join(path_speech, f))]

#create the structures
print("Make Structure for speeches\n")
for screen_name in screen_names_speech:
    name = [screen_name]
    speech = make_structure(path_speech, name, "transcript", summarize_text=False, translate_text=False, remove_unwanted_char=False, lemmatization=False)

    # save the speeches
    filename_speech = f"political_speech_{screen_name}"
    save(filename_speech, speech)
"""

'\npath_speech = f"../input/political-speeches"\nscreen_names_speech = [f for f in os.listdir(path_speech) if os.path.isfile(os.path.join(path_speech, f))]\n\n#create the structures\nprint("Make Structure for speeches\n")\nfor screen_name in screen_names_speech:\n    name = [screen_name]\n    speech = make_structure(path_speech, name, "transcript", summarize_text=False, translate_text=False, remove_unwanted_char=False, lemmatization=False)\n\n    # save the speeches\n    filename_speech = f"political_speech_{screen_name}"\n    save(filename_speech, speech)\n'

## Tweets Modelling Main

In [55]:
# folder with one csv of tweets per politician
path_tweet = "../input/tweets"
# keep the regular files only, sub-folders are skipped
screen_names_tweets = [
    entry
    for entry in os.listdir(path_tweet)
    if os.path.isfile(os.path.join(path_tweet, entry))
]

In [56]:

print("Make Structure for tweets\n")
for screen_name in screen_names_tweets:
    # make_structure works on a list of files: process one file at a time so
    # that every politician gets an output csv of their own
    name = [screen_name]
    tweets = make_structure(
        path_tweet,
        name,
        "text",
        summarize_text=False,
        translate_text=False,
        remove_unwanted_char=True,
        lemmatization=False,
    )

    # save the tweets in the current working directory
    filename_tweets = f"political_tweets_{screen_name}"
    save(filename_tweets, tweets)

Make Structure for tweets

Retrieving content from MatteoRenzi_25-09-21_to_25-09-2022.csv file

preprocess text: È stato un...
preprocess text: Il mio int...
preprocess text: Vicenda Ru...
preprocess text: Per il par...
preprocess text: Noi voglia...
preprocess text: L’Europa è...
preprocess text: Gli altri ...
preprocess text: Nel 2021 i...
preprocess text: Ci voglion...
preprocess text: L’Italia v...
preprocess text: Che bello ...
preprocess text: In diretta...
preprocess text: Giuseppe C...
preprocess text: Anche a Fr...
preprocess text: A Napoli c...
preprocess text: Un appello...
preprocess text: Il mio inv...
preprocess text: Tanta gent...
preprocess text: Bellissimo...
preprocess text: In diretta...
preprocess text: I 5 Stelle...
preprocess text: Il #25sett...
preprocess text: Il sindaco...
preprocess text: Anche oggi...
preprocess text: In diretta...
preprocess text: Alle 11.30...
preprocess text: Grazie Pal...
preprocess text: #ItaliaSic...
preprocess text: Giuseppe C...
prepr