In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import dateparser
from csv import writer
import re
import nltk
# Garbage collector library (to manage memory)
import gc

In [None]:
# Execute this cell if you have not downloaded these packages yet
# 'punkt' holds the tokenizer models used by nltk.word_tokenize; without it the
# tokenization done for the n-grams and POS tags raises a LookupError.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')
# Tagger model and tagset mapping used by nltk.tag.pos_tag(..., tagset='universal')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

## Converting XML files into a data frame

### Use the XML files to generate a CSV file

In [None]:
def export_to_csv(output_file, blogs_dir="blogs"):
    """Takes all the posts in the blogs directory, merges all the information and puts it in a CSV file.

    Every file of ``blogs_dir`` ending in ``.xml`` is read. Its name is expected
    to look like ``<id>.<gender>.<age>.<industry>.<zodiac>.xml``; files whose
    name has fewer dot-separated fields are skipped with a message. Inside a
    file, the n-th ``<date>`` element is paired with the n-th ``<post>``
    element and one CSV row is written per pair.

    Parameters:
        output_file (str): Path of the CSV file to create (overwritten if it exists)
        blogs_dir (str): Directory containing the XML files
    Returns:
        None
    """
    # DOTALL is required because a post usually spans several lines; without it
    # `.` stops at the first newline and the <post> tags are never removed.
    post_regex = re.compile('<post>\\s*(.*)\\s*</post>', re.DOTALL)
    # Selection of XML files (sorted so the row order does not depend on the OS)
    files = sorted(x for x in os.listdir(blogs_dir) if x.endswith('.xml'))
    # newline='' is what the csv module expects (avoids blank rows on Windows);
    # utf-8 matches the encoding used to read the posts.
    with open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        writer_obj = writer(outfile)
        writer_obj.writerow(["Id", "Gender", "Age", "Industry", "Zodiac", "Date", "Post"])
        for f in files:
            # Get information from the file name
            name_parts = f.split('.')
            if len(name_parts) < 6:
                print(f"Skipping {f}: unexpected file name format")
                continue
            blogger_id, gender, age, industry, zodiac = name_parts[:5]
            with open(os.path.join(blogs_dir, f), 'r', encoding='utf-8', errors='ignore') as file_stream:
                data = file_stream.read()
            if not data:
                print(f"Skipping {f}: file is empty")
                continue
            # Extract the date and the content of the post
            bs_data = BeautifulSoup(data, "xml")
            list_dates = bs_data.find_all('date')
            list_posts = bs_data.find_all('post')
            if len(list_dates) != len(list_posts):
                print(f"Warning: {f} has {len(list_dates)} dates for {len(list_posts)} posts")
            # zip stops at the shorter list, so a missing post cannot raise IndexError
            for date_tag, post_tag in zip(list_dates, list_posts):
                raw_date = date_tag.get_text().strip()
                formatted_date = None
                if raw_date:
                    try:
                        # Convert the date in yyyy-mm-dd format
                        date_obj = dateparser.parse(raw_date, date_formats=['%d,%B,%Y'])
                        if date_obj is not None:
                            formatted_date = date_obj.strftime('%Y-%m-%d')
                    except ValueError as e:
                        print(f"Value error: {e}")
                # Delete the <post> and </post> tags and the surrounding whitespace
                serialized_post = str(post_tag)
                m = post_regex.match(serialized_post)
                post = m.group(1).strip() if m is not None else serialized_post
                # Add a row in the file (a None date is written as an empty field)
                writer_obj.writerow([blogger_id, gender, age, industry, zodiac, formatted_date, post])

In [None]:
# Build data.csv from the XML files found in the "blogs" directory
export_to_csv('data.csv')

### Generate the n-grams and POS tags and add them to the data frame

In [None]:
def get_ngrams(text, n=2):
    '''
    Get the n-grams of a given text.

    The text is tokenized with nltk.word_tokenize and the token sequence is
    padded on both sides with "_", so the first and last tokens also appear
    in n-grams of full length.

    Parameters:
        text (str): Text where n-grams are extracted. Other values are
            converted with str(); None and float NaN (a missing post) are
            treated as an empty text.
        n (int): Number of elements of n-grams
    Returns:
        ngrams (list): List of n-grams, each one a tuple of n tokens
    '''
    # A missing post must not be tokenized as the literal word "None" or "nan"
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    tokens = nltk.word_tokenize(str(text))
    # pad_left / pad_right are on/off flags; the padding symbol itself has to be
    # passed as left_pad_symbol / right_pad_symbol, otherwise None is used.
    padded_ngrams = nltk.ngrams(
        tokens,
        n,
        pad_left=True,
        pad_right=True,
        left_pad_symbol="_",
        right_pad_symbol="_",
    )
    # No gc.collect() here: this function runs once per post and a full
    # collection on every call dominates the running time.
    return list(padded_ngrams)

def add_ngrams(df):
    '''
    Compute the n-grams of every post and store them in the data frame.

    get_ngrams is called (with its default size) on each entry of the 'Post'
    column and the results are written to a column named 'Ngrams'. The data
    frame received is modified and returned.

    Parameters:
        df (DataFrame): data frame with a 'Post' column
    Returns:
        df (DataFrame): the same data frame, now holding the 'Ngrams' column
    '''
    ngrams_per_post = []
    for post in df['Post'].values:
        ngrams_per_post.append(get_ngrams(post))
    df['Ngrams'] = ngrams_per_post
    # Run the garbage collector once the column has been built
    gc.collect()
    return df

def get_pos(text):
    '''
    Get the part-of-speech (POS) tags of a given text.

    The text is tokenized with nltk.word_tokenize and the tokens are tagged
    with nltk.tag.pos_tag using the 'universal' tagset.

    Parameters:
        text (str): Text where POS are extracted. Other values are converted
            with str(); None and float NaN (a missing post) are treated as
            an empty text.
    Returns:
        tags (list): List of tuples, each tuple containing a token and its POS
    '''
    # A missing post must not be tagged as the literal word "None" or "nan"
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    tokens = nltk.word_tokenize(str(text))
    # No gc.collect() here: this function runs once per post and a full
    # collection on every call dominates the running time.
    return nltk.tag.pos_tag(tokens, tagset='universal')

def add_pos(df):
    '''
    Adds the POS into the data frame. A column containing the POS
    is added into the data frame.

    get_pos is called on each entry of the 'Post' column and the results are
    written to a column named 'POS'. The data frame received is modified and
    returned.

    Parameters:
        df (DataFrame): data frame where POS are added
    Returns:
        df (DataFrame): data frame with POS
    '''
    posts = df['Post'].values
    # Tag the posts (the previous version recomputed the n-grams here and
    # overwrote the 'Ngrams' column instead of storing POS tags)
    df['POS'] = [get_pos(p) for p in posts]
    del posts
    gc.collect()
    # Return the frame that was actually modified
    return df