# 3.0 - Data Integration

In [1]:
import os
import nltk
import pathlib
import pandas as pd

# Directory that holds the response files (*.jsonl) read further down.
in_path = pathlib.Path('..') / 'data' / 'processed' / 'responses'

In [2]:
# Load the processed articles table from its parquet file.
df_articles = pd.read_parquet(
    '../data/processed/articles_data.parquet',
    engine='pyarrow',
)

In [3]:
# Read every JSON-lines response file found in ``in_path`` and stack them into
# a single frame with a fresh 0..n-1 index.
#
# The per-file frames are collected first and concatenated once: calling
# ``pd.concat`` inside the loop re-copies all previously accumulated rows on
# every iteration. File names are sorted so the row order of the result does
# not depend on the arbitrary order returned by ``os.listdir``.
response_frames = [
    pd.read_json(in_path / _file, lines=True)
    for _file in sorted(os.listdir(in_path))
    if _file.endswith('.jsonl')
]
# ``pd.concat`` raises on an empty list, so fall back to an empty frame when
# no response file is present (the result the original loop produced).
df_responses = (
    pd.concat(response_frames, axis=0).reset_index(drop=True)
    if response_frames
    else pd.DataFrame()
)

In [4]:
# Attach the response record to each article by matching the article ``id``
# against the response ``custom_id``. This is a left join, so articles without
# a response are kept; response columns whose names clash with article columns
# receive a ``_y`` suffix.
df_all = pd.merge(
    df_articles,
    df_responses,
    how='left',
    left_on='id',
    right_on='custom_id',
    suffixes=('', '_y'),
)

In [5]:
# Keep only the articles that were matched to a response (non-null
# ``custom_id``); the original index labels are preserved.
# The explicit ``.copy()`` makes ``df_all`` an independent frame, so the column
# assignments performed afterwards do not act on a slice of the merged frame,
# which pandas reports as SettingWithCopyWarning.
df_all = df_all.loc[df_all['custom_id'].notnull()].copy()

In [6]:
def clean_text(row):
    """Return the message content of the first choice in a response record.

    ``row`` must be a nested mapping shaped like
    ``{'body': {'choices': [{'message': {'content': ...}}, ...]}}``.
    Only the first entry of ``choices`` is read; a missing key or an empty
    ``choices`` list raises ``KeyError`` / ``IndexError``.
    """
    first_choice = row['body']['choices'][0]
    return first_choice['message']['content']
# Pull the message content out of every response record into its own column.
df_all['text cleaned'] = df_all['response'].map(clean_text)

In [7]:
# One publication timestamp carries a mistyped year ('0201'); overwrite it with
# the corrected 2021 value.
df_all.loc[
    df_all['object_pub_date'].eq('0201-12-19 10:46:09'),
    'object_pub_date',
] = '2021-12-19 10:46:09'

In [8]:
# Expose the fields under their final column names. The source columns are left
# in place; ``publication date`` keeps only the calendar date of the parsed
# ``object_pub_date`` timestamp.
df_all = df_all.assign(**{
    'title': df_all['object_title'],
    'category': df_all['domain_category'],
    'post type': df_all['object_post_type'],
    'url': df_all['object_url'],
    'text': df_all['object_text'],
    'links words rate': df_all['links_words_rate'],
    'publication date': pd.to_datetime(
        df_all['object_pub_date'], format='%Y-%m-%d %H:%M:%S'
    ).dt.date,
})

In [223]:
def clean_text_gpt(row):
    """Split a text into stripped, non-empty chunks using simple string rules.

    A ``'|'`` separator is inserted after every ``'? '`` (the question mark is
    kept), in place of every newline, carriage return and tab, and in place of
    every ``'. '`` (the period is dropped). The text is then split on ``'|'``,
    so pipes already present in the input also act as separators.

    ``row`` is the text itself (a ``str``); the return value is a list of
    chunks with surrounding whitespace removed and empty chunks discarded.
    """
    marked = row.replace('? ', '? |')
    for whitespace_char in ('\n', '\r', '\t'):
        marked = marked.replace(whitespace_char, '|')
    marked = marked.replace('. ', ' | ')

    stripped = (piece.strip() for piece in marked.split('|'))
    return [piece for piece in stripped if piece]

In [9]:
def clean_text_nltk(row):
    """Split a text into sentences with ``nltk.sent_tokenize``.

    ``row`` is the text itself (a ``str``). Every ``'|'`` is first replaced by
    ``' . '`` -- presumably so that pipe-separated fragments end up as separate
    sentences (confirm against the input data). Returns the list produced by
    the tokenizer.
    """
    return nltk.sent_tokenize(row.replace('|', ' . '))

In [10]:
# Store the sentence-level split of the cleaned text as a list per row.
df_all['text chunked'] = df_all['text cleaned'].apply(clean_text_nltk)

In [11]:
# Find the rows whose chunked text holds exactly one sentence and report how
# many there are together with their index labels.
# The length check is done on the whole column instead of looping with
# ``iterrows``, which builds a Series for every row and is slow on large
# frames; ``indexes`` and ``count`` end up with the same values as before.
single_sentence = (df_all['text chunked'].map(len) == 1).to_numpy()
indexes = df_all.index[single_sentence].tolist()
count = len(indexes)
print(count, indexes)

8 [657, 1127, 11760, 13023, 25356, 25359, 27359, 27642]


In [12]:
# Drop the single-sentence rows collected in ``indexes``; per the original note
# these are video, photogallery or file-download posts that carry no text.
df_all = df_all.drop(index=indexes)

In [14]:
# Columns kept in the exported dataset, in output order.
df_cleaned = df_all.loc[:, [
    'id',
    'title',
    'publication date',
    'category',
    'post type',
    'domain',
    'url',
    'topics',
    'links words rate',
    'text',
    'text cleaned',
    'text chunked',
]]

In [26]:
# Write the cleaned table to parquet; the index is not stored in the file.
df_cleaned.to_parquet(
    '../data/processed/cleaned_data.parquet',
    engine='pyarrow',
    index=False,
)