In [None]:
!pip install nltk

# Preprocessing Function

In [1]:
import pandas as pd
import string
import unicodedata
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

In [2]:
# Fetch the NLTK resources; each call reports whether the package was downloaded or already present.
nltk.download('stopwords')  # read below through stopwords.words(...)
nltk.download('punkt')  # NOTE(review): not referenced in the visible cells — confirm it is still needed
nltk.download('words')  # NOTE(review): not referenced in the visible cells — confirm it is still needed

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Krlozz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Krlozz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Krlozz\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
# Stop words to discard: the English and Spanish NLTK lists merged into a single set.
stop_words = set(stopwords.words('english')).union(stopwords.words('spanish'))
# Characters stripped from tokens, seeded with the ASCII punctuation characters.
caracteres_especiales = {caracter for caracter in string.punctuation}

In [4]:
# Extra single characters (beyond string.punctuation) that are stripped from tokens.
# The micro sign and the Greek letter mu look identical but are different code points,
# so both are listed explicitly by escape to avoid confusing them.
simbolos_adicionales = [
    # Typographic and numeric signs
    '±', '½', '‰', '„', '“', '³', '¼', '»', 'º', '´',
    # Number-set letters
    'ℝ', 'ℕ', 'ℚ', 'ℂ', 'ℤ',
    # Lowercase Greek letters (tokens are lowercased before these are stripped)
    'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
    '\u03bc',  # Greek small letter mu
    'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ς', 'ϵ',
    '\u00b5',  # micro sign
    # Other letter-like symbols, modifier letters and ligatures
    'ℓ', 'ʽ', 'ʼ', 'œ', 'ʻ', 'ﬁ', 'ﬂ',
    # Line breaks
    '\n', '\r',
]

In [5]:
# Extend the punctuation set in place with the extra symbols.
caracteres_especiales |= set(simbolos_adicionales)

In [6]:
# Single shared TweetTokenizer instance; limpiar_texto uses it to split text into tokens.
tweet_tokenizer = TweetTokenizer()

In [7]:
def _remover_tildes(s):
    """Return `s` without diacritics (NFD-decompose, then drop combining marks)."""
    descompuesto = unicodedata.normalize('NFD', s)
    # Category 'Mn' already covers the whole U+0300–U+036F combining block,
    # so no separate code-point range test is needed.
    sin_marcas = ''.join(c for c in descompuesto if unicodedata.category(c) != 'Mn')
    # 'ŉ' (U+0149) has no canonical decomposition, so NFD leaves it untouched.
    return sin_marcas.replace('ŉ', 'n')


def _procesar_especiales(palabra):
    """Return the token (minus one leading '@'/'#') if alphabetic and longer than 2 chars, else None."""
    if palabra.startswith(('@', '#')):
        palabra = palabra[1:]
    if palabra.isalpha() and len(palabra) > 2:
        return palabra
    return None


def limpiar_texto(texto):
    """Normalize free text into a space-separated string of lowercase alphabetic tokens.

    Steps:
      1. Compose the text (NFC) and rewrite 'ñ'/'Ñ' as 'ni'/'NI'; line breaks and
         hyphens become spaces.
      2. Tokenize with the module-level ``tweet_tokenizer``.
      3. Drop tokens that are stop words (``stop_words``), have 2 characters or
         fewer, or contain a digit.
      4. Lowercase, strip diacritics, remove one leading '@'/'#', and keep only
         purely alphabetic tokens longer than 2 characters.
      5. Remove every character listed in ``caracteres_especiales`` and discard
         tokens that end up empty.

    Args:
        texto: Text to clean. Missing values (None/NaN) and any other non-string
            input yield an empty string instead of raising.

    Returns:
        str: The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    if not isinstance(texto, str):
        return ''

    # Compose first so a decomposed 'n' + U+0303 is seen as 'ñ' by the rule below.
    texto = unicodedata.normalize('NFC', texto)
    # Handle both cases of 'ñ': previously an uppercase 'Ñ' skipped this rule and was
    # reduced to plain 'n' by the accent stripping, so the result depended on case.
    texto = (
        texto.replace('ñ', 'ni').replace('Ñ', 'NI')
             .replace('\n', ' ').replace('\r', ' ').replace('-', ' ')
    )

    # The stop-word test runs on the raw lowercased token, before accents are removed.
    candidatas = (
        _procesar_especiales(_remover_tildes(palabra.lower()))
        for palabra in tweet_tokenizer.tokenize(texto)
        if palabra.lower() not in stop_words
        and len(palabra) > 2
        and not any(char.isdigit() for char in palabra)
    )

    # Alphabetic tokens can still hold listed symbols (e.g. Greek letters); strip them.
    limpias = (
        ''.join(caracter for caracter in palabra if caracter not in caracteres_especiales)
        for palabra in candidatas
        if palabra is not None
    )

    return ' '.join(palabra for palabra in limpias if palabra)

In [9]:
# Test input for limpiar_texto: mixes accents, digits, hashtags/mentions, punctuation and Spanish/English words.
frase = "Este câsa-câsa íno ŉationalisation Î¼atm caí es       un 0.1Î¼gL-1  2023b 61472 #20b #numero #μς \n10% %ejemplo Pingüino de cómo la función debe limpiar y procesar el texto correctamente, mañana. Any facts that might be perceived as a possible conflict of interest of the author(s) must be disclosed in the paper prior to submission.(Test), ¿paper? and ¡letter! XsD @Hashtag #number"

In [10]:
# Display the raw test sentence.
frase

'Este câsa-câsa íno ŉationalisation Î¼atm caí es       un 0.1Î¼gL-1  2023b 61472 #20b #numero #μς \n10% %ejemplo Pingüino de cómo la función debe limpiar y procesar el texto correctamente, mañana. Any facts that might be perceived as a possible conflict of interest of the author(s) must be disclosed in the paper prior to submission.(Test), ¿paper? and ¡letter! XsD @Hashtag #number'

In [11]:
# Run the cleaner on the test sentence.
texto_limpio = limpiar_texto(frase)

In [12]:
# Show the cleaned result.
print(texto_limpio)

casa casa ino nationalisation atm cai numero ejemplo pinguino como funcion debe limpiar procesar texto correctamente maniana facts might perceived possible conflict interest author must disclosed paper prior submission test paper letter xsd hashtag number


# Processed Data

## articles clean

In [13]:
# Load the raw articles export, a semicolon-separated CSV.
articles_path = 'C:/Users/Krlozz/Documents/Tesis/data_JA/articles_v2.csv'
articles_df = pd.read_csv(articles_path, sep=';')

In [14]:
# Inspect the article identifier column.
articles_df["identifier"]

0        85133492759
1        85133293730
2        85132518705
3        85112575431
4        85109263966
            ...     
39946    33646049406
39947    77957209355
39948    84941812731
39949    84944669127
39950    84944670009
Name: identifier, Length: 39951, dtype: int64

In [15]:
# Rename the key column to id_article.
articles_df = articles_df.rename(columns={'identifier': 'id_article'})

In [16]:
# Clean both text columns; missing values are turned into '' before cleaning.
for columna in ('title', 'abstract'):
    articles_df[columna] = articles_df[columna].fillna('').apply(limpiar_texto)

In [17]:
# Keep only the articles whose cleaned abstract is neither empty nor missing.
cleaned_df = articles_df.loc[articles_df['abstract'].ne('') & articles_df['abstract'].notna()]

In [18]:
# Narrow the frame to the key column and the two cleaned text columns.
cleaned_df = cleaned_df.loc[:, ['id_article', 'title', 'abstract']]

## keywords clean

In [19]:
# Load the author keywords export, a comma-separated CSV.
keywords_path = 'C:/Users/Krlozz/Documents/Tesis/data_JA/articles_author_keywords.csv'
keywords_df = pd.read_csv(keywords_path, sep=',')

In [20]:
# Inspect the raw author keywords.
keywords_df["author_keyword"]

0          Audio signals design process
1         Experimental design processes
2                      Fictional spaces
3              Sound and changing forms
4               Facility layout problem
                      ...              
162824                       Solanaceae
162825              Solanum pseudoquina
162826            stereoidal alkaloids.
162827                   Endemic goiter
162828                    Thyroglobulin
Name: author_keyword, Length: 162829, dtype: object

In [21]:
# Clean every author keyword. Missing keywords are turned into '' first, the same way
# the title/abstract columns are handled, so limpiar_texto always receives a string.
keywords_df['cleaned_keywords'] = keywords_df['author_keyword'].fillna('').apply(limpiar_texto)

In [22]:
# Inspect the cleaned keywords.
keywords_df['cleaned_keywords']

0          audio signals design process
1         experimental design processes
2                      fictional spaces
3                  sound changing forms
4               facility layout problem
                      ...              
162824                       solanaceae
162825              solanum pseudoquina
162826             stereoidal alkaloids
162827                   endemic goiter
162828                    thyroglobulin
Name: cleaned_keywords, Length: 162829, dtype: object

In [23]:
# Per article: number of keyword rows and the cleaned keywords joined with ', '.
# pandas labels the lambda column '<lambda_0>', hence the rename key.
keywords_count = (
    keywords_df
    .groupby('article_id')['cleaned_keywords']
    .agg(['count', lambda grupo: ', '.join(grupo)])
    .rename(columns={'count': 'num_keywords', '<lambda_0>': 'keywords'})
)

In [24]:
# Attach the keyword aggregates to each article; articles without keywords get NaN.
cleaned_df = pd.merge(
    cleaned_df,
    keywords_count,
    how='left',
    left_on='id_article',
    right_index=True,
)

In [25]:
#valores NaN en num_keywords
unique_num_keywords = cleaned_df['num_keywords'].unique()
value_counts = cleaned_df['num_keywords'].value_counts(dropna=False)
most_frequent_value = value_counts.idxmax()
cleaned_df['num_keywords'].fillna(most_frequent_value, inplace=True)

In [26]:
# Write the cleaned articles, with their keyword columns, to CSV without the index.
output_articles = 'C:/Users/Krlozz/Documents/Tesis/TesisFinal/ProcessedData/article_clean.csv'
cleaned_df.to_csv(path_or_buf=output_articles, index=False)