# Procesamiento de datos

### El objetivo es limpiar, transformar y preparar los datos para el análisis y modelado posterior

## Importación de librerías

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import string
import re

In [12]:
import nltk
from nltk.corpus import stopwords

In [13]:
# Descargar stopwords si es necesario
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/juancarlos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Como nuestros datos están en inglés usaremos stopwords en inglés

In [14]:
# Configurarando stopwords
stop_words = set(stopwords.words('english'))

## Carga de datos

In [15]:
df_awards = pd.read_csv('../data/processed/data_awards.csv')
df_awards.head()

Unnamed: 0,FileName,AwardTitle,AGENCY,AwardEffectiveDate,AwardExpirationDate,AwardAmount,AbstractNarration
0,2002362.xml,RUI: Terpenes as Versatile Building Blocks fo...,NSF,2020-06-01,2025-05-31,295200,"In this project funded by the Macromolecular, ..."
1,2001671.xml,Conference on the Food-Energy-Water Nexus,NSF,2019-12-01,2020-05-31,30000,This Food-Energy-Water Nexus Conference will b...
2,2002149.xml,PostDoctoral Research Fellowship,NSF,2020-09-01,2024-08-31,150000,This award is made as part of the FY 2020 Math...
3,2001502.xml,Tropical and nonarchimedean analytic methods i...,NSF,2020-09-01,2024-08-31,359739,Algebraic geometry studies solution sets of sy...
4,2002922.xml,Pendant Photochromic Switches Enabling Fluxion...,NSF,2020-07-01,2023-06-30,600000,"With this award, the Macromolecular, Supramole..."


## Manejo de datos faltantes

In [30]:
# Mostrando la cantidad de datos faltantes por columna
missing_data = df_awards.isnull().sum().sort_values(ascending=False)
print(f"Hay {missing_data.values[0]} valores faltantes en la columna '{missing_data.index[0]}'")

Hay 14 valores faltantes en la columna 'AbstractNarration'


In [31]:
df_awards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   FileName             1000 non-null   object
 1   AwardTitle           1000 non-null   object
 2   AGENCY               1000 non-null   object
 3   AwardEffectiveDate   1000 non-null   object
 4   AwardExpirationDate  1000 non-null   object
 5   AwardAmount          1000 non-null   int64 
 6   AbstractNarration    986 non-null    object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [32]:
df_awards[df_awards['AbstractNarration'].isnull()]

Unnamed: 0,FileName,AwardTitle,AGENCY,AwardEffectiveDate,AwardExpirationDate,AwardAmount,AbstractNarration
43,2002888.xml,FY 20 Indirect Cost Negotiation IAA,NSF,2020-01-15,2021-01-31,156609,
79,2001499.xml,"New IPA Assignment effective October 15, 2019 ...",NSF,2019-10-15,2022-10-14,661383,
100,2001273.xml,Interagency Agreement between NSF and FPS,NSF,2019-10-15,2020-10-31,4745300,
226,2002833.xml,"OPP/AIL uses the Defense Logistics Agency, Def...",NSF,2019-10-30,2024-08-30,39209874,
240,2001275.xml,IAA with NCUA,NSF,2019-10-15,2020-10-31,23936,
247,2001029.xml,FY20 IPA Award,NSF,2019-10-30,2020-10-29,229870,
448,2001271.xml,Federal Investigations Reimbursable Billing,NSF,2019-11-15,2024-11-30,680000,
511,2002070.xml,Inter-agency Agreement Fund cite for the NSF D...,NSF,2019-10-16,2020-10-31,3799,
526,2001276.xml,OPM Credit Monitoring Services,NSF,2019-10-15,2020-10-31,4023,
612,2001846.xml,IAA: 2019 Research and Development Survey Oper...,NSF,2019-11-01,2021-11-30,3750504,


In [33]:
# Opciones para manejar datos faltantes:
# 1. Eliminar filas/columnas con datos faltantes
df_awards.dropna(inplace=True)

# 2. Rellenar datos faltantes con un valor específico (ej. la media)
# df_awards.fillna(df_awards.mean(), inplace=True)

# Confirmar que no haya datos faltantes
df_awards.isnull().sum().sort_values(ascending=False)

FileName               0
AwardTitle             0
AGENCY                 0
AwardEffectiveDate     0
AwardExpirationDate    0
AwardAmount            0
AbstractNarration      0
dtype: int64

In [34]:
# Opciones para manejar datos faltantes:
# 1. Eliminar filas/columnas con datos faltantes
df_awards.dropna(inplace=True)

# 2. Rellenar datos faltantes con un valor específico (ej. la media)
# df_awards.fillna(df_awards.mean(), inplace=True)

# Confirmar que no haya datos faltantes
df_awards.isnull().sum().sort_values(ascending=False)

FileName               0
AwardTitle             0
AGENCY                 0
AwardEffectiveDate     0
AwardExpirationDate    0
AwardAmount            0
AbstractNarration      0
dtype: int64

## Limpieza y Normalización de Texto

In [35]:
# Función para limpiar texto
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Aplicar limpieza de texto a las columnas de texto
df_awards['CleanTitle'] = df_awards['AwardTitle'].apply(lambda x: clean_text(str(x)))
df_awards['CleanAbstract'] = df_awards['AbstractNarration'].apply(lambda x: clean_text(str(x)))

# Mostrar algunas filas del DataFrame con las columnas de texto limpio
df_awards[['CleanTitle', 'CleanAbstract']].head()


Unnamed: 0,CleanTitle,CleanAbstract
0,rui terpenes as versatile building blocks for ...,in this project funded by the macromolecular s...
1,conference on the foodenergywater nexus,this foodenergywater nexus conference will be ...
2,postdoctoral research fellowship,this award is made as part of the fy mathemati...
3,tropical and nonarchimedean analytic methods i...,algebraic geometry studies solution sets of sy...
4,pendant photochromic switches enabling fluxion...,with this award the macromolecular supramolecu...


##  Vectorización de Texto

In [39]:
stop_words = list(stop_words)

In [41]:
# Inicializar el vectorizador TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)

# Ajustar y transformar los datos de texto
tfidf_matrix = tfidf_vectorizer.fit_transform(df_awards['CleanAbstract'])

# Convertir la matriz TF-IDF en un DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Mostrar algunas filas del DataFrame TF-IDF
tfidf_df.head()


Unnamed: 0,ability,able,academic,access,accurate,achieve,across,active,activities,activity,...,workshop,workshops,world,worthy,would,year,years,yet,young,youth
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018414,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088688,0.0,0.0,...,0.0,0.0,0.0,0.025105,0.087981,0.0,0.0,0.0,0.092155,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.043754,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.016141,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.058718,0.0,0.0,0.041358,0.0,...,0.0,0.0,0.0,0.016349,0.0,0.0,0.0,0.0,0.0,0.0


## Reducción de Dimensionalidad

In [42]:
# Inicializando PCA para reducir la dimensionalidad a 3 componentes
pca = PCA(n_components=3)
pca_result = pca.fit_transform(tfidf_df)

# Crear un DataFrame con los resultados de PCA
pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2', 'PC3'])

# Mostrar algunas filas del DataFrame PCA
pca_df.head()


Unnamed: 0,PC1,PC2,PC3
0,-0.030563,-0.083572,-0.05283
1,-0.047028,-0.080536,-0.045201
2,0.911343,0.114725,-0.074548
3,0.014416,-0.194908,0.347011
4,-0.053558,-0.109568,-0.055748


## Guardar los Datos Procesados

In [43]:
# Guardar el DataFrame PCA en un archivo CSV
pca_df.to_csv('../data/processed/pca_data.csv', index=False)

# Guardar el DataFrame TF-IDF en un archivo CSV
tfidf_df.to_csv('../data/processed/tfidf_data.csv', index=False)


## Conclusiones y Próximos Pasos


- Se han limpiado y normalizado los textos del dataset.
- Se han convertido los textos en representaciones numéricas utilizando TF-IDF.
- Se ha reducido la dimensionalidad de los datos utilizando PCA.


## Próximos Pasos
1. Implementar un modelo de clustering para agrupar los premios por similitud semántica.
2. Visualizar los clusters resultantes en 3D.
3. Evaluar la calidad de los clusters y ajustar el modelo según sea necesario.
