# 1.Librerias

In [264]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os 
import json
import glob
from PIL import Image


# Lo ejecuto por unica vez
# import nltk
# nltk.download('vader_lexicon')


# 2.Datos

In [267]:
BASE_DIR = './'

PATH_TO_TRAIN = os.path.join(BASE_DIR, "data/train/train.csv")
#Salida de modelos entrenados
PATH_TO_MODELS = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/models")

#Artefactos a subir a optuna
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/optuna_temp_artifacts")

#Artefactos que optuna gestiona
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/optuna_artifacts")
PATH_TO_IMAGES = os.path.join(BASE_DIR, "data/train_images")
PATH_TO_SENTIMENTS = os.path.join(BASE_DIR, "data/train_sentiment")
PATH_TO_SENTIMENTS_SAVE = os.path.join(BASE_DIR, "data/train")


SEED = 55 #Semilla de procesos aleatorios (para poder replicar exactamente al volver a correr un modelo)
BATCH_SIZE= 50
TEST_SIZE = 0.2 #Facción para train/test= split

In [268]:
train = pd.read_csv('./data/train/train.csv')
color_labels=pd.read_csv('./data/color_labels.csv')
state_labels=pd.read_csv('./data/state_labels.csv')
breed_labels=pd.read_csv('./data/breed_labels.csv')

train = train.merge(breed_labels, left_on=['Breed1', 'Type'], right_on=['BreedID', 'Type'], how='left', suffixes=('', '_PrimaryBreed'))
train = train.merge(breed_labels, left_on=['Breed2', 'Type'], right_on=['BreedID', 'Type'], how='left', suffixes=('', '_SecondaryBreed'))


train_image_files = sorted(glob.glob('./data/train_images/*.jpg'))
train_sentiments_files = sorted(glob.glob('./data/train_sentiment/*.json'))

Agrego el campo objetivo

In [269]:

# Crear los límites para las categorías
bins = [-1, 3, 6, 12, 24, 36, 72, 144, np.inf]  # Empieza en -1 para incluir el 0

# Crear los nombres de las categorías
labels = [0,1,2,3,4,5,6,7]

# Crear la nueva columna 'age_target' utilizando pd.cut
train['Age_target'] = pd.cut(train['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

In [270]:
train.head(3)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,BreedID,BreedName,BreedID_SecondaryBreed,BreedName_SecondaryBreed,Age_target
0,2,Nibble,3,299,0,1,1,7,0,1,...,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,299.0,Tabby,,,0
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,265.0,Domestic Medium Hair,,,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,307.0,Mixed Breed,,,0


# 3.Analisìs de Images and Sentiments 

In [271]:
print('Numero de Imagenes para procesar: {}'.format(len(train_image_files)))
print('Numero de Archivos JSON para procesar: {}'.format(len(train_sentiments_files)))


# Images
trainIds= train[['PetID']]
train_df_images = pd.DataFrame(train_image_files)
train_df_images.columns = ['image_filename']
train_images_pet = train_df_images['image_filename'].apply(lambda x: x.split('/')[-1].split('-')[0])
train_df_images = train_df_images.assign(PetId=train_images_pet)

pet_with_images = len(np.intersect1d(train_images_pet.unique(), trainIds['PetID'].unique() ))
print('Porcentaje de mascotas con imagenes: {:.3f}'.format(pet_with_images/ trainIds.shape[0]))

# Sentiments
trainIds= train[['PetID']]
train_df_sentiments = pd.DataFrame(train_sentiments_files)
train_df_sentiments.columns = ['sentiment_filename']
train_sentiment_pet = train_df_sentiments['sentiment_filename'].apply(lambda x: x.split('/')[-1].split('.')[0])
train_df_sentiments = train_df_sentiments.assign(PetId=train_sentiment_pet)

pet_with_sentiment = len(np.intersect1d(train_sentiment_pet.unique(), trainIds['PetID'].unique() ))
print('Porcentaje de mascotas con archivos de sentimientos: {:.3f}'.format(pet_with_sentiment/ trainIds.shape[0]))



Numero de Imagenes para procesar: 58311
Numero de Archivos JSON para procesar: 14442
Porcentaje de mascotas con imagenes: 0.977
Porcentaje de mascotas con archivos de sentimientos: 0.963


Funciones para tratar los archivos

In [272]:
def open_sentiment_file(filename):
    if os.path.exists(filename):
        with open(filename) as f:
            sentiment_data = json.load(f)
            return sentiment_data
    else:
        return None
       
        
def open_images_file(filename):
    image = np.asarray(Image.open( filename))
    return image

def interprete_sentence(scoreValue):
    if scoreValue is None:
        return "unknown"
    elif scoreValue> 0.25:
        return "postive"
    elif scoreValue< 0.25:
        return "negative"
    else:
        return "Neutral"

Lectura de los archivos JSON con los resultados del sentimental analysis de Natural Lenguaje features API 

documentSentiment
1. score of the sentiment ranges between -1.0 (negative) and 1.0 (positive) and corresponds to the overall emotional leaning of the text.
2. magnitude indicates the overall strength of emotion (both positive and negative) within the given text, between 0.0 and +inf. Unlike score, magnitude is not normalized for documentSentiment; each expression of emotion within the text (both positive and negative) contributes to the text's magnitude (so longer text blocks may have greater magnitudes)

sentences

1. Sentiment contains the sentence level sentiment values attached to each sentence, which contain score between -1.0 (negative) and 1.0 (positive) as and magnitude values between 0.0 and 1.0. Note that magnitude for sentences is normalized.



In [273]:
# Lista para almacenar los resultados
data = []

# Recorremos todos los archivos en el directorio
for archivo in train_sentiment_pet.unique():
    sentiment_data = open_sentiment_file(os.path.join(PATH_TO_SENTIMENTS, archivo +'.json'))  
    # Extraer el score y magnitude de 'documentSentiment'
    doc_sentiment_score = sentiment_data['documentSentiment']['score']
    doc_sentiment_magnitude = sentiment_data['documentSentiment']['magnitude']
    entity_name = ' '.join(entity['name'] for entity in sentiment_data.get('entities', []))
        
    # Recorrer cada oración dentro del campo 'sentences'
    for sentence in sentiment_data.get('sentences', []):
        sentence_text = sentence['text']['content']
        sentence_sentiment_score = sentence['sentiment']['score']
        sentence_sentiment_magnitude = sentence['sentiment']['magnitude']
        # Guardar los datos en la lista

        data.append({
            'PetID': archivo,
            'doc_sentiment_score': doc_sentiment_score,
            'doc_score': interprete_sentence(doc_sentiment_score),
            'doc_sentiment_magnitude': doc_sentiment_magnitude,
            'doc_magnitude': interprete_sentence(doc_sentiment_magnitude),
            'entity_name': entity_name,
            'sentence_sentiment_score': sentence_sentiment_score,
            'sentence_score': interprete_sentence(sentence_sentiment_score),
            'sentence_sentiment_magnitude': sentence_sentiment_magnitude,
            'sentence_magnitude': interprete_sentence(sentence_sentiment_magnitude)
            })

# Convertir los datos a un DataFrame de pandas
df = pd.DataFrame(data)
df.to_csv(os.path.join(PATH_TO_SENTIMENTS_SAVE, 'train_sentimentFE.csv'),index=False)


In [274]:
df.head(3)

Unnamed: 0,PetID,doc_sentiment_score,doc_score,doc_sentiment_magnitude,doc_magnitude,entity_name,sentence_sentiment_score,sentence_score,sentence_sentiment_magnitude,sentence_magnitude
0,0008c5398,0.7,postive,2.8,postive,Ollie construction site house manja type playm...,0.0,negative,0.0,negative
1,0008c5398,0.7,postive,2.8,postive,Ollie construction site house manja type playm...,0.9,postive,0.9,postive
2,0008c5398,0.7,postive,2.8,postive,Ollie construction site house manja type playm...,0.9,postive,0.9,postive


Agrupo caracteristicas extraidas por PETID.

Del archivo JSON voy almacenar:
1. documentSentiment: score, magnitude.
2. sentences: scrore, magnitude.
3. entities: name.
4. Creo una columna mean y sum para cada caracteristica

In [275]:
train_dfFE = pd.read_csv(os.path.join(PATH_TO_SENTIMENTS_SAVE, 'train_sentimentFE.csv'))
train_dfFE = train_dfFE.drop(['doc_score','doc_magnitude','sentence_score','sentence_magnitude'], axis=1)

In [276]:
train_dfFE

Unnamed: 0,PetID,doc_sentiment_score,doc_sentiment_magnitude,entity_name,sentence_sentiment_score,sentence_sentiment_magnitude
0,0008c5398,0.7,2.8,Ollie construction site house manja type playm...,0.0,0.0
1,0008c5398,0.7,2.8,Ollie construction site house manja type playm...,0.9,0.9
2,0008c5398,0.7,2.8,Ollie construction site house manja type playm...,0.9,0.9
3,0008c5398,0.7,2.8,Ollie construction site house manja type playm...,0.8,0.8
4,000a290e4,0.3,0.6,restaurant puppies beach Call teluk kumba Adop...,0.1,0.1
...,...,...,...,...,...,...
73880,fff6f2f61,0.5,3.3,puppies adoption terrier pups mongrel skin pro...,0.9,0.9
73881,fff6f2f61,0.5,3.3,puppies adoption terrier pups mongrel skin pro...,0.1,0.1
73882,fffd78a11,0.8,1.6,people litter box kids kitten,0.9,0.9
73883,fffd78a11,0.8,1.6,people litter box kids kitten,0.7,0.7


In [277]:
train_sentiment_desc = train_dfFE[['PetID', 'entity_name']].drop_duplicates()
train_dfFE = train_dfFE.drop(['entity_name'], axis=1)

In [278]:
function_list = ['mean', 'sum']

for i in train_dfFE.columns:
    if 'PetID' not in i:
        train_dfFE[i] = train_dfFE[i].astype(float)
train_dfFE = train_dfFE.groupby(['PetID']).agg(function_list)


train_dfFE.columns = pd.Index(['{}_{}'.format(
            c[0], c[1].upper()) for c in train_dfFE.columns.tolist()])
train_dfFE = train_dfFE.reset_index()

train_dfFE.head(5)

Unnamed: 0,PetID,doc_sentiment_score_MEAN,doc_sentiment_score_SUM,doc_sentiment_magnitude_MEAN,doc_sentiment_magnitude_SUM,sentence_sentiment_score_MEAN,sentence_sentiment_score_SUM,sentence_sentiment_magnitude_MEAN,sentence_sentiment_magnitude_SUM
0,0008c5398,0.7,2.8,2.8,11.2,0.65,2.6,0.65,2.6
1,000a290e4,0.3,0.6,0.6,1.2,0.3,0.6,0.3,0.6
2,000fb9572,0.3,0.6,0.8,1.6,0.35,0.7,0.35,0.7
3,0011d7c25,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8
4,00156db4a,0.0,0.0,1.8,9.0,0.02,0.1,0.34,1.7


Merge de los dos dataset

In [279]:
train_dfFE = train_dfFE.merge(train_sentiment_desc, how='left', on='PetID')
train= train.merge(train_dfFE,how='left',on='PetID')

In [280]:
train.head(5)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Age_target,doc_sentiment_score_MEAN,doc_sentiment_score_SUM,doc_sentiment_magnitude_MEAN,doc_sentiment_magnitude_SUM,sentence_sentiment_score_MEAN,sentence_sentiment_score_SUM,sentence_sentiment_magnitude_MEAN,sentence_sentiment_magnitude_SUM,entity_name
0,2,Nibble,3,299,0,1,1,7,0,1,...,0,0.3,1.8,2.4,14.4,0.3,1.8,0.366667,2.2,Nibble cuteness clinic cats result kitty coupl...
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0,-0.2,-0.4,0.7,1.4,-0.25,-0.5,0.35,0.7,apartment care
2,1,Brisco,1,307,0,1,2,7,0,2,...,0,0.2,1.4,3.7,25.9,0.2,1.4,0.485714,3.4,mother owner puppies roadside shops Subang Jay...
3,1,Miko,4,307,0,2,1,2,0,2,...,1,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,guard dog master obedience call sms details
4,1,Hunter,1,307,0,1,1,0,0,2,...,0,0.6,3.6,3.7,22.2,0.583333,3.5,0.583333,3.5,boy adoption Hunter pal puppies love age brat ...


Completo los campos NA y vuelvo a guardar el archivo con el analisis

In [281]:
# completo con MISSING los campos nulos de entity name
train['entity_name'] = train['entity_name'].fillna('<MISSING>')
train['Description'] = train['Description'].fillna('<MISSING>')
train.to_csv(os.path.join(PATH_TO_SENTIMENTS_SAVE, 'train_sentimentFE.csv'),index=False)

In [282]:
train.head(4)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Age_target,doc_sentiment_score_MEAN,doc_sentiment_score_SUM,doc_sentiment_magnitude_MEAN,doc_sentiment_magnitude_SUM,sentence_sentiment_score_MEAN,sentence_sentiment_score_SUM,sentence_sentiment_magnitude_MEAN,sentence_sentiment_magnitude_SUM,entity_name
0,2,Nibble,3,299,0,1,1,7,0,1,...,0,0.3,1.8,2.4,14.4,0.3,1.8,0.366667,2.2,Nibble cuteness clinic cats result kitty coupl...
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0,-0.2,-0.4,0.7,1.4,-0.25,-0.5,0.35,0.7,apartment care
2,1,Brisco,1,307,0,1,2,7,0,2,...,0,0.2,1.4,3.7,25.9,0.2,1.4,0.485714,3.4,mother owner puppies roadside shops Subang Jay...
3,1,Miko,4,307,0,2,1,2,0,2,...,1,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,guard dog master obedience call sms details


In [283]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF

text_columns = ['Description', 'entity_name']
X_text= train[text_columns]
n_components = 5
text_features = []


# Generate text features:
for i in X_text.columns:
    
    # Initialize decomposition methods:
    print('generating features from: {}'.format(i))
    svd_ = TruncatedSVD(
        n_components=n_components, random_state=1337)
    nmf_ = NMF(
        n_components=n_components, random_state=1337)
    
    tfidf_col = TfidfVectorizer().fit_transform(X_text.loc[:, i].values)
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('SVD_{}_'.format(i))
    
    nmf_col = nmf_.fit_transform(tfidf_col)
    nmf_col = pd.DataFrame(nmf_col)
    nmf_col = nmf_col.add_prefix('NMF_{}_'.format(i))
    
    text_features.append(svd_col)
    text_features.append(nmf_col)

    
# Combine all extracted features:
text_features = pd.concat(text_features, axis=1)

# Concatenate with main DF:
train = pd.concat([train, text_features], axis=1)

# Remove raw text columns:
for i in X_text.columns:
    train = train.drop(i, axis=1)

generating features from: Description
generating features from: entity_name


In [284]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 56 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   Type                               14993 non-null  int64   
 1   Name                               13728 non-null  object  
 2   Age                                14993 non-null  int64   
 3   Breed1                             14993 non-null  int64   
 4   Breed2                             14993 non-null  int64   
 5   Gender                             14993 non-null  int64   
 6   Color1                             14993 non-null  int64   
 7   Color2                             14993 non-null  int64   
 8   Color3                             14993 non-null  int64   
 9   MaturitySize                       14993 non-null  int64   
 10  FurLength                          14993 non-null  int64   
 11  Vaccinated                         14993 

elimino columnas que no voy a utilizar

In [285]:
train = train.drop(['PetID', 'Name', 'RescuerID'], axis=1)

factorizo las columnas raza de mascotas

In [286]:
for i in ['BreedName', 'BreedName_SecondaryBreed']:
    train.loc[:, i] = pd.factorize(train.loc[:, i])[0]

guardo el archivo final

In [287]:
train.to_csv(os.path.join(PATH_TO_SENTIMENTS_SAVE, 'train_sentimentFE_final.csv'),index=False)