# Limpieza de técnica de arte

In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
# Load version 2 of the cleaned dataset; the first CSV column is used as the
# row index. `info()` prints the column names, dtypes and non-null counts.
datos = pd.read_csv(
    filepath_or_buffer='../datasets/clean/clean_mut_art_ver2.csv',
    index_col=0,
)
datos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10907 entries, 807 to 20123
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        10907 non-null  object 
 1   height       10853 non-null  float64
 2   length       10747 non-null  float64
 3   art_type     10907 non-null  object 
 4   medium_text  10907 non-null  object 
 5   date_text    10907 non-null  int64  
 6   age          10907 non-null  int64  
 7   house        10907 non-null  object 
 8   fecha        10907 non-null  object 
 9   price        10451 non-null  float64
 10  author       10907 non-null  object 
 11  performance  10907 non-null  int64  
 12  source       10907 non-null  object 
 13  country      10907 non-null  object 
dtypes: float64(3), int64(3), object(8)
memory usage: 1.2+ MB


In [3]:
# Collect every whitespace-separated token of `medium_text` so that all
# occurrences of each word can be counted.
# aux2: one row per artwork, one column per token position (short rows are padded
# with missing values).
aux2 = pd.DataFrame(datos.medium_text.apply(str.split).to_list())

# aux3: every token stacked into a single `word` column. Flattening in
# column-major order ('F') yields the same row order as appending the columns of
# aux2 one after another, without re-copying a growing frame on each iteration.
aux3 = (pd.DataFrame({'word': aux2.to_numpy(dtype=object).ravel(order='F')})
        .dropna()
        .reset_index(drop=True)
       )

In [4]:
# Words of interest: strip ASCII punctuation and lower-case every token, count
# the occurrences of each cleaned word, and keep the words that are longer than
# 2 characters and appear more than 100 times. The result is a NumPy array of
# words sorted in ascending order.
words = (aux3
         .word
         .map(lambda token: token.translate(str.maketrans('', '', string.punctuation)).lower())
         .to_frame()
         .groupby('word')
         .size()
         .reset_index(name='cuenta')
         .loc[lambda counts: (counts.word.str.len() > 2) & (counts.cuenta > 100)]
         .sort_values('word')
         .word
         .to_numpy()
        )

In [5]:
# Tokenize each technique description: remove ASCII punctuation, lower-case,
# then split on whitespace. The token list is stored in `list_technique`.
aux = datos.assign(
    list_technique=datos.medium_text.map(
        lambda text: text.translate(str.maketrans('', '', string.punctuation)).lower().split()
    )
)

# Append one int8 indicator column per word of interest: 1 when the word is
# among the row's tokens, 0 otherwise.
for word in words:
    aux.insert(
        loc=aux.shape[1],
        column=word,
        value=aux.list_technique.map(lambda tokens: word in tokens).astype('int8'),
        allow_duplicates=True,
    )

# Preview the first row without the original free-text column
aux.drop(columns=['medium_text']).head(n=1)

Unnamed: 0,title,height,length,art_type,date_text,age,house,fecha,price,author,...,rods,sand,screenprint,silkscreen,silver,tempera,watercolor,with,wood,wove
807,"""Experiencia"" by Jerson Jimenez, 2021",127.0,40.0,Charity Auction of the Salvatorians,2021,1,"Dorotheum, Vienna",2022-09-21,705.0,Jerson Jimenez,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def clean_technique(df, len_threshold=2, count_threshold=100):
    """Turn the free-text ``medium_text`` column into 0/1 word-indicator columns.

    Each ``medium_text`` value is stripped of ASCII punctuation, lower-cased and
    split on whitespace. Every word that is strictly longer than
    ``len_threshold`` characters and occurs strictly more than
    ``count_threshold`` times in the whole column (every occurrence counts, not
    just one per row) becomes an ``int8`` column that holds 1 where the row's
    tokens contain the word and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``medium_text`` column. Missing values are treated as
        empty text. The frame itself is not modified.
    len_threshold : int, default 2
        A word is kept only if ``len(word) > len_threshold``.
    count_threshold : int, default 100
        A word is kept only if its total number of occurrences is
        ``> count_threshold``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` without ``medium_text``, on a fresh 0..n-1 index,
        followed by ``list_technique`` (the list of cleaned tokens per row) and
        one indicator column per selected word, in sorted word order.

    Notes
    -----
    As a side effect, ``pd.options.display.max_columns`` is set to 999 so that
    all the generated columns are visible when the result is displayed.
    """
    # Tokenize once: removing punctuation never touches whitespace, so the same
    # token lists serve both the word count and the per-row membership test.
    strip_punct = str.maketrans('', '', string.punctuation)
    token_lists = [
        text.translate(strip_punct).lower().split()
        for text in df['medium_text'].fillna('').astype(str)
    ]

    # Count every occurrence of every cleaned word across the whole column
    counts = pd.Series(
        [token for tokens in token_lists for token in tokens], dtype=object
    ).value_counts()

    # Keep the long-enough, frequent-enough words, in sorted order
    words = sorted(
        word for word, count in counts.items()
        if len(word) > len_threshold and count > count_threshold
    )

    # Drop the raw text column and renumber the rows
    base = df.drop(columns=['medium_text']).reset_index(drop=True)
    base['list_technique'] = pd.Series(token_lists, index=base.index, dtype=object)

    # Build all indicator columns in one frame instead of copying the whole
    # DataFrame once per word.
    # NOTE(review): a selected word equal to an existing column name (e.g.
    # "price") yields a duplicate column label, as the per-word rename did.
    token_sets = [set(tokens) for tokens in token_lists]
    indicators = pd.DataFrame(
        {word: [word in tokens for tokens in token_sets] for word in words},
        index=base.index,
    ).astype('int8')

    result = pd.concat([base, indicators], axis=1)

    # Change the pandas display option so that every word column can be seen
    pd.options.display.max_columns = 999
    return result

In [7]:
# Run the full technique-cleaning step on the loaded data, save the outcome as
# version 3 of the cleaned dataset (without the row index), and preview it.
result = clean_technique(df=datos)
result.to_csv(path_or_buf='../datasets/clean/clean_mut_art_ver3.csv', index=False)
result.head(n=2)

Unnamed: 0,title,height,length,art_type,date_text,age,house,fecha,price,author,performance,source,country,list_technique,acrylic,and,aquatint,arches,black,board,bronze,brown,canvas,cardboard,charcoal,color,colors,colours,crayon,down,etching,gelatin,gouache,graphite,guarro,handmade,ink,laid,lithograph,lithographs,masonite,media,metal,mixed,mixografía,mixograph,mounted,nylon,oil,painted,panel,paper,pastel,patina,pencil,plexiglas,plexiglass,print,printed,rice,rods,sand,screenprint,silkscreen,silver,tempera,watercolor,with,wood,wove
0,"""Experiencia"" by Jerson Jimenez, 2021",127.0,40.0,Charity Auction of the Salvatorians,2021,1,"Dorotheum, Vienna",2022-09-21,705.0,Jerson Jimenez,-420,Mutual Art,-,"[oil, on, canvas]",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Relief 8 by Julio Le Parc, 1970",104.14,41.0,Modern and Contemporary Art,1970,52,"Dorotheum, Vienna",2022-09-20,5154.0,Julio Le Parc,-420,Mutual Art,-,"[screenprint, on, cardboard]",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
