In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer


In [5]:
df = pd.read_csv('janson.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1634 entries, 0 to 1633
Data columns (total 37 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ArtistName                       1634 non-null   object 
 1   EditionNumber                    1634 non-null   float64
 2   NumberofWorldCatPublications     0 non-null      float64
 3   ArtistUniqueID                   1634 non-null   int64  
 4   ArtistNationality                1634 non-null   object 
 5   ArtistGender                     1634 non-null   object 
 6   ArtistRace                       1634 non-null   object 
 7   ArtistEthnicity                  1634 non-null   object 
 8   TitleofWork                      1634 non-null   object 
 9   Year                             1634 non-null   int64  
 10  WidthofActualWork(cm)            1593 non-null   object 
 11  HeightofWorkinJanson(cm)         1634 non-null   float64
 12  WidthofWorkinJanson(

In [7]:
df.columns

Index(['ArtistName', 'EditionNumber', 'NumberofWorldCatPublications',
       'ArtistUniqueID ', 'ArtistNationality', 'ArtistGender', 'ArtistRace',
       'ArtistEthnicity', 'TitleofWork', 'Year', 'WidthofActualWork(cm)',
       'HeightofWorkinJanson(cm)', 'WidthofWorkinJanson(cm)',
       'LengthofText(cm)', 'WidthofText(cm)', 'AreaWorkinJanson(cm^2)',
       'AreaTextinJanson(cm^2)', 'TotalSpace(cm^2)', 'PageArea(cm^2)',
       'SpaceRatioPerPage', 'Book', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26',
       'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30',
       'WidthofActualWork(in)', 'Unnamed: 32',
       'HeightofWorkinJanson(in inches)', 'WidthofWorkinJanson(in)',
       'LengthofText(in)', 'WidthofText(in)'],
      dtype='object')

In [8]:
df.sample()

Unnamed: 0,ArtistName,EditionNumber,NumberofWorldCatPublications,ArtistUniqueID,ArtistNationality,ArtistGender,ArtistRace,ArtistEthnicity,TitleofWork,Year,WidthofActualWork(cm),HeightofWorkinJanson(cm),WidthofWorkinJanson(cm),LengthofText(cm),WidthofText(cm),AreaWorkinJanson(cm^2),AreaTextinJanson(cm^2),TotalSpace(cm^2),PageArea(cm^2),SpaceRatioPerPage,Book,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,WidthofActualWork(in),Unnamed: 32,HeightofWorkinJanson(in inches),WidthofWorkinJanson(in),LengthofText(in),WidthofText(in)
341,Edward Steichen,6.0,,61,American,Male,White,Not Hispanic or Latinx,"Rodin with His Sculptures ""Victor Hugo"" and ""T...",2001,32.385,10.4775,12.065,7.62,8.89,126.411038,67.7418,194.152838,640.2,0.303269,janson,,,,,,,,,,,12.75,,4.125,4.75,3.0,3.5


In [9]:
def eliminar_columnas(df):
    columnas_a_eliminar = [
        'WidthofActualWork(cm)',
        'HeightofWorkinJanson(cm)', 'WidthofWorkinJanson(cm)',
        'LengthofText(cm)', 'WidthofText(cm)',
        'AreaWorkinJanson(cm^2)', 'AreaTextinJanson(cm^2)',
        'TotalSpace(cm^2)', 'PageArea(cm^2)',
        'SpaceRatioPerPage', 'Book', 
        'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24',
        'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28',
        'Unnamed: 29', 'Unnamed: 30',
        'WidthofActualWork(in)', 'Unnamed: 32',
        'HeightofWorkinJanson(in inches)', 'WidthofWorkinJanson(in)',
        'LengthofText(in)', 'WidthofText(in)', 'NumberofWorldCatPublications','EditionNumber','ArtistUniqueID '
    ]
    
    # Eliminar solo las columnas que existan en el DataFrame
    df_filtrado = df.drop(columns=[col for col in columnas_a_eliminar if col in df.columns])
    return df_filtrado


In [10]:
df = eliminar_columnas(df)

In [11]:
df.columns

Index(['ArtistName', 'ArtistNationality', 'ArtistGender', 'ArtistRace',
       'ArtistEthnicity', 'TitleofWork', 'Year'],
      dtype='object')

In [12]:
# Función para aplicar un EDA básico
def eda_basico(df):
    print("========== RESUMEN GENERAL ==========")
    print("Dimensiones:")
    print(F"Este DataFrame tiene {df.shape[0]} filas y {df.shape[1]} columnas." , "\n")
    print()
    print()
    print('-----------------------------------------------------')
    print("Información general:")
    display(df.info())
    print()
    print()
    print('-----------------------------------------------------')
    print("Tipos de datos por columna:")
    print(df.dtypes, "\n")
    print()
    print()
    print('-----------------------------------------------------')
    
    print("\n========== DESCRIBES ==========")
    print("Descripción de columnas numéricas:")
    display(df.describe().T)
    print()
    print()
    print('-----------------------------------------------------')

    print("Descripción de columnas categóricas:")
    try:
        display(df.describe(include=['O']))
    except ValueError:
        print("No se existen columnas categóricas en este DataFrame.")
    print()
    print()
    print('-----------------------------------------------------')

    print("\n========== MUESTRAS ==========")
    print("Primeras filas del DataFrame:")
    display(df.head())
    print()
    print()
    print("Últimas filas del DataFrame:")
    display(df.tail(5))
    print()
    print()
    print("Cinco filas aleatorias del DataFrame:")
    display(df.sample(5))
    print()
    print()
    print('-----------------------------------------------------')

    print("\n========== VALUE COUNTS (por columna categórica) ==========")
    col_categoricas =  df.select_dtypes(include=["object", "category"]).columns.tolist()

    if col_categoricas:
        for c in col_categoricas:
            print(f"\n--- {c} ---")
            print(df[c].value_counts)
    else:
        print("No se puede realizar la función .value_counts ya que no existen columnas categóricas.")
    print('-----------------------------------------------------')

   
    print("\n========== DUPLICADOS ==========")
    print("Filas duplicadas:")
    dup_count = df.duplicated().sum()
    print(f"Hay {dup_count} filas duplicadas.")
    if dup_count > 0:
        print("Ejemplo de duplicados:")
        print(df[df.duplicated()].head(), "\n")
    else:
        print("No hay filas duplicadas.\n")
    print('-----------------------------------------------------')


In [13]:
eda_basico(df)

Dimensiones:
Este DataFrame tiene 1634 filas y 7 columnas. 



-----------------------------------------------------
Información general:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1634 entries, 0 to 1633
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ArtistName         1634 non-null   object
 1   ArtistNationality  1634 non-null   object
 2   ArtistGender       1634 non-null   object
 3   ArtistRace         1634 non-null   object
 4   ArtistEthnicity    1634 non-null   object
 5   TitleofWork        1634 non-null   object
 6   Year               1634 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 89.5+ KB


None



-----------------------------------------------------
Tipos de datos por columna:
ArtistName           object
ArtistNationality    object
ArtistGender         object
ArtistRace           object
ArtistEthnicity      object
TitleofWork          object
Year                  int64
dtype: object 



-----------------------------------------------------

Descripción de columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,1634.0,1992.302326,14.07248,1963.0,1986.0,1995.0,2001.0,2011.0




-----------------------------------------------------
Descripción de columnas categóricas:


Unnamed: 0,ArtistName,ArtistNationality,ArtistGender,ArtistRace,ArtistEthnicity,TitleofWork
count,1634,1634,1634,1634,1634,1634
unique,245,32,2,4,2,456
top,Pablo Picasso,French,Male,White,Not Hispanic or Latinx,Self-Portrait
freq,68,554,1511,1602,1504,23




-----------------------------------------------------

Primeras filas del DataFrame:


Unnamed: 0,ArtistName,ArtistNationality,ArtistGender,ArtistRace,ArtistEthnicity,TitleofWork,Year
0,A. R. Penck,German,Male,White,Not Hispanic or Latinx,The Demon of Curiosity,1995
1,A. R. Penck,German,Male,White,Not Hispanic or Latinx,The Demon of Curiosity,2001
2,Aaron Siskind,American,Male,White,Not Hispanic or Latinx,New York 2,1986
3,Aaron Siskind,American,Male,White,Not Hispanic or Latinx,New York 2,1991
4,Aaron Siskind,American,Male,White,Not Hispanic or Latinx,New York 2,1995




Últimas filas del DataFrame:


Unnamed: 0,ArtistName,ArtistNationality,ArtistGender,ArtistRace,ArtistEthnicity,TitleofWork,Year
1629,Winslow Homer,American,Male,White,Not Hispanic or Latinx,The Morning Bell,1991
1630,Winslow Homer,American,Male,White,Not Hispanic or Latinx,Snap the Whip,1995
1631,Winslow Homer,American,Male,White,Not Hispanic or Latinx,Snap the Whip,2001
1632,Winslow Homer,American,Male,White,Not Hispanic or Latinx,Snap the Whip,2007
1633,Winslow Homer,American,Male,White,Not Hispanic or Latinx,Snap the Whip,2011




Cinco filas aleatorias del DataFrame:


Unnamed: 0,ArtistName,ArtistNationality,ArtistGender,ArtistRace,ArtistEthnicity,TitleofWork,Year
1093,Marcel Duchamp,French,Male,White,Not Hispanic or Latinx,Tu m',1969
810,James Abbott McNeill Whistler,American,Male,White,Not Hispanic or Latinx,Harmony in Blue and Gold: The Peacock Room,1995
1018,Julia Margaret Cameron,British,Female,White,Not Hispanic or Latinx,Portrait of Ellen Terry,1986
1339,Paul Klee,Swiss-German,Male,White,Not Hispanic or Latinx,Park near Lu,1969
550,Georges Braque,French,Male,White,Not Hispanic or Latinx,The Portuguese,2011




-----------------------------------------------------


--- ArtistName ---
<bound method IndexOpsMixin.value_counts of 0                             A. R. Penck
1                             A. R. Penck
2                           Aaron Siskind
3                           Aaron Siskind
4                           Aaron Siskind
5                           Aaron Siskind
6                         Adolph Gottlieb
7                         Adolph Gottlieb
8              Adolphe William Bouguereau
9              Adolphe William Bouguereau
10                       Albert Bierstadt
11                   Albert Pinkham Ryder
12                   Albert Pinkham Ryder
13                  Albert Renger-Patzsch
14                  Albert Renger-Patzsch
15                  Albert Renger-Patzsch
16                  Albert Renger-Patzsch
17                       Alexander Cozens
18                       Alexander Cozens
19                       Alexander Cozens
20                       Alexander Coze

In [14]:
df= df.rename(columns={
    'ArtistName': 'Artist',
    'TitleofWork': 'Title',
    'ArtistNationality': 'Nationality',
    'ArtistGender': 'Gender',
    'ArtistRace': 'Race',
    'ArtistEthnicity': 'Ethnicity'
})

In [15]:
df.sample()

Unnamed: 0,Artist,Nationality,Gender,Race,Ethnicity,Title,Year
687,Henri Matisse,French,Male,White,Not Hispanic or Latinx,Piano Lesson,2001


In [16]:
df.to_csv('df_jansonr_clean.csv', index=False)