# Proyecto: Manipulación previa de datos 


### Algunas funciones usadas en este script:
`.nunique()` <- muestra el número de clases en la información de cada columna de un data frame

`.unique()` <- muestra las clases de información en una variable (columna) dada

`.value_counts()` <- Extrae la frecuencia (conteo) de cada clase de datos de una variable (columna) dada

`pd.get_dummies( variable_categorica )` <- Convierte una variable categórica en variable numérica (Ejemplo: Clase A ->[1, 0]; Clase B ->[0, 1])

`pd.to_datetime( variable object )` <- Convierte una variable de tipo objeto (string) al tipo datetime (fecha)

`.rename( columns = {'column_name':'new_column_name'}, inplace = True )` <- Renombra columnas

In [1]:
import pandas as pd

In [2]:
# Cargamos datos:
df_meteorites = pd.read_csv('./db/NASA/Meteorite_Landings.csv')

In [3]:
# Vemos los primeros N registros del dataframe:
df_meteorites.head(10)

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,01/01/1880 12:00:00 AM,50.775,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,01/01/1951 12:00:00 AM,56.18333,10.23333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,01/01/1952 12:00:00 AM,54.21667,-113.0,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,01/01/1976 12:00:00 AM,16.88333,-99.9,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,01/01/1902 12:00:00 AM,-33.16667,-64.95,"(-33.16667, -64.95)"
5,Adhi Kot,379,Valid,EH4,4239.0,Fell,01/01/1919 12:00:00 AM,32.1,71.8,"(32.1, 71.8)"
6,Adzhi-Bogdo (stone),390,Valid,LL3-6,910.0,Fell,01/01/1949 12:00:00 AM,44.83333,95.16667,"(44.83333, 95.16667)"
7,Agen,392,Valid,H5,30000.0,Fell,01/01/1814 12:00:00 AM,44.21667,0.61667,"(44.21667, 0.61667)"
8,Aguada,398,Valid,L6,1620.0,Fell,01/01/1930 12:00:00 AM,-31.6,-65.23333,"(-31.6, -65.23333)"
9,Aguila Blanca,417,Valid,L,1440.0,Fell,01/01/1920 12:00:00 AM,-30.86667,-64.55,"(-30.86667, -64.55)"


In [4]:
df_meteorites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45425 non-null  object 
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 3.5+ MB


In [5]:
df_meteorites.dtypes

name            object
id               int64
nametype        object
recclass        object
mass (g)       float64
fall            object
year            object
reclat         float64
reclong        float64
GeoLocation     object
dtype: object

In [6]:
# Convierte los tipos de datos a tipos más optimizados para pandas
df_meteorites.convert_dtypes().dtypes

name            string
id               Int64
nametype        string
recclass        string
mass (g)       Float64
fall            string
year            string
reclat         Float64
reclong        Float64
GeoLocation     string
dtype: object

In [7]:
# Extraemos las clases en la información que hay en cada variable (columna):
df_meteorites.nunique()

name           45716
id             45716
nametype           2
recclass         466
mass (g)       12576
fall               2
year             266
reclat         12738
reclong        14640
GeoLocation    17100
dtype: int64

In [8]:
# Extraemos las calses de información en una columna dada
df_meteorites['nametype'].unique()

array(['Valid', 'Relict'], dtype=object)

In [9]:
# Extrae la frecuencia (conteo) de cada clase de datos de una variable (columna) dada
df_meteorites[ 'nametype'].value_counts()

Valid     45641
Relict       75
Name: nametype, dtype: int64

In [10]:
# Visualizamos la variables de 2 clases de datos:
df_meteorites[ ['nametype' , 'fall' ] ]

Unnamed: 0,nametype,fall
0,Valid,Fell
1,Valid,Fell
2,Valid,Fell
3,Valid,Fell
4,Valid,Fell
...,...,...
45711,Valid,Found
45712,Valid,Found
45713,Valid,Found
45714,Valid,Found


In [11]:
# Convertimos las variables categóricas en variables numércias:
pd.get_dummies( df_meteorites['fall'] )

Unnamed: 0,Fell,Found
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
45711,0,1
45712,0,1
45713,0,1
45714,0,1


In [12]:
# Agregamos las nuevas columnas 'fell' y 'found' al dataframe 'df_meteorites'
df_meteorites[ ['fell','found'] ] = pd.get_dummies( df_meteorites['fall'] )

# Vemos los primeros N registrso del dataframe actualizado:
df_meteorites.head(10)

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,fell,found
0,Aachen,1,Valid,L5,21.0,Fell,01/01/1880 12:00:00 AM,50.775,6.08333,"(50.775, 6.08333)",1,0
1,Aarhus,2,Valid,H6,720.0,Fell,01/01/1951 12:00:00 AM,56.18333,10.23333,"(56.18333, 10.23333)",1,0
2,Abee,6,Valid,EH4,107000.0,Fell,01/01/1952 12:00:00 AM,54.21667,-113.0,"(54.21667, -113.0)",1,0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,01/01/1976 12:00:00 AM,16.88333,-99.9,"(16.88333, -99.9)",1,0
4,Achiras,370,Valid,L6,780.0,Fell,01/01/1902 12:00:00 AM,-33.16667,-64.95,"(-33.16667, -64.95)",1,0
5,Adhi Kot,379,Valid,EH4,4239.0,Fell,01/01/1919 12:00:00 AM,32.1,71.8,"(32.1, 71.8)",1,0
6,Adzhi-Bogdo (stone),390,Valid,LL3-6,910.0,Fell,01/01/1949 12:00:00 AM,44.83333,95.16667,"(44.83333, 95.16667)",1,0
7,Agen,392,Valid,H5,30000.0,Fell,01/01/1814 12:00:00 AM,44.21667,0.61667,"(44.21667, 0.61667)",1,0
8,Aguada,398,Valid,L6,1620.0,Fell,01/01/1930 12:00:00 AM,-31.6,-65.23333,"(-31.6, -65.23333)",1,0
9,Aguila Blanca,417,Valid,L,1440.0,Fell,01/01/1920 12:00:00 AM,-30.86667,-64.55,"(-30.86667, -64.55)",1,0


In [13]:
# Convertimos las variables anteriores al tipo 'category'
df_meteorites[ ['nametype' , 'fall' ] ] = df_meteorites[ ['nametype' , 'fall' ] ].astype('category')

In [14]:
# Verificamos que se hayan guardado los cambios
df_meteorites.dtypes

name             object
id                int64
nametype       category
recclass         object
mass (g)        float64
fall           category
year             object
reclat          float64
reclong         float64
GeoLocation      object
fell              uint8
found             uint8
dtype: object

In [15]:
df_meteorites['year']

0        01/01/1880 12:00:00 AM
1        01/01/1951 12:00:00 AM
2        01/01/1952 12:00:00 AM
3        01/01/1976 12:00:00 AM
4        01/01/1902 12:00:00 AM
                  ...          
45711    01/01/1990 12:00:00 AM
45712    01/01/1999 12:00:00 AM
45713    01/01/1939 12:00:00 AM
45714    01/01/2003 12:00:00 AM
45715    01/01/1976 12:00:00 AM
Name: year, Length: 45716, dtype: object

In [16]:
# Los valores de la variable 'year' inicialmente son de tipo 'object' (texto)
# así que los convertimos al tipo 'datetime'
pd.to_datetime(
    df_meteorites['year'],
    errors = 'coerce',
    format ='%m/%d/%Y %H:%M:%S %p'
    )

0       1880-01-01 12:00:00
1       1951-01-01 12:00:00
2       1952-01-01 12:00:00
3       1976-01-01 12:00:00
4       1902-01-01 12:00:00
                ...        
45711   1990-01-01 12:00:00
45712   1999-01-01 12:00:00
45713   1939-01-01 12:00:00
45714   2003-01-01 12:00:00
45715   1976-01-01 12:00:00
Name: year, Length: 45716, dtype: datetime64[ns]

In [17]:
df_meteorites['year'] = pd.to_datetime(
    df_meteorites['year'],
    errors = 'coerce',
    format ='%m/%d/%Y %H:%M:%S %p'
    )

# Verificamos que se hayan guardado los cambios
df_meteorites.dtypes

name                   object
id                      int64
nametype             category
recclass               object
mass (g)              float64
fall                 category
year           datetime64[ns]
reclat                float64
reclong               float64
GeoLocation            object
fell                    uint8
found                   uint8
dtype: object

In [18]:
# Cambiamos el nombre de la columna 'mass (g)' como 'mass':
df_meteorites.rename( columns={'mass (g)':'mass'} , inplace = True ) 

In [19]:
# Verificamos que se hayan guardado los cambios
df_meteorites.dtypes

name                   object
id                      int64
nametype             category
recclass               object
mass                  float64
fall                 category
year           datetime64[ns]
reclat                float64
reclong               float64
GeoLocation            object
fell                    uint8
found                   uint8
dtype: object