In [0]:
from google.colab import drive
drive.mount('/gdrive')

In [11]:
%cd /gdrive/My\ Drive/Colab\ Notebooks/db

/gdrive/My Drive/Colab Notebooks/db


In [0]:
!wget https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD
!mv 'rows.csv?accessType=DOWNLOAD' meteorite.csv

In [0]:
import pandas as pd
import numpy as np

In [25]:
df_meteorites = pd.read_csv('meteorite.csv')
df_meteorites.sample(5)

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
6783,Daraj 112,6571,Valid,L5/6,215.0,Found,01/01/1986 12:00:00 AM,29.66472,11.77639,"(29.66472, 11.77639)"
11932,Frontier Mountain 01169,10351,Valid,H6,6.2,Found,01/01/2001 12:00:00 AM,-72.95278,160.52222,"(-72.95278, 160.52222)"
34434,Queen Alexandra Range 97800,21255,Valid,H6,126.5,Found,01/01/1997 12:00:00 AM,-84.0,168.0,"(-84.0, 168.0)"
19172,Larkman Nunatak 06327,49591,Valid,LL6,286.1,Found,01/01/2006 12:00:00 AM,0.0,0.0,"(0.0, 0.0)"
35504,Queen Alexandra Range 99877,22321,Valid,LL5,16.8,Found,01/01/1999 12:00:00 AM,-84.0,168.0,"(-84.0, 168.0)"


## Data Loading
---

In [26]:
df_meteorites.shape

(45716, 10)

In [27]:
df_meteorites.describe()

Unnamed: 0,id,mass (g),reclat,reclong
count,45716.0,45585.0,38401.0,38401.0
mean,26889.735104,13278.08,-39.12258,61.074319
std,16860.68303,574988.9,46.378511,80.647298
min,1.0,0.0,-87.36667,-165.43333
25%,12688.75,7.2,-76.71424,0.0
50%,24261.5,32.6,-71.5,35.66667
75%,40656.75,202.6,0.0,157.16667
max,57458.0,60000000.0,81.16667,354.47333


In [28]:
df_meteorites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45425 non-null  object 
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 3.5+ MB


In [29]:
df_meteorites.dtypes

name            object
id               int64
nametype        object
recclass        object
mass (g)       float64
fall            object
year            object
reclat         float64
reclong        float64
GeoLocation     object
dtype: object

In [30]:
# Each column now uses the best dtype for its data
df_meteorites.convert_dtypes().dtypes

name            string
id               Int64
nametype        string
recclass        string
mass (g)       float64
fall            string
year            string
reclat         float64
reclong        float64
GeoLocation     string
dtype: object

## Pandas utilities
---

### Categories

In [0]:
df_meteorites[['nametype', 'fall']] = df_meteorites[['nametype', 'fall']].astype('category')

In [35]:
df_meteorites.dtypes

name             object
id                int64
nametype       category
recclass         object
mass (g)        float64
fall           category
year             object
reclat          float64
reclong         float64
GeoLocation      object
dtype: object

In [36]:
df_meteorites['fall'].unique()

[Fell, Found]
Categories (2, object): [Fell, Found]

In [37]:
df_meteorites['fall'].value_counts()

Found    44609
Fell      1107
Name: fall, dtype: int64

### get_dummies

In [0]:
df_meteorites[['fell', 'found']] = pd.get_dummies(df_meteorites['fall'])

In [44]:
df_meteorites.sample()

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,fell,found
21675,Lookout Hill,14697,Valid,CM2,16.6,Found,01/01/1976 12:00:00 AM,-30.09056,128.80583,"(-30.09056, 128.80583)",0,1


### DateTimes

In [0]:
df_meteorites['year'] = pd.to_datetime(
    df_meteorites['year'], 
    errors='coerce', 
    format='%m/%d/%Y %H:%M:%S %p'
  )

In [47]:
df_meteorites.dtypes

name                   object
id                      int64
nametype             category
recclass               object
mass (g)              float64
fall                 category
year           datetime64[ns]
reclat                float64
reclong               float64
GeoLocation            object
fell                    uint8
found                   uint8
dtype: object

### Rename

In [0]:
df_meteorites.rename({'mass (g)': 'mass'}, axis='columns', inplace=True)

In [58]:
df_meteorites.columns

Index(['name', 'id', 'nametype', 'recclass', 'mass', 'fall', 'year', 'reclat',
       'reclong', 'GeoLocation', 'fell', 'found'],
      dtype='object')

## Borrar Datos
---

### Drop

In [61]:
# List of rows/columns names and axis
df_meteorites['ones'] = 1
df_meteorites.drop(['ones'], axis=1, inplace=True)
df_meteorites.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01 12:00:00,50.775,6.08333,"(50.775, 6.08333)",1,0
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01 12:00:00,56.18333,10.23333,"(56.18333, 10.23333)",1,0
2,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01 12:00:00,54.21667,-113.0,"(54.21667, -113.0)",1,0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01 12:00:00,16.88333,-99.9,"(16.88333, -99.9)",1,0
4,Achiras,370,Valid,L6,780.0,Fell,1902-01-01 12:00:00,-33.16667,-64.95,"(-33.16667, -64.95)",1,0


In [0]:
# List of rows or/and columns
df_meteorites.drop(columns=['id', 'recclass'], index=[0, 2, 4, 6], inplace=True)

### Copy

In [0]:
df_meteorites_copy = df_meteorites.copy()