## **Librerías**

In [1]:
import pandas as pd

## **Datos**

In [2]:
# Baseline load: read the whole vt_tax_data_2016.csv file with pandas defaults
# (every row and every column).
tax_data = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv'
)

In [3]:
# Size of the fully loaded frame as (rows, columns).
tax_data.shape

(1476, 147)

### **1. Limitando el número de columnas**

In [4]:
# Two equivalent ways of designating the same five columns for `usecols`:
# by header label ...
col_names = [
    'STATEFIPS',
    'STATE',
    'zipcode',
    'agi_stub',
    'N1',
]
# ... and by zero-based column position.
col_numbers = list(range(5))

In [5]:
# Load only the columns whose header labels appear in `col_names`.
tax_data_v1 = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    usecols=col_names,
)

In [6]:
# Load only the columns at the zero-based positions listed in `col_numbers`.
tax_data_v2 = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    usecols=col_numbers,
)

In [7]:
# Verify that selecting columns by label and by position produced identical frames.
tax_data_v1.equals(tax_data_v2)

True

### **2. Limitando el número de filas**

In [8]:
# Cap the load at the first 100 data rows; all columns are kept.
tax_data_v3 = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    nrows=100,
)

In [9]:
# Confirm the row limit took effect: shape is (rows, columns).
tax_data_v3.shape

(100, 147)

In [10]:
# Skip the first 50 lines of the file (the header line is one of them) and
# read the following 100 lines as data. header=None tells pandas not to
# promote any line to a header, so the columns get integer labels.
tax_data_v4 = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    header=None,
    skiprows=50,
    nrows=100,
)

# Preview the first two rows of the result.
tax_data_v4.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,137,138,139,140,141,142,143,144,145,146
0,50,VT,5037,2,100,60,20,40,50,160,...,90,336,0,0,0,0,0,0,80,151
1,50,VT,5037,3,70,30,30,0,30,120,...,60,349,0,0,0,0,20,53,50,137


### **3. Ajustando el encabezado de la tabla**

In [11]:
# Column labels taken from the fully loaded frame. They are kept under their
# own name rather than rebinding `col_names`: that variable holds the
# five-column subset used by the `usecols` cells, and overwriting it here
# would make those cells load every column if they were re-run afterwards.
all_col_names = list(tax_data.columns)

# Skip the first 50 lines (header line included), read the next 100 lines as
# data, and supply the column labels explicitly through `names`, since the
# header line itself was skipped.
tax_data_v5 = pd.read_csv(
    'https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    nrows=100,
    skiprows=50,
    header=None,
    names=all_col_names
)

# Preview the first two rows; columns now carry their original labels.
tax_data_v5.head(2)

Unnamed: 0,STATEFIPS,STATE,zipcode,agi_stub,N1,mars1,MARS2,MARS4,PREP,N2,...,N10300,A10300,N85530,A85530,N85300,A85300,N11901,A11901,N11902,A11902
0,50,VT,5037,2,100,60,20,40,50,160,...,90,336,0,0,0,0,0,0,80,151
1,50,VT,5037,3,70,30,30,0,30,120,...,60,349,0,0,0,0,20,53,50,137


### **4. Ajustando el tipo de los datos**

In [14]:
# Inspect the dtypes pandas inferred on its own for the five-column frame.
tax_data_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1476 entries, 0 to 1475
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   STATEFIPS  1476 non-null   int64 
 1   STATE      1476 non-null   object
 2   zipcode    1476 non-null   int64 
 3   agi_stub   1476 non-null   int64 
 4   N1         1476 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 57.8+ KB


In [17]:
# Columns to load, by header label.
col_names = [
    'STATEFIPS',
    'STATE',
    'zipcode',
    'agi_stub',
    'N1',
]

# Override dtype inference for zipcode so it is read as text; the remaining
# columns keep their inferred types.
tax_data_v6 = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    dtype={'zipcode': str},
    usecols=col_names,
)

In [18]:
# Check the resulting dtypes after forcing zipcode to str.
tax_data_v6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1476 entries, 0 to 1475
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   STATEFIPS  1476 non-null   int64 
 1   STATE      1476 non-null   object
 2   zipcode    1476 non-null   object
 3   agi_stub   1476 non-null   int64 
 4   N1         1476 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 57.8+ KB


### **5. Personalizando los valores faltantes**

In [26]:
# Rows whose zipcode is missing, selected with a boolean mask.
tax_data_v6.loc[tax_data_v6['zipcode'].isna()]

Unnamed: 0,STATEFIPS,STATE,zipcode,agi_stub,N1


In [20]:
# Columns to load, by header label.
col_names = [
    'STATEFIPS',
    'STATE',
    'zipcode',
    'agi_stub',
    'N1',
]

# Per-column missing-value marker: a 0 in the zipcode column is read as NaN.
tax_data_v7 = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    na_values={'zipcode': 0},
    usecols=col_names,
)

In [27]:
# Rows whose zipcode was turned into a missing value, selected with a boolean mask.
tax_data_v7.loc[tax_data_v7['zipcode'].isna()]

Unnamed: 0,STATEFIPS,STATE,zipcode,agi_stub,N1
0,50,VT,,1,111580
1,50,VT,,2,82760
2,50,VT,,3,46270
3,50,VT,,4,30070
4,50,VT,,5,39530
5,50,VT,,6,9620


### **6. Manejando registros con errores**

In [36]:
# on_bad_lines decides what happens when a line has too many fields.
# It accepts 'skip', 'warn' and 'error'; here a warning is requested.
tax_data_v8 = pd.read_csv(
    filepath_or_buffer='https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv',
    on_bad_lines='warn',
)