In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.__version__

'2.0.1'

# SERIES

In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
data[1]

0.5

In [7]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [8]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
data['b']

0.5

In [10]:
data[1]

0.5

In [11]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [12]:
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135
}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [13]:
population['California']

38332521

In [14]:
population[0]

38332521

In [15]:
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [16]:
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [17]:
pd.Series(100, 200, 300, index=[5])

TypeError: Series.__init__() got multiple values for argument 'index'

In [18]:
data_dict = {
    2:'a', 
    1:'b', 
    3:'c'
}
pd.Series(data_dict)

2    a
1    b
3    c
dtype: object

In [19]:
serie = pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])
serie

3    c
2    a
dtype: object

In [20]:
serie.index

Index([3, 2], dtype='int64')

# DATA FRAMES

In [21]:
population  # serie

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [22]:
area_dict = {'California': 423967, 
             'Texas': 695662, 
             'New York': 141297,
             'Florida': 170312, 
             'Illinois': 149995
            }
area = pd.Series(area_dict)  # serie
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [23]:
states = pd.DataFrame({'population': population, 'area': area})  # dataframe
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [24]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [25]:
states.columns

Index(['population', 'area'], dtype='object')

In [26]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [27]:
type(states['area'])

pandas.core.series.Series

In [28]:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [29]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [30]:
serie1 = {
    'a': 1, 
    'b': 2
}
serie2 = {
    'b': 3, 
    'c': 4
}
pd.DataFrame([serie1, serie2])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [31]:
pd.DataFrame(np.random.rand(3, 2), index=['a', 'b', 'c'], columns=['foo', 'bar'])

Unnamed: 0,foo,bar
a,0.969428,0.036852
b,0.12521,0.458722
c,0.150587,0.976619


# INDICES

In [32]:
idx = pd.Index([2, 3, 5, 7, 11])
idx

Index([2, 3, 5, 7, 11], dtype='int64')

In [33]:
print(idx[0])
print(idx[1])

2
3


In [34]:
# Atributos de un índice
print(idx.size, idx.shape, idx.ndim, idx.dtype)

5 (5,) 1 int64


 La principal diferencia entre los índices de pandas y los arrays de numpy es que 
los índices no pueden modificarse:

 idx[0] = 99  # Devuelve TypeError: Index does not support mutable operations

In [35]:
idx = pd.Index(['a', 'b', 'c'])
df = pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=idx)
df

Unnamed: 0,foo,bar
a,0.189291,0.63107
b,0.372226,0.056212
c,0.925694,0.29469


In [36]:
df.index

Index(['a', 'b', 'c'], dtype='object')

In [38]:
# Un Index actúa como un conjunto (set) de python
idx_a = pd.Index([1, 3, 5, 7, 9])
idx_b = pd.Index([2, 3, 5, 7, 11])

In [39]:
idx_a & idx_b  # intersección

Index([0, 3, 5, 7, 9], dtype='int64')

In [40]:
idx_a | idx_b  # union

Index([3, 3, 5, 7, 11], dtype='int64')

SELECCCIONAR ELEMENTOS

In [41]:
# SERIES

data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [42]:
type(data)

pandas.core.series.Series

In [43]:
data['b']

0.5

In [44]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [45]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [46]:
data['e'] = 1.25  # Añadir un elemento
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [47]:
data['e'] = 2.25  # Modificar un elemento
data

a    0.25
b    0.50
c    0.75
d    1.00
e    2.25
dtype: float64

In [48]:
# slicing por índice explícito
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [49]:
# slicing por índice implítico
data[0:2]

a    0.25
b    0.50
dtype: float64

In [50]:
# masking
data[data >= 1.0]

d    1.00
e    2.25
dtype: float64

In [51]:
mask = (data >= 1.0)
data[mask]

d    1.00
e    2.25
dtype: float64

In [52]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [53]:
# fancy indexing
data[['a', 'e']]

a    0.25
e    2.25
dtype: float64

In [54]:
# fancy indexing
data[['e', 'a']]

e    2.25
a    0.25
dtype: float64

LOC

In [55]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [56]:
# loc utiliza el índice explícito, es decir, el que se ha asignado en el parámetro index
data.loc[1]

'a'

In [57]:
data.loc[1:3]

1    a
3    b
dtype: object

ILOC

In [58]:
# iloc utiliza el índice implítico, es decir, el número de elementos que haya empezando en 0
data.iloc[1]

'b'

In [59]:
data.iloc[1:3]

3    b
5    c
dtype: object

DATAFRAME

In [60]:
area = pd.Series({
    'California': 423967, 
    'Texas': 695662,
    'New York': 141297, 
    'Florida': 170312,
    'Illinois': 149995})
pop = pd.Series({
    'California': 38332521, 
    'Texas': 26448193,
    'New York': 19651127, 
    'Florida': 19552860,
    'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [61]:
data['area']  # Utilizar esta forma en vez de data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [62]:
data['pop']['Texas']

26448193

In [63]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [64]:
df = pd.read_csv("../../data/tips.csv")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [65]:
df.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [66]:
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [67]:
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [69]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [70]:
df = pd.read_csv ("../../data/tips.csv", dtype={'size': 'Int8'})
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [71]:
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size             Int8
dtype: object