# Pandas data structures

In [1]:
import pandas as pd

## Creating your own data

### Creating a Series

In [2]:
# A Series built from a plain list gets default integer labels (0, 1, ...).
# Mixing a string and an integer gives the generic "object" dtype.
s = pd.Series(data=['banana', 42])
print(s)

0    banana
1        42
dtype: object


In [3]:
# Passing `index` replaces the default 0, 1, ... labels with custom ones.
s = pd.Series(
    data=['Wes McKinney', 'Creator of Pandas'],
    index=['Person', 'Who'],
)
print(s)

Person         Wes McKinney
Who       Creator of Pandas
dtype: object


### Creating a DataFrame

In [4]:
# Each dict key becomes a column label and each list supplies that column's
# values. No index is given, so the rows are labelled 0 and 1.
scientists = pd.DataFrame(
    data={
        'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    }
)
print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


In [5]:
# Same records as before, but the scientists' names are used as the row
# labels (index) instead of being a column; `columns` spells out the
# column order explicitly.
scientists = pd.DataFrame(
    data={
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
    columns=['Occupation', 'Born', 'Died', 'Age'],
    index=['Rosaline Franklin', 'William Gosset'],
)
print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


## The Series

In [6]:
# Selecting a single row by its label with .loc returns that row as a Series.
# NOTE(review): 'William Gosset' is the *second* row of `scientists`, so the
# name `first_row` is misleading; it is kept because later cells refer to it.
first_row = scientists.loc['William Gosset']
print(type(first_row))

<class 'pandas.core.series.Series'>


In [7]:
# The DataFrame's column labels become the index of the row Series, and the
# row label becomes the Series' name.
print(first_row)

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


In [8]:
# .index holds the labels of the Series (here, the DataFrame's column names).
print(first_row.index)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [9]:
# .values exposes the underlying data as an array, without the labels.
print(first_row.values)

['Statistician' '1876-06-13' '1937-10-16' 61]


In [10]:
# The index can be subset by position: [0] is its first label.
print(first_row.index[0])

Occupation


### The Series is ndarray-like

#### Series methods

In [11]:
# Selecting one column with [] returns it as a Series named after the column,
# indexed by the DataFrame's row labels.
ages = scientists['Age']
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [12]:
# Summary statistics are available as Series methods; print one per line
# in the order mean, min, max, standard deviation.
print(ages.mean(), ages.min(), ages.max(), ages.std(), sep='\n')

49.0
37
61
16.97056274847714


### Boolean subsetting Series

In [13]:
# describe() bundles count, mean, std, min, quartiles and max into a single
# Series. The mean shown here is the threshold used for the boolean
# subsetting in the following cells.
print(ages.describe())

count     2.000000
mean     49.000000
std      16.970563
min      37.000000
25%      43.000000
50%      49.000000
75%      55.000000
max      61.000000
Name: Age, dtype: float64


In [14]:
# Keep only the entries whose age is above the mean: the comparison builds a
# boolean mask, and passing that mask to [] selects the True entries.
print(ages[ages > ages.mean()])

William Gosset    61
Name: Age, dtype: int64


In [15]:
# On its own, the comparison is evaluated element by element and yields one
# True/False per label.
print(ages > ages.mean())

Rosaline Franklin    False
William Gosset        True
Name: Age, dtype: bool


In [16]:
# The result of the comparison is itself a Series.
print(type(ages > ages.mean()))

<class 'pandas.core.series.Series'>


In [17]:
# A hand-written list of booleans can be used as the mask as well;
# [True, True] keeps both entries.
manual_bool_values = [True, True]
print(ages[manual_bool_values])

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


### Operations are vectorized

#### Vectors of the same length

In [18]:
# Adding two Series with the same labels works element by element.
print(ages + ages)

Rosaline Franklin     74
William Gosset       122
Name: Age, dtype: int64


#### Vectors with integers (scalars)

In [19]:
# A scalar operand is applied to every element.
print(ages + 100)

Rosaline Franklin    137
William Gosset       161
Name: Age, dtype: int64


In [20]:
# Multiplying by a scalar scales every element.
print(ages * 2)

Rosaline Franklin     74
William Gosset       122
Name: Age, dtype: int64


#### Vectors with different lengths (or mismatched index labels)

In [21]:
# NOTE: both operands hold two elements here. Every result is NaN because the
# operation pairs values by index label, and `ages` (labelled by name) shares
# no label with the new Series (default labels 0 and 1).
print(ages + pd.Series([1, 100]))

Rosaline Franklin   NaN
William Gosset      NaN
0                   NaN
1                   NaN
dtype: float64


#### Vectors with common index labels

In [22]:
# Original order, shown for comparison with the reversed copy built next.
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [23]:
# Sorting the index labels in descending order reverses the two rows.
rev_ages = ages.sort_index(ascending=False)
print(rev_ages)

William Gosset       61
Rosaline Franklin    37
Name: Age, dtype: int64


In [24]:
# Reference result: each age doubled.
print(ages * 2)

Rosaline Franklin     74
William Gosset       122
Name: Age, dtype: int64


In [25]:
# Adding the reversed Series gives the same values as ages * 2: elements are
# matched by index label, not by position.
print(ages + rev_ages)

Rosaline Franklin     74
William Gosset       122
Name: Age, dtype: int64


## The DataFrame

### Boolean subsetting DataFrame

In [26]:
# The same boolean-mask pattern applied to a DataFrame keeps the whole rows
# for which the condition is True.
print(scientists[scientists['Age'] > scientists['Age'].mean()])

                  Occupation        Born        Died  Age
William Gosset  Statistician  1876-06-13  1937-10-16   61


### Operations are automatically aligned and vectorized

## Making changes to Series and DataFrames

### Add additional columns