In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a small two-row frame by hand: one row per scientist, labelled by name.
column_order = ['Occupation', 'Born', 'Died', 'Age']
row_labels = ['Rosaline Franklin', 'William Gosset']
column_values = {
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Age': [37, 61],
}
# `columns` pins the left-to-right column order; `index` supplies the row labels.
scientists = pd.DataFrame(data=column_values, index=row_labels, columns=column_order)
print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


In [3]:
# Selecting a single row by its index label returns that row as a Series.
firstRow = scientists.loc['Rosaline Franklin']
print(firstRow, end='\n\n') # Left column: index labels (the frame's column names); right column: values.
print(type(firstRow)) # Python type of the object (a pandas Series), not its dtype.

Occupation       Chemist
Born          1920-07-25
Died          1958-04-16
Age                   37
Name: Rosaline Franklin, dtype: object

<class 'pandas.core.series.Series'>


In [4]:
# index is a Series attribute: the labels attached to the values.
print("Index \t", firstRow.index, end='\n\n')
# values is a Series attribute as well: the data without the labels.
print("Values \t", firstRow.values)

Index 	 Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

Values 	 ['Chemist' '1920-07-25' '1958-04-16' 37]


In [5]:
# The index attribute can be subset by position with square brackets; [0] is the first label.
print(firstRow.index[0])

Occupation


In [6]:
# The same positional subsetting works on the values attribute; [0] is the first value.
print(firstRow.values[0])

Chemist


In [7]:
# Example of a Series method: keys() returns the same labels as the index attribute.
print(firstRow.keys())

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [8]:
# Subset the result of the keys() method call with [0] to retrieve a single label.
# Both lines print the same label: one goes through the index attribute, the other through keys().
print(firstRow.index[0], end='\n\n')
print(firstRow.keys()[0])

Occupation

Occupation


In [9]:
# Series attribute: loc selects a value by its index label.
print("loc attribute - 'Occupation':", firstRow.loc["Occupation"])

loc attribute - 'Occupation': Chemist


In [10]:
# Attributes: iloc selects a value by integer position; T is the transpose.
print("iloc attribute - '0':", firstRow.iloc[0])
print("T attribute:\n", firstRow.T) # For this Series the transpose prints the same as the Series itself.
# There are many more attributes, but they are not listed here.

iloc attribute - '0': Chemist
T attribute:
 Occupation       Chemist
Born          1920-07-25
Died          1958-04-16
Age                   37
Name: Rosaline Franklin, dtype: object


In [11]:
# New data set: from here on `scientists` refers to the frame loaded from the
# CSV file, not the two-row frame built by hand earlier in the notebook.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
scientists_csv_path = 'C:/Data/scientists.csv'
scientists = pd.read_csv(scientists_csv_path)

In [12]:
# Basic descriptive statistics
# Selecting the single column 'Age' with square brackets gives a Series named Age.
ages = scientists['Age']
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [13]:
# describe() reports count, mean, std, min, the quartiles and max in a single call.
print(ages.describe())

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64


In [14]:
# I believe I tried this on a whole DataFrame once and it didn't work, so I ended up doing it on a single column (Series).
## I think R is more lenient when it comes to applying a function to an entire data frame object.
# NOTE(review): DataFrame.mean() does exist; on a frame with non-numeric columns it may need
# numeric_only=True, depending on the pandas version — confirm.
print(ages.mean())

59.125


In [15]:
# Boolean subsetting: a filter condition inside the subsetting brackets keeps only the ages above the mean.
print(ages[ages > ages.mean()]) # here we go, now we're getting into what I like: a filter clause in the subsetting brackets.

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [16]:
print(ages > ages.mean()) # Boolean Series: which values are True for the statement "age greater than the mean".
                          ## Compare to the filtered Series above: 4 values are True.

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool


In [17]:
# A hand-written Boolean list works as a filter too, one flag per element of ages.
manualBoolValues = [True, True, False, False, True, True, False, True]
print(ages[manualBoolValues]) # What's happening here is that every element
                              ## whose Boolean is True is retrieved; the False ones are dropped.

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64


In [18]:
# Vector arithmetic: the operation is applied element by element; base Python normally needs a loop to do this.
print(ages + ages) # element-wise addition

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [19]:
print(ages * ages) # element-wise multiplication (each age squared)

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


In [20]:
print(ages + 100) # Series plus a single (scalar) value
# The scalar 100 is recycled (broadcast) to every value in the Series (or vector).

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64


In [21]:
# Scalar multiplication: 2 is applied to every element, giving the same result as ages + ages.
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [22]:
# Series of different lengths: pandas pairs the two operands by index label
# (this is index alignment, not NumPy-style broadcasting).
# Labels 0 and 1 exist in both Series; every other label has no partner, so its result is NaN
# and the output is shown as float64.
print(ages + pd.Series([100, 99]))

0    137.0
1    160.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


In [23]:
# A plain NumPy array has no index labels to align on, so the operation falls
# back to NumPy broadcasting. Shapes (8,) and (2,) are not compatible, and a
# ValueError is raised.
# The error is the point of this cell, so it is caught and displayed rather
# than left to abort a "Restart Kernel -> Run All".
try:
    print(ages + np.array([100, 200]))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (8,) (2,) 

In [24]:
# Vectors and a common index: print ages again as a reference for the index-alignment cells that follow.
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [25]:
# Sorting with the sort_index method: ascending=False orders the rows by index label, highest first.
revAges = ages.sort_index(ascending=False)
print(revAges)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64


In [26]:
print(ages + revAges) # Series are aligned on their index labels, so although revAges is reversed,
# the arithmetic pairs values by index label, not by sort order ("automatically aligned").
# The result is therefore identical to ages + ages.

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64
