7.14 Kim Leach - Intro to data science: pandas series and dataframes
    * A Series is an enhanced one-dimensional array that supports custom indexing, including non-integer indices like strings
    * Series offer additional capabilities that make them more convenient
    * Series may have missing data and many operations ignore missing data by default

In [1]:
import pandas as pd

In [2]:
grades = pd.Series([87, 100, 94])

In [3]:
grades

0     87
1    100
2     94
dtype: int64

In [4]:
# a scalar initializer is repeated once per index label; range(3) supplies the labels 0, 1, 2
pd.Series(98.6, range(3))

0    98.6
1    98.6
2    98.6
dtype: float64

In [5]:
# with the default integer index, label 0 is the first element
grades[0]

87

In [8]:
grades.count()

3

In [9]:
grades.mean()

93.66666666666667

In [10]:
grades.min()

87

In [11]:
grades.max()

100

In [12]:
grades.std()

6.506407098647712

In [15]:
# 50% is the median of the sorted values; 25% and 75% are the first and third quartiles.
# pandas interpolates linearly between sorted values (25% here is halfway between 87 and 94),
# so with more elements the quartiles are not simply the medians of each half.
grades.describe()

count      3.000000
mean      93.666667
std        6.506407
min       87.000000
25%       90.500000
50%       94.000000
75%       97.000000
max      100.000000
dtype: float64

In [None]:
#creating a series with custom indices

In [16]:
# Series whose labels are student names instead of the default 0, 1, 2
grades = pd.Series(
    data=[87, 100, 94],
    index=['Wally', 'Eva', 'Sam'],
)

In [35]:
grades #Kim Leach

Wally     87
Eva      100
Sam       94
dtype: int64

In [18]:
# dict initializer: the keys become the index labels, the values become the data
grades = pd.Series(dict(Wally=87, Eva=100, Sam=94))

In [19]:
grades

Wally     87
Eva      100
Sam       94
dtype: int64

In [36]:
# Accessing elements of a series via custom indices - Kim Leach

grades['Eva']

100

In [37]:
# a string label that is a valid Python identifier can also be read as an attribute
grades.Wally #Kim Leach

87

In [22]:
grades.dtype

dtype('int64')

In [23]:
grades.values

array([ 87, 100,  94], dtype=int64)

In [24]:
# a Series can hold strings as well as numbers
hardware = pd.Series(data=['Hammer', 'Saw', 'Wrench'])
hardware

0    Hammer
1       Saw
2    Wrench
dtype: object

In [25]:
# the str attribute applies a string method to every element;
# here it tests each element for a lowercase 'a'
hardware.str.contains('a')

0     True
1     True
2    False
dtype: bool

In [26]:
hardware.str.upper()

0    HAMMER
1       SAW
2    WRENCH
dtype: object

In [28]:
import numpy as np

# Fix the seed so the same six "random" temperatures are produced on every
# Restart & Run All. Re-run this cell and the ones that follow to refresh
# the saved outputs, which came from an earlier unseeded run.
np.random.seed(11)

# six random integers from 60 through 100 (the upper bound 101 is exclusive)
temps = np.random.randint(60, 101, 6)

temperatures = pd.Series(temps)

temperatures

0    88
1    86
2    71
3    73
4    82
5    91
dtype: int32

In [30]:
temperatures.min()

71

In [32]:
temperatures.max()

91

In [33]:
temperatures.mean()

81.83333333333333

In [34]:
temperatures.describe()

count     6.000000
mean     81.833333
std       8.183316
min      71.000000
25%      75.250000
50%      84.000000
75%      87.500000
max      91.000000
dtype: float64

7.14.2 DataFrames: an enhanced two-dimensional array
    * can have custom row and column indices
    * offers additional operations and capabilities that make them more convenient
    * each column is a series, each series may contain different element types

In [38]:
import pandas as pd

In [40]:
# three test scores per student, keyed by student name
grades_dict = {
    'Wally': [87, 96, 70],
    'Eva': [100, 87, 90],
    'Sam': [94, 77, 90],
    'KimLeach': [100, 81, 82],
    'Bob': [83, 65, 85],
}

In [41]:
# build a DataFrame from the dict: each key becomes a column name, each list a column
# NOTE: this rebinds grades, which held a Series in the cells above
grades = pd.DataFrame(grades_dict)
grades

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
0,87,100,94,100,83
1,96,87,77,81,65
2,70,90,90,82,85


In [42]:
# Two equivalent ways to give the rows custom labels.
# 1) Pass index= when building the DataFrame. The result is bound to grades so
#    it is actually used, and so re-running this cell always starts again from
#    grades_dict.
grades = pd.DataFrame(grades_dict, index=['Test1', 'Test2', 'Test3'])

# 2) Assign new labels to the index attribute of an existing DataFrame.
grades.index = ['Test1', 'Test2', 'Test3']

grades

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [43]:
grades['Eva']

Test1    100
Test2     87
Test3     90
Name: Eva, dtype: int64

In [44]:
grades.KimLeach

Test1    100
Test2     81
Test3     82
Name: KimLeach, dtype: int64

In [45]:
#selecting rows via the loc and iloc attributes
#access a row by its label via the loc attribute

grades.loc['Test1']

Wally        87
Eva         100
Sam          94
KimLeach    100
Bob          83
Name: Test1, dtype: int64

In [46]:
#access rows by integer zero-based indices using the iloc attribute
grades.iloc[1]

Wally       96
Eva         87
Sam         77
KimLeach    81
Bob         65
Name: Test2, dtype: int64

In [47]:
#selecting rows via slices and lists with the loc and iloc attributes
#a loc slice uses labels and includes the end label ('Test3' is part of the result)

grades.loc['Test1':'Test3']

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [48]:
#an iloc slice uses integer positions and excludes the end position (rows 0 and 1 only)
grades.iloc[0:2]

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65


In [49]:
grades.loc[['Test1', 'Test3']]

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test1,87,100,94,100,83
Test3,70,90,90,82,85


In [50]:
grades.iloc[[0, 2]]

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test1,87,100,94,100,83
Test3,70,90,90,82,85


In [51]:
#selecting subsets of the rows and columns
#rows: label slice 'Test1' through 'Test2'; columns: an explicit list of labels
grades.loc['Test1':'Test2', ['Eva', 'KimLeach']]

Unnamed: 0,Eva,KimLeach
Test1,100,100
Test2,87,81


In [52]:
#rows at positions 0 and 2; columns at positions 0 through 2 (end position 3 is excluded)
grades.iloc[[0, 2], 0:3]

Unnamed: 0,Wally,Eva,Sam
Test1,87,100,94
Test3,70,90,90


In [53]:
#boolean indexing
#keeps only the grades that are >= 90; every other cell is shown as a missing value

grades[grades >= 90]

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test1,,100.0,94.0,100.0,
Test2,96.0,,,,
Test3,,90.0,90.0,,


In [54]:
#grades in the range 80-89: two conditions combined element-wise with &,
#each condition wrapped in its own parentheses
grades[(grades >= 80) & (grades < 90)]

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test1,87.0,,,,83.0
Test2,,87.0,,81.0,
Test3,,,,82.0,85.0


In [56]:
#accessing a specific dataframe cell by row and column
#at takes the row label and the column label

grades.at['Test2', 'Eva']

87

In [57]:
#iat takes integer positions instead: row 2, column 0 is Wally's Test3 grade
grades.iat[2, 0]

70

In [64]:
#at can also be the target of an assignment; this changes grades in place
#(Eva's Test2 grade was 87 before this cell)
grades.at['Test2', 'Eva'] = 100

In [65]:
grades.at['Test2', 'Eva']

100

In [70]:
#iat assignment by position: row 1, column 2 is Sam's Test2 grade (77 before this cell)
grades.iat[1, 2] = 87

In [71]:
grades.iat[1, 2]

87

In [72]:
# descriptive statistics
grades.describe()

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
count,3.0,3.0,3.0,3.0,3.0
mean,84.333333,96.666667,90.333333,87.666667,77.666667
std,13.203535,5.773503,3.511885,10.692677,11.015141
min,70.0,90.0,87.0,81.0,65.0
25%,78.5,95.0,88.5,81.5,74.0
50%,87.0,100.0,90.0,82.0,83.0
75%,91.5,100.0,92.0,91.0,84.0
max,96.0,100.0,94.0,100.0,85.0


In [78]:
# Show floating-point values with 2 digits after the decimal point.
# The full option name 'display.precision' is needed: leaving off the
# 'display.' prefix raised an OptionError.
pd.set_option('display.precision', 2)

In [77]:
grades.describe()

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
count,3.0,3.0,3.0,3.0,3.0
mean,84.33,96.67,90.33,87.67,77.67
std,13.2,5.77,3.51,10.69,11.02
min,70.0,90.0,87.0,81.0,65.0
25%,78.5,95.0,88.5,81.5,74.0
50%,87.0,100.0,90.0,82.0,83.0
75%,91.5,100.0,92.0,91.0,84.0
max,96.0,100.0,94.0,100.0,85.0


In [79]:
grades.mean()

Wally       84.33
Eva         96.67
Sam         90.33
KimLeach    87.67
Bob         77.67
dtype: float64

In [80]:
#transposing the dataframe with the T attribute
grades.T

Unnamed: 0,Test1,Test2,Test3
Wally,87,96,70
Eva,100,100,90
Sam,94,87,90
KimLeach,100,81,82
Bob,83,65,85


In [81]:
#describing the transposed dataframe gives the statistics per test instead of per student
grades.T.describe()

Unnamed: 0,Test1,Test2,Test3
count,5.0,5.0,5.0
mean,92.8,85.8,83.4
std,7.66,13.81,8.23
min,83.0,65.0,70.0
25%,87.0,81.0,82.0
50%,94.0,87.0,85.0
75%,100.0,96.0,90.0
max,100.0,100.0,90.0


In [82]:
#average of all students' grades on each test
grades.T.mean()

Test1    92.8
Test2    85.8
Test3    83.4
dtype: float64

In [83]:
#sorting rows by their indices
grades.sort_index(ascending=False)

Unnamed: 0,Wally,Eva,Sam,KimLeach,Bob
Test3,70,90,90,82,85
Test2,96,100,87,81,65
Test1,87,100,94,100,83


In [84]:
#sorting by column indices
grades.sort_index(axis=1)

Unnamed: 0,Bob,Eva,KimLeach,Sam,Wally
Test1,83,100,100,94,87
Test2,65,100,81,87,96
Test3,85,90,82,90,70


In [85]:
#sorting by column values
#axis=1 reorders the columns, using the values in row 'Test1', highest first
grades.sort_values(by='Test1', axis=1, ascending=False)

Unnamed: 0,Eva,KimLeach,Sam,Wally,Bob
Test1,100,100,94,87,83
Test2,100,81,87,96,65
Test3,90,82,90,70,85


In [86]:
#same ordering listed down the page: transpose, then sort the rows by the 'Test1' column
grades.T.sort_values(by='Test1', ascending=False)

Unnamed: 0,Test1,Test2,Test3
Eva,100,100,90
KimLeach,100,81,82
Sam,94,87,90
Wally,87,96,70
Bob,83,65,85


In [87]:
#sort only the 'Test1' row (a Series with one grade per student), highest first
grades.loc['Test1'].sort_values(ascending=False)

Eva         100
KimLeach    100
Sam          94
Wally        87
Bob          83
Name: Test1, dtype: int64

In [90]:
# two temperature readings per weekday, keyed by day name
temps = {
    'Mon': [68, 89],
    'Tue': [71, 93],
    'Wed': [66, 82],
    'Thu': [75, 97],
    'Fri': [62, 79],
}

In [93]:
# rows are labeled 'Low' and 'High'; the dict keys (weekdays) become the columns
# NOTE: this rebinds temperatures, which held a Series of random values earlier in the notebook
temperatures = pd.DataFrame(temps, index=['Low', 'High'])

In [94]:
temperatures

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Low,68,71,66,75,62
High,89,93,82,97,79


In [95]:
# all rows (:) and the columns from 'Mon' through 'Wed', end label included
temperatures.loc[:, 'Mon':'Wed']

Unnamed: 0,Mon,Tue,Wed
Low,68,71,66
High,89,93,82


In [96]:
temperatures.loc['Low']

Mon    68
Tue    71
Wed    66
Thu    75
Fri    62
Name: Low, dtype: int64

In [97]:
# display floats with 2 digits of precision (the same option is already set by an earlier cell)
pd.set_option('display.precision', 2)
# mean() with no arguments averages each column: one value per weekday
temperatures.mean()

Mon    78.5
Tue    82.0
Wed    74.0
Thu    86.0
Fri    70.5
dtype: float64

In [98]:
# axis=1 averages across the columns, giving one value per row (Low and High)
temperatures.mean(axis=1)

Low     68.4
High    88.0
dtype: float64