In [85]:
import numpy as np
import pandas as pd

In [86]:
# Build a Series from a plain list, labelling the entries explicitly via `index`.
pd.Series([1, 2, 3], index=['a', 'b', 'c'])

a    1
b    2
c    3
dtype: int64

In [87]:
# Build a Series from a one-dimensional NumPy array, again with explicit labels.
pd.Series(np.array([1, 2, 3]), index=['a', 'b', 'c'])


a    1
b    2
c    3
dtype: int32

In [88]:
# Build a Series from a dict: the keys become the index labels.
pd.Series({'a': 1, 'b': 2, 'c':3})


a    1
b    2
c    3
dtype: int64

In [89]:
# Create a labelled Series, then look up a single element by its label.
series = pd.Series(data={'a': 1, 'b': 2, 'c': 3})
series.loc['a']


1

In [90]:
# Column-oriented input: each dict key becomes a column of the DataFrame.
wine_dict = {
    'red_wine': [3, 6, 5],
    'white_wine': [5, 0, 10],
}

# One row per person; the row labels are supplied through `index`.
sales = pd.DataFrame(data=wine_dict, index=["adam", "bob", "charles"])

# Selecting one column with single brackets yields a Series.
sales['white_wine']


adam        5
bob         0
charles    10
Name: white_wine, dtype: int64

In [91]:
# Display the whole (three-row) DataFrame.
sales

Unnamed: 0,red_wine,white_wine
adam,3,5
bob,6,0
charles,5,10


In [92]:
# Load the presidents dataset directly from its URL and use the `name`
# column as the row index.
presidents_df = pd.read_csv(
    'https://sololearn.com/uploads/files/president_heights_party.csv',
    index_col='name',
)

# Preview the first seven rows.
presidents_df.head(n=7)

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
George Washington,1,57,189,none
John Adams,2,61,170,federalist
Thomas Jefferson,3,57,189,democratic-republican
James Madison,4,57,163,democratic-republican
James Monroe,5,58,183,democratic-republican
John Quincy Adams,6,57,171,democratic-republican
Andrew Jackson,7,61,185,democratic


In [93]:
# .shape is a (rows, columns) tuple.
print(presidents_df.shape)
# There are 45 rows and 4 columns in this DataFrame

# .size is the total number of cells, i.e. rows * columns (45 * 4 = 180).
print(presidents_df.size)


(45, 4)
180


In [94]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
presidents_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, George Washington to Donald J. Trump
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   order   45 non-null     int64 
 1   age     45 non-null     int64 
 2   height  45 non-null     int64 
 3   party   45 non-null     object
dtypes: int64(3), object(1)
memory usage: 1.8+ KB


In [95]:
# Selecting a single row by its index label returns a Series with one entry
# per column (4 here).
print(type(presidents_df.loc['Abraham Lincoln']))
print(presidents_df.loc['Abraham Lincoln'].shape)

<class 'pandas.core.series.Series'>
(4,)


In [96]:
# .loc[ ] allows us to select data by label or by a conditional statement.
# A label-based slice includes BOTH endpoints: the rows from 'Abraham Lincoln'
# through 'Ulysses S. Grant' are all returned.
presidents_df.loc['Abraham Lincoln':'Ulysses S. Grant']

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abraham Lincoln,16,52,193,republican
Andrew Johnson,17,56,178,national union
Ulysses S. Grant,18,46,173,republican


In [97]:
# .iloc[ ] selects by integer position; the stop position is excluded, so this
# returns the rows at positions 15, 16 and 17 (Lincoln, Johnson, Grant).
# Both .loc[ ] and .iloc[ ] may be used with a boolean array to subset the data.
presidents_df.iloc[15:18]

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abraham Lincoln,16,52,193,republican
Andrew Johnson,17,56,178,national union
Ulysses S. Grant,18,46,173,republican


In [98]:
# The column labels of the DataFrame.
presidents_df.columns

Index(['order', 'age', 'height', 'party'], dtype='object')

In [99]:
# The column labels are stored in a pandas Index object.
print(type(presidents_df.columns))

<class 'pandas.core.indexes.base.Index'>


In [100]:
# Selecting one column with single brackets returns a Series indexed by the
# presidents' names; its shape is printed afterwards.
print(presidents_df['height'])
print(presidents_df['height'].shape)

name
George Washington         189
John Adams                170
Thomas Jefferson          189
James Madison             163
James Monroe              183
John Quincy Adams         171
Andrew Jackson            185
Martin Van Buren          168
William Henry Harrison    173
John Tyler                183
James K. Polk             173
Zachary Taylor            173
Millard Fillmore          175
Franklin Pierce           178
James Buchanan            183
Abraham Lincoln           193
Andrew Johnson            178
Ulysses S. Grant          173
Rutherford B. Hayes       174
James A. Garfield         183
Chester A. Arthur         183
Grover Cleveland          180
Benjamin Harrison         168
Grover Cleveland          180
William McKinley          170
Theodore Roosevelt        178
William Howard Taft       182
Woodrow Wilson            180
Warren G. Harding         183
Calvin Coolidge           178
Herbert Hoover            182
Franklin D. Roosevelt     188
Harry S. Truman           175
Dwigh

In [101]:
# To select multiple columns, pass the names in a list; the result is a
# DataFrame. .head(n=3) then limits the output to the first 3 rows:
print(presidents_df[['height','age']].head(n=3))

                   height  age
name                          
George Washington     189   57
John Adams            170   61
Thomas Jefferson      189   57


In [102]:
# When accessing a single column, single brackets return a Series (one
# dimension) while double brackets return a DataFrame (two dimensions).

presidents_df['height']

name
George Washington         189
John Adams                170
Thomas Jefferson          189
James Madison             163
James Monroe              183
John Quincy Adams         171
Andrew Jackson            185
Martin Van Buren          168
William Henry Harrison    173
John Tyler                183
James K. Polk             173
Zachary Taylor            173
Millard Fillmore          175
Franklin Pierce           178
James Buchanan            183
Abraham Lincoln           193
Andrew Johnson            178
Ulysses S. Grant          173
Rutherford B. Hayes       174
James A. Garfield         183
Chester A. Arthur         183
Grover Cleveland          180
Benjamin Harrison         168
Grover Cleveland          180
William McKinley          170
Theodore Roosevelt        178
William Howard Taft       182
Woodrow Wilson            180
Warren G. Harding         183
Calvin Coolidge           178
Herbert Hoover            182
Franklin D. Roosevelt     188
Harry S. Truman           175
Dwigh

In [103]:
# Height of the second president, selected by integer POSITION.
# The Series is labelled by president names, so a bare integer key
# (`series[1]`) is not a label; pandas only falls back to positional lookup
# for backward compatibility, and that fallback is deprecated. `.iloc` states
# the positional intent explicitly.
presidents_df['height'].iloc[1]

170

In [104]:
# Show the first five rows (the default for .head()).
presidents_df.head()

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
George Washington,1,57,189,none
John Adams,2,61,170,federalist
Thomas Jefferson,3,57,189,democratic-republican
James Madison,4,57,163,democratic-republican
James Monroe,5,58,183,democratic-republican


In [105]:
"""
if you try running this :
"""
# presidents_df.loc['height']
# you will get a KeyError, because .loc looks up row labels and 'height' is a
# column name, not a label in the index of the DataFrame.
# Use an index label instead, i.e. the name of a president.

'\nif you try running this :\n'

In [106]:
# Select all presidents (`:`), the columns from 'order' through 'height'
# (inclusive), and then keep only the first three rows of the result.
presidents_df.loc[:, 'order':'height'].head(n=3)

Unnamed: 0_level_0,order,age,height
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
George Washington,1,57,189
John Adams,2,61,170
Thomas Jefferson,3,57,189


In [107]:
# Mean of every numeric column.
# `numeric_only=True` explicitly leaves out the string column `party`.
# Without it, older pandas versions emit a FutureWarning and pandas >= 2.0
# raises a TypeError when it tries to average the strings.
presidents_df.mean(numeric_only=True)

  presidents_df.mean()#calculating the mean of every (valid) column


order      23.022222
age        55.000000
height    180.000000
dtype: float64

In [108]:
# Column-wise maximum. For the string column `party` this is the value that
# sorts last alphabetically.
presidents_df.max()

order       45
age         70
height     193
party     whig
dtype: object

In [109]:
# Column-wise minimum. For the string column `party` this is the value that
# sorts first alphabetically.
presidents_df.min()

order              1
age               42
height           163
party     democratic
dtype: object

In [110]:
# Quartiles of the age column: the 0.5 quantile is the median and the 1.0
# quantile is the maximum.
print(presidents_df['age'].quantile([0.25, 0.5, 0.75, 1]))

0.25    51.0
0.50    55.0
0.75    58.0
1.00    70.0
Name: age, dtype: float64


In [111]:
# Mean, median and 0.5 quantile of age. The median and the 0.5 quantile are
# the same statistic; for this column the mean happens to be 55.0 as well.
print(presidents_df['age'].mean())
print(presidents_df['age'].median())
print(presidents_df['age'].quantile(0.5))

55.0
55.0
55.0


In [112]:
"""
Standard deviation (std) is the square root of variance. 
A high std implies a large spread, 
and a low std indicates a small spread, or most points are close to the mean.
"""

# In one extreme example the data consists only of the constant 2: there is no
# variation, so the variance is 0.0, and so is its std:
const = pd.Series([2, 2, 2])

print(const.var())
print(const.std())

# Note that pandas' .var() computes the sample variance by default: the sum of
# squared deviations from the mean is divided by N-1 (ddof=1), where N is the
# length of the data.



0.0
0.0


In [113]:
# Variance and standard deviation of a single numeric column.
print(presidents_df['age'].var())
print(presidents_df['age'].std())

# Standard deviation of every numeric column.
# `numeric_only=True` explicitly leaves out the string column `party`.
# Without it, older pandas versions emit a FutureWarning and pandas >= 2.0
# raises a TypeError.
print(presidents_df.std(numeric_only=True))


43.5
6.59545297913646
order     13.136502
age        6.595453
height     6.977236
dtype: float64


  print(presidents_df.std())


In [114]:
# Descriptive statistics for one numeric column, then for the whole frame.
# The frame-level summary covers the numeric columns only (order, age, height);
# the string column `party` is left out.
print(presidents_df['age'].describe())
print(presidents_df.describe())

"""
.describe() ignores the null values, 
such as `NaN` (Not a Number) 
and generates the descriptive statistics that summarize the central tendency (i.e., mean), 
dispersion (i.e., standard deviation), and shape (i.e., min, max, and quantiles) of a dataset’s distribution.
"""

count    45.000000
mean     55.000000
std       6.595453
min      42.000000
25%      51.000000
50%      55.000000
75%      58.000000
max      70.000000
Name: age, dtype: float64
           order        age      height
count  45.000000  45.000000   45.000000
mean   23.022222  55.000000  180.000000
std    13.136502   6.595453    6.977236
min     1.000000  42.000000  163.000000
25%    12.000000  51.000000  175.000000
50%    23.000000  55.000000  182.000000
75%    34.000000  58.000000  183.000000
max    45.000000  70.000000  193.000000


'\n.describe() ignores the null values, \nsuch as `NaN` (Not a Number) \nand generates the descriptive statistics that summarize the central tendency (i.e., mean), \ndispersion (i.e., standard deviation), and shape (i.e., min, max, and quantiles) of a dataset’s distribution.\n'

In [115]:
# Number of presidents per party, listed from most to least frequent.
print(presidents_df['party'].value_counts())

republican               19
democratic               15
democratic-republican     4
whig                      4
none                      1
federalist                1
national union            1
Name: party, dtype: int64


In [116]:
# On a string column, describe() reports the count, the number of unique
# values, the most frequent value (top) and how often it occurs (freq).
print(presidents_df['party'].describe())

count             45
unique             7
top       republican
freq              19
Name: party, dtype: object


In [117]:
"""
Summary statistics provide us with a large amount of information put as simply as possible. 
As a measure of location, the median is more robust than the mean 
for continuous variables, as the latter is sensitive to outliers, e.g., extremely large values.
"""

'\nSummary statistics provide us with a large amount of information put as simply as possible. \nAs a measure of location, the median is more robust than the mean \nfor continuous variables, as the latter is sensitive to outliers, e.g., extremely large values.\n'

In [118]:
"""
Summary statistics on an entire dataset provide a good overall view, 
but often we’re interested in some calculation conditional upon a given label or category. 
For example, what is the average height conditional on the president’s party?

To find the value based on a condition, we can use the groupby operation. 
Think of groupby as doing three steps: split, apply, and combine. 
The split step breaks the DataFrame into multiple DataFrames based on the value of the specified key; 
the apply step performs the operation inside each smaller DataFrame; 
the last step combines the pieces back into a single DataFrame.
"""

'\nSummary statistics on an entire dataset provide a good overall view, \nbut often we’re interested in some calculation conditional upon a given label or category. \nFor example, what is the average height conditional on the president’s party?\n\nTo find the value based on a condition, we can use the groupby operation. \nThink of groupby as doing three steps: split, apply, and combine. \nThe split step breaks the DataFrame into multiple DataFrames based on the value of the specified key; \nthe apply step performs the operation inside each smaller DataFrame; \nthe last step combines the pieces back into a single DataFrame.\n'

In [119]:
"""
The .groupby("party") returns a DataFrameGroupBy object, 
not a set of DataFrames. To produce a result, 
apply an aggregate (.mean()) to this DataFrameGroupBy object
"""

# Mean of each remaining column (order, age, height) within every party; the
# party names become the index of the result.
presidents_df.groupby('party').mean()


Unnamed: 0_level_0,order,age,height
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
democratic,26.066667,52.6,181.066667
democratic-republican,4.5,57.25,176.5
federalist,2.0,61.0,170.0
national union,17.0,56.0,178.0
none,1.0,57.0,189.0
republican,29.631579,55.263158,180.894737
whig,11.0,58.25,176.0


In [120]:
# Same grouping by party, this time aggregated with the median.
presidents_df.groupby('party').median()


Unnamed: 0_level_0,order,age,height
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
democratic,28.0,52.0,180.0
democratic-republican,4.5,57.0,177.0
federalist,2.0,61.0,170.0
national union,17.0,56.0,178.0
none,1.0,57.0,189.0
republican,29.0,54.0,182.0
whig,11.0,57.5,174.0


In [121]:
"""
We can also perform multiple operations on the groupby object using .agg() method. 
It takes a string, a function, or a list thereof. 
For example, we would like to obtain the min, median, and max values of heights grouped by party:
"""

# Use string aliases consistently for all three aggregations. They select
# pandas' own group-wise min/median/max and produce the same column names.
# Passing NumPy functions or Python builtins (np.median, max) to a groupby
# .agg() is deprecated in recent pandas and triggers a FutureWarning.
print(presidents_df.groupby('party')['height'].agg(['min', 'median', 'max']))

                       min  median  max
party                                  
democratic             168   180.0  193
democratic-republican  163   177.0  189
federalist             170   170.0  170
national union         178   178.0  178
none                   189   189.0  189
republican             168   182.0  193
whig                   173   174.0  183


In [122]:
# The same aggregations (median and mean) applied to both selected columns.
print(presidents_df.groupby('party')[['height', 'age']].agg(['median', 'mean']), "\n\n\n")

# Different aggregations per column: median and mean for heights, but minimum
# and maximum for ages, grouped by party.
# String aliases are used instead of np.median / np.mean / min / max. Passing
# those callables to a groupby .agg() is deprecated in recent pandas, and the
# aliases yield the same column names and values.
print(
    presidents_df.groupby('party').agg({
        'height': ['median', 'mean'],
        'age': ['min', 'max'],
    })
)


                      height                age           
                      median        mean median       mean
party                                                     
democratic             180.0  181.066667   52.0  52.600000
democratic-republican  177.0  176.500000   57.0  57.250000
federalist             170.0  170.000000   61.0  61.000000
national union         178.0  178.000000   56.0  56.000000
none                   189.0  189.000000   57.0  57.000000
republican             182.0  180.894737   54.0  55.263158
whig                   174.0  176.000000   57.5  58.250000 



                      height             age    
                      median        mean min max
party                                           
democratic             180.0  181.066667  43  65
democratic-republican  177.0  176.500000  57  58
federalist             170.0  170.000000  61  61
national union         178.0  178.000000  56  56
none                   189.0  189.000000  57  57
republican    