# Summary Functions and Maps

In [None]:
#This notebook demonstrates basic Pandas summary functions and maps, which help extract insights from your data.
#The DataFrame used is titanic, which holds information about the passengers of the Titanic.
#It has 891 rows and 12 columns of passenger details (such as name, sex, age, ticket, fare, cabin, survived, embarked status and Pclass).

In [46]:
import pandas as pd
import numpy as np
from pandas import ExcelFile 

In [43]:
# Load the Titanic passenger data from 'titanic.csv' (path is relative to the working directory).
titanic = pd.read_csv('titanic.csv')
# A bare name on the last line of a cell displays the DataFrame.
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


##  Summary Functions

In [22]:
# Summary functions condense a column into a few descriptive values.
# .describe() gives a high-level summary of a column; for numeric data this is
# the count, mean, std, min, quartiles and max.
titanic["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [23]:
# For a string (object) column, describe() reports count, unique, top and freq instead.
titanic["Cabin"].describe()

count     204
unique    147
top        G6
freq        4
Name: Cabin, dtype: object

In [26]:
# Mean of a single column.
titanic["Fare"].mean()

32.2042079685746

In [27]:
# unique() returns the distinct values of a column as an array.
titanic["Ticket"].unique()

array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
       '330877', '17463', '349909', '347742', '237736', 'PP 9549',
       '113783', 'A/5. 2151', '347082', '350406', '248706', '382652',
       '244373', '345763', '2649', '239865', '248698', '330923', '113788',
       '347077', '2631', '19950', '330959', '349216', 'PC 17601',
       'PC 17569', '335677', 'C.A. 24579', 'PC 17604', '113789', '2677',
       'A./5. 2152', '345764', '2651', '7546', '11668', '349253',
       'SC/Paris 2123', '330958', 'S.C./A.4. 23567', '370371', '14311',
       '2662', '349237', '3101295', 'A/4. 39886', 'PC 17572', '2926',
       '113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651', 'CA 2144',
       '2669', '113572', '36973', '347088', 'PC 17605', '2661',
       'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111',
       'S.O.C. 14879', '2680', '1601', '348123', '349208', '374746',
       '248738', '364516', '345767', '345779', '330932', '113059',
       'SO/C 14885', '31012

In [None]:
#To see a list of unique values and how often they occur in the dataset, we can use the value_counts() method:

In [29]:
# Count how many times each distinct Fare value occurs.
titanic["Fare"].value_counts()

8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: Fare, Length: 248, dtype: int64

## Maps

In [31]:
# map() takes one set of values and "maps" them to another set of values.
# Use it to create new representations of existing data, or to convert data from one format to another.
# The function passed to map() receives a single value from the Series,
# and map() returns a new Series holding the transformed values.
titanic_Fare_mean = titanic["Fare"].mean()
titanic["Fare"].map(lambda fare: fare - titanic_Fare_mean)

0     -24.954208
1      39.079092
2     -24.279208
3      20.895792
4     -24.154208
         ...    
886   -19.204208
887    -2.204208
888    -8.754208
889    -2.204208
890   -24.454208
Name: Fare, Length: 891, dtype: float64

In [33]:
# apply() is the equivalent of map() for a whole DataFrame: it calls a custom
# function on each row (with axis='columns') and returns a new, transformed DataFrame.
def remean_Fare(row):
    """Return a copy of ``row`` whose 'Fare' entry has the mean fare subtracted.

    Parameters
    ----------
    row : pandas.Series
        One row of the DataFrame, as passed by ``DataFrame.apply``; it must
        contain a 'Fare' entry.

    Returns
    -------
    pandas.Series
        A new Series equal to ``row`` except that 'Fare' is re-centered.

    Notes
    -----
    Reads the notebook-level variable ``titanic_Fare_mean``, which must be
    defined before this function is called.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    centered = row.copy()
    centered["Fare"] = row["Fare"] - titanic_Fare_mean
    return centered

# axis='columns' makes apply() pass each row to remean_Fare; the result is only displayed, not assigned back to titanic.
titanic.apply(remean_Fare, axis='columns')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,-24.954208,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,39.079092,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,-24.279208,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,20.895792,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,-24.154208,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,-19.204208,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,-2.204208,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,-8.754208,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,-2.204208,C148,C


In [34]:
# Re-centering a column with plain arithmetic.
# pandas applies the operation between every value on the left-hand side
# and the single value on the right-hand side (the mean).
titanic_Fare_mean = titanic["Fare"].mean()
titanic["Fare"] - titanic_Fare_mean

0     -24.954208
1      39.079092
2     -24.279208
3      20.895792
4     -24.154208
         ...    
886   -19.204208
887    -2.204208
888    -8.754208
889    -2.204208
890   -24.454208
Name: Fare, Length: 891, dtype: float64

In [48]:
# Load the bike-share data from 'bikeshare-Copy1.csv' (path is relative to the working directory).
bikes = pd.read_csv("bikeshare-Copy1.csv")
# Preview the first rows instead of displaying the whole frame.
bikes.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [50]:
# Element-wise sum of three columns.
bikes["cnt"] + bikes["casual"] + bikes["registered"]

0         32
1         80
2         64
3         26
4          2
        ... 
17374    238
17375    178
17376    180
17377    122
17378     98
Length: 17379, dtype: int64

In [52]:
# Element-wise product of three columns.
bikes["cnt"] * bikes["casual"] * bikes["registered"]

0           624
1         10240
2          4320
3           390
4             0
          ...  
17374    141372
17375     57672
17376     52290
17377     38064
17378     21756
Length: 17379, dtype: int64

In [54]:
# Element-wise subtraction of two columns from a third.
bikes["cnt"] - bikes["registered"] - bikes["casual"]

0        0
1        0
2        0
3        0
4        0
        ..
17374    0
17375    0
17376    0
17377    0
17378    0
Length: 17379, dtype: int64

In [55]:
# Element-wise comparison of two columns; the result is a boolean Series.
bikes["registered"] >= bikes["casual"]

0        True
1        True
2        True
3        True
4        True
         ... 
17374    True
17375    True
17376    True
17377    True
17378    True
Length: 17379, dtype: bool

In [56]:
# Element-wise equality test between two columns; the result is a boolean Series.
bikes["registered"] == bikes["casual"]

0        False
1        False
2        False
3        False
4        False
         ...  
17374    False
17375    False
17376    False
17377    False
17378    False
Length: 17379, dtype: bool

In [51]:
# Arithmetic also works between two columns; other operators such as >, <, == and - can be used the same way.
# A missing value on either side gives NaN in the result.
titanic["Fare"] + titanic["Age"]

0       29.2500
1      109.2833
2       33.9250
3       88.1000
4       43.0500
         ...   
886     40.0000
887     49.0000
888         NaN
889     56.0000
890     39.7500
Length: 891, dtype: float64