In [1]:
import pandas as pd

### Mapping:

##### 1) Using Maps:
In data science we often need to create new representations from existing data, or to transform data from the format it is in now to the format we want it to be in later. Maps are really useful in this regard.

In [84]:
# Load the dataset from train.csv (path is relative to the notebook's working directory).
df = pd.read_csv("train.csv")

In [85]:
# Re-center the Age column: subtract the column mean from every value, so each
# age becomes an offset from the average (negative means younger than average).
mean_age = df["Age"].mean()
df["Age"] = df["Age"].map(lambda age: age - mean_age)

In [87]:
# Preview the first five rows; Age now holds each passenger's offset from the mean age.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3.0,"Braund, Mr. Owen Harris",male,-7.699118,1.0,0.0,A/5 21171,7.25,,S
1,2,1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,8.300882,1.0,0.0,,71.2833,C85,C
2,3,1,3.0,"Heikkinen, Miss. Laina",female,-3.699118,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,5.300882,1.0,0.0,113803,,C123,S
4,5,0,3.0,"Allen, Mr. William Henry",male,5.300882,,0.0,373450,8.05,,


##### 2) Using apply:
`apply()` is the equivalent method when we want to transform a whole DataFrame by calling a custom function on each row.

In [88]:
# Reload the CSV so this section starts again from the original, un-shifted Age values.
df = pd.read_csv("train.csv")

In [89]:
# Compute the mean age once up front; remean_age subtracts it from every row.
mean_age = df["Age"].mean()
def remean_age(row):
    """Return a copy of ``row`` with the global ``mean_age`` subtracted from ``Age``.

    Meant to be passed to ``DataFrame.apply(..., axis='columns')``, which calls
    it once per row.

    Parameters
    ----------
    row : pandas.Series
        A single DataFrame row; it must contain an ``"Age"`` entry.

    Returns
    -------
    pandas.Series
        A new Series identical to ``row`` except that ``Age`` is shifted by
        ``mean_age``. The input ``row`` is left unmodified.
    """
    # pandas does not support functions that mutate the object handed to
    # apply(), so write the shifted value into a copy rather than into `row`.
    centered = row.copy()
    centered["Age"] = row["Age"] - mean_age
    return centered

# axis='columns' makes apply() call remean_age once per row; the returned rows
# are assembled into a new DataFrame that replaces df.
df = df.apply(remean_age, axis='columns')

In [90]:
# Preview the first five rows to check the re-meaned Age values produced by apply().
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3.0,"Braund, Mr. Owen Harris",male,-7.699118,1.0,0.0,A/5 21171,7.25,,S
1,2,1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,8.300882,1.0,0.0,,71.2833,C85,C
2,3,1,3.0,"Heikkinen, Miss. Laina",female,-3.699118,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,5.300882,1.0,0.0,113803,,C123,S
4,5,0,3.0,"Allen, Mr. William Henry",male,5.300882,,0.0,373450,8.05,,


### Grouping in pandas

In [91]:
# Reload the CSV once more: the previous section replaced df with re-meaned ages.
df = pd.read_csv("train.csv")

In [92]:
# Preview the first five rows of the freshly loaded data.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2,1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,,71.2833,C85,C
2,3,1,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,,C123,S
4,5,0,3.0,"Allen, Mr. William Henry",male,35.0,,0.0,373450,8.05,,


In [93]:
# Count the rows for each distinct Sex value, most frequent first
# (missing values are excluded by default).
df["Sex"].value_counts()

male      547
female    300
Name: Sex, dtype: int64

In [94]:
# This gives the same counts as value_counts() above, but ordered by group key
# rather than by frequency. df.groupby("Sex") splits the rows into one group per
# Sex value; we then select the Sex column of each group and count its non-null entries.
df.groupby("Sex").Sex.count()

Sex
female    300
male      547
Name: Sex, dtype: int64

In [95]:
# Oldest recorded age within each Sex group (one value for female, one for male).
df.groupby("Sex")["Age"].max()

Sex
female    63.0
male      80.0
Name: Age, dtype: float64

In [98]:
# Each group produced by groupby() is effectively a slice of the DataFrame that
# holds only the rows sharing one key value. apply() hands every such slice to
# our function, so we can manipulate it however we see fit -- here we simply
# pull the Name column out of each age group.
x = df.groupby("Age").apply(lambda age_group: age_group["Name"])
print(x, "\n")
print("passenger having age of 0.75: \n", x[0.75])

Age       
0.42   803         Thomas, Master. Assad Alexander
0.67   755               Hamalainen, Master. Viljo
0.75   469           Baclini, Miss. Helene Barbara
       644                  Baclini, Miss. Eugenie
0.83   78                                      NaN
                              ...                 
70.50  116                    Connors, Mr. Patrick
71.00  96                Goldschmidt, Mr. George B
       493                 Artagaveytia, Mr. Ramon
74.00  851                     Svensson, Mr. Johan
80.00  630    Barkworth, Mr. Algernon Henry Wilson
Name: Name, Length: 714, dtype: object 

passenger having age of 0.75: 
 469    Baclini, Miss. Helene Barbara
644           Baclini, Miss. Eugenie
Name: Name, dtype: object


In [99]:
# Look up the full rows for the two index labels printed above to confirm
# that both passengers have Age 0.75.
df.loc[[469, 644],:]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
469,470,1,3.0,"Baclini, Miss. Helene Barbara",female,0.75,2.0,1.0,2666,19.2583,,C
644,645,1,3.0,"Baclini, Miss. Eugenie",female,0.75,2.0,1.0,2666,19.2583,,C


In [100]:
# Another groupby() method worth mentioning is agg(), which lets you run several
# different functions on each group in a single call.
# For example, we can generate a simple statistical summary of Age per Sex:
#   - len counts every row in the group, including rows whose Age is missing;
#   - "min" / "max" are passed by name so pandas uses its own NaN-skipping
#     reductions. Passing the Python built-ins min/max is deprecated: newer
#     pandas warns and will eventually call them directly, changing NaN handling.
df.groupby(['Sex']).Age.agg([len, "min", "max"])

Unnamed: 0_level_0,len,min,max
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,300,0.75,63.0
male,547,0.42,80.0
