# Statistical methods, Map, Apply & compare

### Initialization

In [4]:
# Imports
import pandas as pd

# Configuration: pandas display options can be set programmatically for the whole notebook.
# Use the fully qualified option name: the short pattern 'max_rows' also matches
# 'styler.render.max_rows' on recent pandas versions and raises an OptionError.
pd.set_option('display.max_rows', 6)  # Limits DataFrame/Series outputs to, at most, 6 rows.

# Load the Titanic data set into a DataFrame
df = pd.read_csv('data/titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q


### Common statistical methods

In [5]:
# NOTE: only the last expression of a cell is displayed; wrap any other line in
# display(...) or move it to its own cell to see its result.
# DataFrame-wide reductions use numeric_only=True because the frame also holds text
# columns (Name, Sex, Ticket, ...), which cannot be averaged and raise a TypeError on
# recent pandas versions.

# Describe (Series and DataFrame)
df.Age.describe()
df.describe()

# Mean (Series and DataFrame)
df.Age.mean()                # Returns the mean value
df.mean(numeric_only=True)   # Returns a Series with the mean of every numeric column

# Median
df.Age.median()               # Returns the median value
df.median(numeric_only=True)  # Returns a Series with the median of every numeric column

# Std
df.Age.std()               # Returns the standard deviation of the "Age" column
df.std(numeric_only=True)  # Returns a Series of standard deviations, indexed by column name

# Var
df.Age.var()               # Returns the variance of the "Age" column
df.var(numeric_only=True)  # Returns a Series of variances, indexed by column name

# idxmax()/idxmin()
df.Age.idxmax()  # Returns the index LABEL of the max value
df.Age.idxmin()  # Returns the index LABEL of the min value

803

### Map vs. Apply

In [6]:
# Series.map receives a lambda (or any named function) and calls it once per element of the Series.
# Here every age is re-expressed as its distance from the mean age; missing ages stay NaN.
mean_age = df['Age'].mean()
df['Age'].map(lambda age: age - mean_age)

0     -7.699118
1      8.300882
2     -3.699118
         ...   
888         NaN
889   -3.699118
890    2.300882
Name: Age, Length: 891, dtype: float64

In [9]:
# Defining a function (the x will be either a row or a column, depending on the defined axis)
def age_gender_grouping(x):
    """Build a '<class label> <gender label>' string for one passenger record.

    Despite the name, the label is derived from ``x.Pclass`` and ``x.Sex`` only;
    no age field is read.

    Parameters
    ----------
    x : object exposing ``Pclass`` and ``Sex`` attributes
        E.g. a DataFrame row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        'Poor', 'Accomodated' or 'Rich' (Pclass 3, 2, anything else), followed by
        ' guy' when Sex == 'male' and ' gal' otherwise.
    """
    class_label = 'Poor' if x.Pclass == 3 else ('Accomodated' if x.Pclass == 2 else 'Rich')
    gender_label = ' guy' if x.Sex == 'male' else ' gal'
    return class_label + gender_label


# DataFrame.apply is the DataFrame counterpart of Series.map: it calls the function once per
# row or per column. axis=1 (alias: axis='columns') moves ACROSS the columns, so every call
# receives one row.
# Pass the function object itself (no parentheses after its name): apply() is what calls it.
df_apply = df.assign(**{'Age Group': df.apply(age_gender_grouping, axis=1)})  # new frame, df is untouched
df_apply.head()

# Same operation, spelled with the more explicit string alias for the axis
df_apply = df_apply.assign(**{'Age Group': df_apply.apply(age_gender_grouping, axis='columns')})
df_apply.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Group
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Poor guy
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Rich gal
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Poor gal
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Rich gal
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Poor guy


### Applymap()
If you want to apply a function to EVERY VALUE of a DataFrame, you can use `applymap()` (renamed to `DataFrame.map()` in pandas 2.1). This is useful for dtype conversions and for more complex element-wise functions.

In [11]:
# Convert every value in the DataFrame to str.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap is deprecated
# there), so call whichever element-wise method the installed pandas version provides.
converted_df = df.map(str) if hasattr(pd.DataFrame, 'map') else df.applymap(str)
print(f'Before:\n{df.dtypes}\n\n\nAfter:\n{converted_df.dtypes}')

Before:
PassengerId      int64
Survived         int64
Pclass           int64
                ...   
Fare           float64
Cabin           object
Embarked        object
Length: 12, dtype: object


After:
PassengerId    object
Survived       object
Pclass         object
                ...  
Fare           object
Cabin          object
Embarked       object
Length: 12, dtype: object


### Sending arguments to apply
[Send arguments to apply() link](https://stackoverflow.com/questions/12182744/python-pandas-apply-a-function-with-arguments-to-a-series#answer-43460506)

In [12]:
def my_func(x, extra_arg):
    """Print the element received from apply() together with the forwarded extra argument.

    Parameters
    ----------
    x : any
        The value handed over by ``apply`` (one Series element per call).
    extra_arg : any
        Additional value passed through ``apply(..., extra_arg=...)``.

    Returns
    -------
    None
    """
    message = f'This is x: {x} and this is my extra_arg: {extra_arg}'
    print(message)

# Keyword arguments that apply() does not recognise are forwarded to the function on every call.
# The result is bound to `_` only to keep the returned Series of None values out of the cell output.
_ = df['Name'].head(3).apply(my_func, extra_arg=10000)

This is x: Braund, Mr. Owen Harris and this is my extra_arg: 10000
This is x: Cumings, Mrs. John Bradley (Florence Briggs Thayer) and this is my extra_arg: 10000
This is x: Heikkinen, Miss. Laina and this is my extra_arg: 10000


### Getting a max/min value row

In [46]:
# Using sort: order by Age (descending) and take the first row by POSITION
df.sort_values(by='Age', ascending=False).iloc[0]

# Using idxmax: it returns the index LABEL of the max value, so the row must be fetched
# with the label-based .loc. Positional .iloc only gives the same row while the index is
# the default 0..n-1 range, and silently picks the wrong row otherwise.
df.loc[df.Age.idxmax()]

PassengerId     631
Survived          1
Pclass            1
               ... 
Fare           30.0
Cabin           A23
Embarked          S
Name: 630, Length: 12, dtype: object

### Comparing 2 Series
[Reference link](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.compare.html)

In [13]:
# Two Series of single-character strings that differ at positions 1 and 3
ser1 = pd.Series(list('12345'))
ser2 = pd.Series(list('10395'))

# Boolean mask: True wherever both Series hold the same value
equal_mask = ser1.eq(ser2)
print(f'The values that coincide between the Series are:\n{equal_mask}')


# Comparison restricted to the rows whose values are DIFFERENT
df_difference = ser1.compare(ser2)

# Full side-by-side view: keep every row (keep_shape) and show equal values too (keep_equal)
df_complete_comparison = ser1.compare(ser2, keep_equal=True, keep_shape=True)
df_complete_comparison

The values that coincide between the Series are:
0     True
1    False
2     True
3    False
4     True
dtype: bool


Unnamed: 0,self,other
0,1,1
1,2,0
2,3,3
3,4,9
4,5,5
