In [None]:
import pandas as pd

# Load the Titanic CSV into the DataFrame that the rest of this notebook works on
df = pd.read_csv(filepath_or_buffer='data/titanic.csv')
df.head(n=3)  # Preview the first 3 rows

### Useful & common properties

In [None]:
# shape -> tuple of (number of rows, number of columns)
df.shape

# dtype / dtypes -> data type(s) of the values held by a Series / DataFrame
df['Age'].dtype  # A Series exposes a single "dtype" (singular)
df.dtypes        # A DataFrame exposes "dtypes" (plural): one entry per column

# index / columns -> list-like objects holding the row labels and the column names
df.index
df.columns

# Properties can have properties of their own.
df.index.name = "PandasDefaultId"  # Give the index (the row-label axis) a name
df.head(n=3)

## Handy methods

### Statistically relevant methods

In [None]:
# Head / tail
df.head()        # Shows the first 5 rows (5 is the default number)
df.Age.head(10)  # Shows the first 10 values
df.Age.tail(10)  # Shows the last 10 values

# Describe (Series and DataFrame)
df.Age.describe()
df.describe()

# NOTE: the DataFrame-wide reductions below pass numeric_only=True.
# Since pandas 2.0 the default (numeric_only=False) raises a TypeError when the
# DataFrame contains non-numeric columns (e.g. "Sex"), instead of silently skipping them.

# Mean (Series and DataFrame)
df.Age.mean()               # Returns the mean value of the "Age" column
df.mean(numeric_only=True)  # Returns a Series with the mean of every numeric column (column names as index)

# Median
df.Age.median()               # Returns the median value of the "Age" column
df.median(numeric_only=True)  # Returns a Series with the median of every numeric column (column names as index)

# Std
df.Age.std()               # Returns the standard deviation of the "Age" column
df.std(numeric_only=True)  # Returns a Series with the standard deviation of every numeric column (column names as index)

# Var
df.Age.var()               # Returns the variance of the "Age" column
df.var(numeric_only=True)  # Returns a Series with the variance of every numeric column (column names as index)

# Uniques
df.Sex.unique()           # Returns an array with the unique values present in a Series
df.nunique(axis='index')  # Returns a Series with the number of unique values per column of the DataFrame

# Value counts
df.Survived.value_counts()  # Series with the unique values as index and their number of occurrences as values

# idxmax()/idxmin()
df.Age.idxmax()  # Returns the index label of the max value
df.Age.idxmin()  # Returns the index label of the min value

### NaN, None, Null & duplicate values handling

In [None]:
# Count missing / present values (isna/notna are aliases of isnull/notnull)
df['Cabin'].isna().sum()   # Number of missing values in the "Cabin" column
df['Cabin'].notna().sum()  # Number of non-missing values in the "Cabin" column
df.isna().sum()            # Series with the number of missing values per column
df.notna().sum()           # Series with the number of non-missing values per column

# Replace every missing value (NaN / None) with a placeholder.
# A new Series is returned; assign it back (df['Cabin'] = ...) to keep the change.
df['Cabin'].fillna(value='Unknown')

# Remove the rows that contain at least one missing value.
# A new DataFrame is returned; reassign the result to keep it.
df.dropna()

# Remove duplicated rows
df.drop_duplicates()                       # Rows are duplicates when they match on every column
df.drop_duplicates(subset=['Age', 'Sex'])  # Rows are duplicates when they match on all of the subset columns

### Sorting & renaming (by) index/columns

In [None]:
# Sort by values
df['Age'].sort_values()
df.sort_values(by=['Age', 'Pclass'], ascending=False)  # Descending order on both keys
# ignore_index=True relabels the sorted rows 0..n-1 instead of carrying over the original
# (now out-of-order) index labels. Out-of-order labels are risky if you plan to do
# arithmetic between objects or to keep the DataFrame in this order indefinitely.
df.sort_values(by='Age', ignore_index=True)

# Sort by index labels
df.sort_index(ascending=False)

# Rename specific labels
df.rename(
    index={0: 'First', 1: 'Second'},                       # old index label -> new index label
    columns={'Pclass': 'BoardingClass', 'Sex': 'Gender'},  # old column name -> new column name
)

# Rename the axes themselves (the name of the index / columns axis, not the labels they hold).
# This is most commonly wanted for the index axis.
df.rename_axis(index='indexes', columns='features')

# Move the index back into columns / promote columns to index
df.reset_index()                    # The current index becomes a regular column and a default 0..n-1 index is added
df.set_index(keys=['PassengerId'])  # The listed columns (one or more) become the index

### Conditional indexing

In [None]:
# Simplest form: index with a boolean expression
df[df['Age'] > 30]  # All rows of df whose Age is above 30

# Combine several boolean conditions
mask1 = df['Age'] > 30
mask2 = df['Age'] < 40
df[mask1 & mask2]  # All rows of df whose Age is above 30 AND below 40

# Filter rows and pick specific columns in one go (loc)
df.loc[mask1 & mask2, ['Name', 'Age']]  # Name and Age of the rows whose Age is above 30 and below 40

# isin: builds a boolean mask that can be used for conditional indexing
df.loc[df['Age'].isin([30, 31, 32, 33])]  # Rows whose Age is 30, 31, 32 or 33

# Masks are boolean Series, so summing them counts the True entries
mask1.sum()  # Number of entries with Age greater than 30

# Select the rows that do not hold a certain value in any of their fields.
# Ref: https://stackoverflow.com/questions/27020312/drop-row-in-pandas-dataframe-if-any-value-in-the-row-equals-zero
df[(df != 0).all(axis=1)]          # All rows of df that have NO 0 among their values
df[(df != 0).all(axis='columns')]  # Same thing, naming the axis instead of numbering it

# Invert the truth values of a mask or of an entire boolean DataFrame
~mask1
~(df == 0)

### Data replacement, modification, deletion & dtype handling

In [None]:
# Cast every entry of a Series (or, less commonly, a DataFrame) to another dtype.
# Remember that pandas stores strings as "object".
check = df['Age'].astype('str')  # Native method (preferred)
check = df['Age'].apply(str)     # Same conversion through apply

# Replace every occurrence of a value (3) with another one.
# A new Series is returned; assign it back to keep the change.
df['Pclass'].replace(to_replace=3, value='Lo bueno es que hay salud :(')

# Drop rows / columns (a new DataFrame is returned, df is not modified)
df.drop(index=df[df['Age'] > 30].index)     # Takes a list-like of index labels to remove
df.drop(columns=['PassengerId', 'Pclass'])  # Takes a list-like of column names to remove

# Drop all rows that have a certain value (here 0) in any of their fields.
# Ref: https://stackoverflow.com/questions/27020312/drop-row-in-pandas-dataframe-if-any-value-in-the-row-equals-zero
df.drop(index=df[~(df != 0).all(axis=1)].index)

### Further reading
[Summary functions and Maps](https://www.kaggle.com/residentmario/summary-functions-and-maps/tutorial)<br>
[Drop rows](https://statisticsglobe.com/delete-rows-in-pandas-dataframe-conditionally-python#example-2-remove-rows-of-pandas-dataframe-using-drop-function-index-attribute)<br>
[Drop columns](https://statisticsglobe.com/delete-column-of-pandas-dataframe-in-python#example-3-remove-multiple-columns-from-pandas-dataframe-by-index-position)<br>