# DataFrame



In [None]:
import pandas as pd

## Methods in pandas
Methods and attributes in pandas used to explore and prepare a DataFrame

* .head()             $\;\;\;\;\;\;\;$ shows only the first 5 rows 

* .info()             $\;\;\;\;\;\;\;\;$  column information, e.g. # null values
* .apply()            $\;\;\;\;\;\;\;$  apply a function to each value, e.g. len or str.upper (often used to fill a new column)
* .describe()         $\;\;\;\;\;$ some summary statistics                                             
* .isna()             $\;\;\;\;\;\;\;\;$ True where the value is NaN (missing value)
* .isna().any()       $\;\;\;\;\;\;\;\;$ True if NaN in column
* .isna().sum()
* .dropna()           $\;\;\;\;\;\;\;\;$ remove NaN (whole rows)
* .fillna(0)          $\;\;\;\;\;\;\;\;$ replace NaN (example with 0)
* .value_counts()
* .columns            $\;\;\;\;\;\;\;\;$ column names
* .values             $\;\;\;\;\;\;\;\;$ all values
* .index              $\;\;\;\;\;\;\;\;$ index for rows
* .shape

Methods in pandas used for statistics
* .mean()            $\;\;\;\;\;\;\;\;\;\;\;\;\;$ axis = 'index' (across rows), axis = 'columns' (across columns)
* .median()
* .mode()
* .min()
* .max()
* .sum()
* .quantile()
* .std()
* .var()
* .agg()             
* .cumsum()
* .cummin()
* .cummax()
* .cumprod()

Methods in pandas to order and clean
* .sort_values()
* .isin() 
* .drop_duplicates()

In [None]:
# Aggregation option 1
df['column'].agg(function) 

# Aggregation option 2
.agg([f1, f2, ...]) # like .agg([np.mean, np.median])

# Aggregation option 3
.agg([f1, f2, ...]) # like .agg([np.mean, df.value_counts()])

# Aggregation option 4
.agg({'columnname':'count'})

# Sort values
.sort_values('col_name', ascending = True)

# Value is in a column
.isin(['value']) 

# Value is in a list
.isin(var_listname)

# Drop duplicates
.drop_duplicates(subset = ['column',''])

# Value_counts
.value_counts('', sort = True, normalize = True)

## Data into DataFrame

### Import

In [None]:
# Load CSV file into DataFrame
df = pd.read_csv('file.csv', index_col = 0) 

# Load dictionary into DataFrame
df = pd.DataFrame(dict)

# Print DataFrame
print(df)

In [None]:
# pandas Series is displayed
df['column_name']                                   

# pandas DataFrame is displayed
df[['column_name']]                                

### Export

In [None]:
# Export DataFrame into CSV file
df.to_csv('title_of_new_csv_file.csv')

## Selecting

### Columns

In [None]:
# Option 1
select_column = df['column_name']

# Option 2
select_column = df.column_name  

# Option 3
select_column = df[['column_name']]

# Option 4: .query('statement') filters rows, similar to a WHERE clause in SQL (use == for equality)
table.query(' columnname == "x" and columnname == "y" ')

### Rows

In [None]:
# Option 1
select_allcoloms_row = df[df['column_name'] == 'variable_name']     

# Option 2
select_allcoloms_row = df[df.column_name == 'variable_name']     

### Column names

In [None]:
for names in df : 
    print(names)

### Datetime selecting

In [None]:
df['date_column'].dt.year
df['date_column'].dt.month

## Indexing

### Define the index

In [None]:
# Name indexing
df.index = ['a','c','b'] 

# Define column(s) to become indexes
df.set_index('col_name')
df.set_index(['col1', 'col2'])

# Remove indexes
df.reset_index() 

# Sort index
.sort_index(level = ['',''], ascending = [True, False])

### .loc
Indexing by names

In [None]:
# Select rows by position (slice)
df[1:4]

# Select rows, columns, or rows & columns by label (label-based)
df.loc[['index_name'], ['column_name']]

# Select indexes and rows
.loc[('indA1','indB1') : ('indA2','indB2')]             

# Select all rows from defined columns
.loc[:,'column1':'column2']   

# Create new column with value for index 'index_name'
df.loc['index_name', 'new_column_name'] = value 

### .iloc
Indexing by numbers

In [None]:
# Selection rows and columns
df.iloc[[index_nr],[column_nr]]            

## Pivot table

### Group by

### Pivoting

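A pivot table summarises values grouped by one or more variables. Parameters of `.pivot_table()`:
* default = mean, values ~ column to sum/aggregate, index ~ groupby, columns ~ show columns
* other stat function: aggfunc = np.func or [np.func1, np.func2]
* more groupby variables: columns = ''
* fill_value = 0 (replaces NaN)
* margins = True --> adds an "All" row and column --> summary stats over all rows/columns (mean by default)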

In [None]:
table.pivot_table(values = '', index = '')                   

## JOIN

There are different types of joins on values:
* inner join
* left join
* right join
* self join

A self join merges a table with itself, which can be useful when you want to compare values in a column to other values in the same column.

Joins on datetimes (or other ordered data) have specific functions defined:
* merge_ordered
* merge_asof

A merge_asof() matches on the nearest key rather than only on exact matches. The key columns must be sorted. A merge_asof() is similar to a left join and to merge_ordered().

A suffix is a self-defined label appended to column names that occur in both tables, so you can tell which table each column came from.

### Value joins

In [None]:
# Merge basics
new_df = df1.merge(df2, on = 'columnname', suffixes=('_tab1', '_tab2'))      
  
# Merge with different types of joins
new_df = df1.merge(df2, how = 'left', left_on = 'columnname', right_on = 'columnname')
  
# Merge on multiple columns 
new_df = df1.merge(df2, on = ['columnname1', 'columnname2'])  

# Merge on index
new_df = df1.merge(df2, on = 'id')                         
    
# Merge a column of the left table on the index of the right table
# (left_on and left_index cannot be combined; the same holds for right_on and right_index)
new_df = df1.merge(df2, how = 'left', left_on = 'columnname', right_index = True)

# self-join 
new_df = df1.merge(df1, on = 'columnname', suffixes=('_tab1', '_tab2'))


### Datetime joins

In [None]:
# Merge_asof (always behaves like a left join; there is no 'how' argument)
pd.merge_asof(df1, df2, on = '', direction = 'nearest') # direction = 'backward' (default), 'forward', 'nearest'