## Pandas Basics
pandas takes different datatypes
- pandas handles dataframes (df) - explore, clean, process tabular data
- supports: csv, excel, sql, json, parquet... import with read_*
- slicing, selecting, filtering w. conditions
- plotting: scatter, bar, boxplot
- basic statistics: mean, median, min/max, counts
- structure datatables w. melt() and pivot()
- concatenate column and row wise to combine multiple tables
- time series
- clean textual data and extract information

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html
https://www.dataschool.io/best-python-pandas-resources/

# imports

In [1103]:
import pandas as pd
import numpy as np
import statistics as st

# Create a dataframe

columnwise

In [1104]:
# Column-wise construction: every keyword names a column and its list holds
# that column's values from the first row to the last.
df1 = pd.DataFrame(
    data=dict(
        a=[1, 4, 7],
        b=[2, 5, 8],
        c=[3, 6, 9],
    ),
    index=[1, 2, 3],  # row labels
)

df1

Unnamed: 0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


rowwise

In [1105]:
# Row-wise construction: each inner list is one row; the column names and
# row labels are passed separately.
df2 = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    columns=list("abc"),
    index=[4, 5, 6],
)

df2

Unnamed: 0,a,b,c
4,1,2,3
5,4,5,6
6,7,8,9


In [1106]:
# 7x7 frame holding the even numbers 2..98, filled row by row
df_q = pd.DataFrame(np.arange(2, 100, 2).reshape(-1, 7))
df_q

Unnamed: 0,0,1,2,3,4,5,6
0,2,4,6,8,10,12,14
1,16,18,20,22,24,26,28
2,30,32,34,36,38,40,42
3,44,46,48,50,52,54,56
4,58,60,62,64,66,68,70
5,72,74,76,78,80,82,84
6,86,88,90,92,94,96,98


# Data Cleaning
Use these commands to perform a variety of data cleaning tasks.

## access cell - at[] / iat[]

In [1107]:
df1.iat[1, 1] = np.nan  # set value using row and column integer positions
df1.at[2, 'c'] = np.nan  # set value using row and column labels
df1

Unnamed: 0,a,b,c
1,1,2.0,3.0
2,4,,
3,7,8.0,9.0


## select columns

In [1108]:
df1.a # column
df1['a']
df1.loc[:, 'a'] # all rows of column a

1    1
2    4
3    7
Name: a, dtype: int64

In [1109]:
df1.loc[: , ['a', 'b']] # all rows of column a & b
df1[['a', 'b']] # col a&b

Unnamed: 0,a,b
1,1,2.0
2,4,
3,7,8.0


## select rows       

In [1110]:
df1.loc[1] # by index
df1.loc[[1,2]] # multiple rows in double brackets

Unnamed: 0,a,b,c
1,1,2.0,3.0
2,4,,


## access by row and column

In [1111]:
df1

Unnamed: 0,a,b,c
1,1,2.0,3.0
2,4,,
3,7,8.0,9.0


In [1112]:
df1.loc[3, 'b'] # access value by row and column
df1.loc[1, ["a", "c"]] # row  1 / col a & c
df1.loc[[1, 3], "b"] # multiple rows same column
df1.loc[[1, 2], ['a', 'c']]  # multiple rows multiple columns


Unnamed: 0,a,c
1,1,3.0
2,4,


## iloc() - select by (internal) index / position 

In [1113]:
# row with index position 0
# indexing starts from zero 
df1.iloc[0]
df1.iloc[1]


a    4.0
b    NaN
c    NaN
Name: 2, dtype: float64

In [1114]:
df1.iloc[:, 1] # column by index position

1    2.0
2    NaN
3    8.0
Name: b, dtype: float64

In [1115]:
# Select data at the specified row and column location
df1.iloc[0, 0]

1

In [1116]:
# Select list of rows and columns by position
df1.iloc[[1, 2], [0, 1]]

Unnamed: 0,a,b
2,4,
3,7,8.0


## slicing

In [1117]:
df1.loc[1:2, :] # get rows - from : to & all columns
df1.loc[:, "b":"c"] # get all row from col 'b' & 'c'
df1.loc[1:3, "b":"c"]  # get rows from : to, of columns from:to
df1

Unnamed: 0,a,b,c
1,1,2.0,3.0
2,4,,
3,7,8.0,9.0


In [1118]:
df1.iloc[0:3, :]  #  Slicing Rows and Columns by position
df1.iloc[:, 0:3]  # slice columns by index position.
df1.iloc[0:2, 1:3]  # slice row and columns by index position.
df1.iloc[:2, :2]  # row index 0 to index 1 (exclusive 2), column zero to one

Unnamed: 0,a,b
1,1,2.0
2,4,


## subsetting by boolean condition

In [1119]:
df1

Unnamed: 0,a,b,c
1,1,2.0,3.0
2,4,,
3,7,8.0,9.0


In [1120]:
# boolean mask: keep only the rows where column 'a' is greater than 4
df1[df1.a > 4]

Unnamed: 0,a,b,c
3,7,8.0,9.0


In [1121]:
# get row where col a has 1 or 7
df1[df1.a.isin([1, 7])]

Unnamed: 0,a,b,c
1,1,2.0,3.0
3,7,8.0,9.0


## rows with multiple conditions

In [1122]:
df1[(df1.a == 7) | (df1.c == 9)]

Unnamed: 0,a,b,c
3,7,8.0,9.0


## conditional exclude 

In [1123]:
# show only row that have no 8 or 13 in column 'b'
df1[~df1.b.isin([8,13])] 

Unnamed: 0,a,b,c
1,1,2.0,3.0
2,4,,


## filter columns

In [1124]:
df1.filter(items=["a","c"])

Unnamed: 0,a,c
1,1,3.0
2,4,
3,7,9.0


## conditional rows

In [1125]:
df1[(df1["a"] > 2) & (df1["c"] <=10)]  # Rows where two condition hold
df1[df1['a'] > 5]  # values > x in col a

Unnamed: 0,a,b,c
3,7,8.0,9.0


In [1126]:
is_two = df1.b == 2  # where has col b twos
is_two.value_counts()  # how many twos are there?
df1[is_two]  # show row where 'b' == 2 = True

Unnamed: 0,a,b,c
1,1,2.0,3.0


## conditional columns

In [1127]:
# select columns whose rows contain the specified value
df1.loc[:, df1.isin([1, 9]).any()]


Unnamed: 0,a,c
1,1,3.0
2,4,
3,7,9.0


## rename columns

In [1128]:
df1.columns = ['x','y','z']               # Rename columns
df1

Unnamed: 0,x,y,z
1,1,2.0,3.0
2,4,,
3,7,8.0,9.0


## isnull / notnull

In [1129]:
df1.isnull()                                # flags null values; returns a boolean DataFrame of the same shape

Unnamed: 0,x,y,z
1,False,False,False
2,False,True,True
3,False,False,False


In [1130]:
df1.notnull()                                   # Opposite of pd.isnull()

Unnamed: 0,x,y,z
1,True,True,True
2,True,False,False
3,True,True,True


In [1131]:
df1['y'].isnull().values.any() # any null values in column 'y'?
df1['y'].isnull().sum() # number of null values in 'y'
df1.isnull().sum()  # null values per column
df1.isnull().values.any() # null values in df
df1.isnull().sum().sum() # number of null values in df


2

## dropna

In [1158]:
# Show the documentation of DataFrame.dropna.
# The original expression `__pd.dropna__.doc()` is not valid: the docstring
# lives in the `__doc__` attribute of the method itself.
print(pd.DataFrame.dropna.__doc__)

In [1132]:
df1.dropna()                    # returns a new frame without the rows containing nulls; df1 is not modified
print(df1)                      # still shows the row with NaN, because the result above was discarded
df1.dropna(inplace=True)        # inplace=True removes the rows from df1 itself
df1

   x    y    z
1  1  2.0  3.0
2  4  NaN  NaN
3  7  8.0  9.0


Unnamed: 0,x,y,z
1,1,2.0,3.0
3,7,8.0,9.0


## add rows

In [1133]:
# NOTE: assigning to .loc[label] only appends when the label is new. At this
# point df1 has two rows (labels 1 and 3), so len(df1)+1 == 3 already exists
# and the first line overwrites row 3 instead of adding a row (see the output).
df1.loc[len(df1)+1] = [4, np.nan, 9] # set the row labelled len(df1)+1
df1.loc[0] = [np.nan, 17, 9] # add row with index 0
df1.loc[df1.index.max()+1] = [0, 1, 2] # add row at the end/index last+1
df1.loc[df1.index.min()-1] = [50, 88, 66] # add row with index first -1
df1.loc[1.5] = [0, 100, 0] # add row between 1 and 2
df1.sort_index(inplace=True) # fix index order, otherwise rows are scattered and indices are not in order (e.g. 1.5 at the end)
df1

Unnamed: 0,x,y,z
-1.0,50.0,88.0,66.0
0.0,,17.0,9.0
1.0,1.0,2.0,3.0
1.5,0.0,100.0,0.0
3.0,4.0,,9.0
4.0,0.0,1.0,2.0


## drop rows

In [1134]:
df1.drop(-1, inplace=True)  # drop row by index 
df1

Unnamed: 0,x,y,z
0.0,,17.0,9.0
1.0,1.0,2.0,3.0
1.5,0.0,100.0,0.0
3.0,4.0,,9.0
4.0,0.0,1.0,2.0


In [1135]:
df1.drop([0, 3],  inplace=True) # drop a few rows by index
df1

Unnamed: 0,x,y,z
1.0,1.0,2.0,3.0
1.5,0.0,100.0,0.0
4.0,0.0,1.0,2.0


In [1136]:
df1

Unnamed: 0,x,y,z
1.0,1.0,2.0,3.0
1.5,0.0,100.0,0.0
4.0,0.0,1.0,2.0


In [1137]:
print(df1.index[0])
df1.drop(df1.index[0], inplace=True) # drop row by position
df1

1.0


Unnamed: 0,x,y,z
1.5,0.0,100.0,0.0
4.0,0.0,1.0,2.0


## concat() row

In [1138]:
row = pd.DataFrame([{'x': 1, 'y':2, 'z':3}])
df1 = pd.concat([df1, row], axis=0, ignore_index=True)
df1

Unnamed: 0,x,y,z
0,0.0,100.0,0.0
1,0.0,1.0,2.0
2,1.0,2.0,3.0


In [1139]:
df1.loc[df1.index.max()+1] = [0, np.nan, 2] # add row at the end/index last+1
df1.loc[df1.index.max()+1] = [np.nan, 19, 2] # add row at the end/index last+1
df1.loc[df1.index.max()+1] = [0, 1, np.nan] # add row at the end/index last+1
df1

Unnamed: 0,x,y,z
0,0.0,100.0,0.0
1,0.0,1.0,2.0
2,1.0,2.0,3.0
3,0.0,,2.0
4,,19.0,2.0
5,0.0,1.0,


In [1140]:
df1.drop(df1.index.max(), inplace=True) #drop row with highest index
df1.drop(df1.index.min(), inplace=True) # drop row with smallest index
df1

Unnamed: 0,x,y,z
1,0.0,1.0,2.0
2,1.0,2.0,3.0
3,0.0,,2.0
4,,19.0,2.0


## dropna

In [1141]:
df1.dropna(subset='y')      # drop row with NA in a specific column

Unnamed: 0,x,y,z
1,0.0,1.0,2.0
2,1.0,2.0,3.0
4,,19.0,2.0


In [1142]:
df1.loc[df1.index.max()+1] = [0, np.nan, 2] # add row at the end/index last+1
df1.loc[df1.index.max()+1] = [90, 19, 2] # add row at the end/index last+1
df1.loc[df1.index.max()+1] = [0, 1, 56] # add row at the end/index last+1
df1.loc[df1.index.max() + 1] = [100, 1, 56]  # add row at the end/index last+1
df1

Unnamed: 0,x,y,z
1,0.0,1.0,2.0
2,1.0,2.0,3.0
3,0.0,,2.0
4,,19.0,2.0
5,0.0,,2.0
6,90.0,19.0,2.0
7,0.0,1.0,56.0
8,100.0,1.0,56.0


In [1143]:
# Drop all rows that contain null values
# axis=1 would instead drop all columns containing NA
df1.dropna(axis=0, inplace=True)
df1

Unnamed: 0,x,y,z
1,0.0,1.0,2.0
2,1.0,2.0,3.0
6,90.0,19.0,2.0
7,0.0,1.0,56.0
8,100.0,1.0,56.0


## drop_duplicates()

In [1144]:
df11= pd.DataFrame( [[90, 19, 2], [90, 19, 56], [100, 10, 56]], index=[8,9,10], columns=['x','y','z'])
df11.loc[8.5] = [90, 19, 2] # insert row with index between 8 and 9
df11 = df11.sort_index().reset_index(drop=True) # sort index, otherwise the index will be 8.5 and the rows are all over the place
df11

Unnamed: 0,x,y,z
0,90,19,2
1,90,19,2
2,90,19,56
3,100,10,56


In [1145]:
df11.drop_duplicates() # drop identical rows

Unnamed: 0,x,y,z
0,90,19,2
2,90,19,56
3,100,10,56


In [1146]:
df11.drop_duplicates('z') # drop rows with same values in one column

Unnamed: 0,x,y,z
0,90,19,2
2,90,19,56


## pop

In [1147]:
df2

Unnamed: 0,a,b,c
4,1,2,3
5,4,5,6
6,7,8,9


In [1148]:
# pop returns the column and removes it from the frame,
# so the returned Series can be stored somewhere else, e.g. as a new column of df2.
# The assignment aligns on the index: only label 6 exists in both frames,
# so the other rows of 'd_z' become NaN.
df2['d_z'] = df1.pop('z') 
df2

Unnamed: 0,a,b,c,d_z
4,1,2,3,
5,4,5,6,
6,7,8,9,2.0


In [1149]:
df2.pop('d_z')  # Return item and drop from frame.

4    NaN
5    NaN
6    2.0
Name: d_z, dtype: float64

In [1150]:
df2

Unnamed: 0,a,b,c
4,1,2,3
5,4,5,6
6,7,8,9


## add columns

In [1151]:
df1['s'] = '' # add a column of empty strings to the df
df1['ones'] = 1 # add a column filled with the constant 1
df1['no'] = None # new col filled with None
df1['na'] = np.nan # NA col
# a list must supply exactly one value per row; otherwise pandas raises the ValueError shown below
df1['Country'] = ['Canada', 'USA', 'Germany']

ValueError: Length of values (3) does not match length of index (5)

In [None]:
df1

Unnamed: 0,x,y,s,ones,no,na,Country
8,90.0,19.0,,1,,,Canada
9,0.0,1.0,,1,,,USA
10,100.0,1.0,,1,,,Germany


In [None]:
df1.loc[df1.index.max()+1] = [0, np.nan, 2, 0, 4,1, 'Bosnia'] 
df1.loc[df1.index.max()+1] = [90, None, 2, 0, 4,6, 'Siberia']
df1.loc[df1.index.max()+1] = ['', 0, 1, 0, 56, 0, 'Croatia']
df1

Unnamed: 0,x,y,s,ones,no,na,Country
8,90.0,19.0,,1,,,Canada
9,0.0,1.0,,1,,,USA
10,100.0,1.0,,1,,,Germany
11,0.0,,2.0,0,4.0,1.0,Bosnia
12,90.0,,2.0,0,4.0,6.0,Siberia
13,,0.0,1.0,0,56.0,0.0,Croatia


## insert() - new column on specific position

In [None]:
df1.insert(loc=2, column='x*y', value=df1.x*df1.y)
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,,1,,,Canada
9,0.0,1.0,0.0,,1,,,USA
10,100.0,1.0,100.0,,1,,,Germany
11,0.0,,,2.0,0,4.0,1.0,Bosnia
12,90.0,,,2.0,0,4.0,6.0,Siberia
13,,0.0,,1.0,0,56.0,0.0,Croatia


## apply function to column 

In [1152]:
def squared(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)


# apply() takes the function object itself; wrapping it in `lambda x: squared(x)` is redundant
df1['y^2'] = df1.y.apply(squared) # inserts column at the end
df1.insert(loc=1, column='x^2', value=df1.x.apply(squared)) # inserts column at position 1
df1.insert(loc=2, column='x^4', value=df1.x.apply(lambda x: x**4)) # short function can be written in the apply
df1

Unnamed: 0,x,x^2,y,s,ones,no,na,y^2
1,0.0,0.0,1.0,,1,,,1.0
2,1.0,1.0,2.0,,1,,,4.0
6,90.0,8100.0,19.0,,1,,,361.0
7,0.0,0.0,1.0,,1,,,1.0
8,100.0,10000.0,1.0,,1,,,1.0


In [852]:
df1.at[11, 'x*y'] = 4  # Access a single value for a row/column label pair
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,,1,,,Canada
9,0.0,1.0,0.0,,1,,,USA
10,100.0,1.0,100.0,,1,,,Germany
11,0.0,,4.0,2.0,0,4.0,1.0,Bosnia
12,90.0,,,2.0,0,4.0,6.0,Siberia
13,,0.0,,1.0,0,56.0,0.0,Croatia


## fillna()

In [854]:
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,,1,,,Canada
9,0.0,1.0,0.0,,1,,,USA
10,100.0,1.0,100.0,,1,,,Germany
11,0.0,,4.0,2.0,0,4.0,1.0,Bosnia
12,90.0,,,2.0,0,4.0,6.0,Siberia
13,,0.0,,1.0,0,56.0,0.0,Croatia


In [855]:
# replace null values in one column with some value
df1['y'] = df1['y'].fillna(5)
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,,1,,,Canada
9,0.0,1.0,0.0,,1,,,USA
10,100.0,1.0,100.0,,1,,,Germany
11,0.0,5.0,4.0,2.0,0,4.0,1.0,Bosnia
12,90.0,5.0,,2.0,0,4.0,6.0,Siberia
13,,0.0,,1.0,0,56.0,0.0,Croatia


In [860]:
# replace null in two column with their respective mean
df1[['no', 'na']] = df1[['no', 'na']].fillna(df1[['no', 'na']].mean().round(0))
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,,1,21.0,2.0,Canada
9,0.0,1.0,0.0,,1,21.0,2.0,USA
10,100.0,1.0,100.0,,1,21.0,2.0,Germany
11,0.0,5.0,4.0,2.0,0,4.0,1.0,Bosnia
12,90.0,5.0,,2.0,0,4.0,6.0,Siberia
13,,0.0,,1.0,0,56.0,0.0,Croatia


## replace()

In [861]:
# replace values - create a mapping in a dict
df1.replace({'': 0, 1:7}, inplace=True)
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,0,7,21.0,2.0,Canada
9,0.0,7.0,0.0,0,7,21.0,2.0,USA
10,100.0,7.0,100.0,0,7,21.0,2.0,Germany
11,0.0,5.0,4.0,2,0,4.0,7.0,Bosnia
12,90.0,5.0,,2,0,4.0,6.0,Siberia
13,0.0,0.0,0.0,7,0,56.0,0.0,Croatia


In [863]:
# per-column lookup: replace 0 in 'x', 7 in 'y' and 0 in 'x*y' with the single value 12
df1.replace({'x': 0, 'y':7, 'x*y':0}, 12, inplace=True)
df1


Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,0,7,21.0,2.0,Canada
9,12.0,12.0,12.0,0,7,21.0,2.0,USA
10,100.0,12.0,100.0,0,7,21.0,2.0,Germany
11,12.0,5.0,4.0,2,0,4.0,7.0,Bosnia
12,90.0,5.0,,2,0,4.0,6.0,Siberia
13,12.0,0.0,12.0,7,0,56.0,0.0,Croatia


In [864]:
# replace values separately within one column (nested dict: column -> {old: new})
df1.replace({'s' : {0: 'Dampf', 2: 'Steam'}}, inplace=True)
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7,21.0,2.0,Canada
9,12.0,12.0,12.0,Dampf,7,21.0,2.0,USA
10,100.0,12.0,100.0,Dampf,7,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,0,4.0,7.0,Bosnia
12,90.0,5.0,,Steam,0,4.0,6.0,Siberia
13,12.0,0.0,12.0,7,0,56.0,0.0,Croatia


In [865]:
df1.replace(0, np.nan, inplace=True)
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,Dampf,7.0,21.0,2.0,USA
10,100.0,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,,4.0,7.0,Bosnia
12,90.0,5.0,,Steam,,4.0,6.0,Siberia
13,12.0,,12.0,7,,56.0,,Croatia


## ffill()

In [866]:
# NA are filled forward with the preceding values
df1.ffill(axis='rows')

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,Dampf,7.0,21.0,2.0,USA
10,100.0,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,7.0,4.0,7.0,Bosnia
12,90.0,5.0,4.0,Steam,7.0,4.0,6.0,Siberia
13,12.0,5.0,12.0,7,7.0,56.0,6.0,Croatia


In [867]:
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,Dampf,7.0,21.0,2.0,USA
10,100.0,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,,4.0,7.0,Bosnia
12,90.0,5.0,,Steam,,4.0,6.0,Siberia
13,12.0,,12.0,7,,56.0,,Croatia


In [868]:
# NaNs are filled column-wise from left to right: each gap takes the value of the column to its left
df1.ffill(axis='columns')

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,Dampf,7.0,21.0,2.0,USA
10,100.0,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,Steam,4.0,7.0,Bosnia
12,90.0,5.0,5.0,Steam,Steam,4.0,6.0,Siberia
13,12.0,12.0,12.0,7,7,56.0,56.0,Croatia


## bfill()

In [871]:
df1.at[10, 'x'] = np.nan
df1.at[11, 'no'] = np.nan
df1.at[9, 's'] = np.nan
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,,7.0,21.0,2.0,USA
10,,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,,,7.0,Bosnia
12,90.0,5.0,,Steam,,4.0,6.0,Siberia
13,12.0,,12.0,7,,56.0,,Croatia


In [872]:
# NaNs are filled backward: each gap takes the next valid value below it in the same column
df1.bfill(axis=0)

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,Dampf,7.0,21.0,2.0,USA
10,12.0,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,,4.0,7.0,Bosnia
12,90.0,5.0,12.0,Steam,,4.0,6.0,Siberia
13,12.0,,12.0,7,,56.0,,Croatia


In [873]:
df1

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,,7.0,21.0,2.0,USA
10,,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,,,7.0,Bosnia
12,90.0,5.0,,Steam,,4.0,6.0,Siberia
13,12.0,,12.0,7,,56.0,,Croatia


In [874]:
df1.bfill(axis=1) # NA's are filled columnwise right to left

Unnamed: 0,x,y,x*y,s,ones,no,na,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,7.0,7.0,21.0,2.0,USA
10,12.0,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,7.0,7.0,7.0,Bosnia
12,90.0,5.0,Steam,Steam,4.0,4.0,6.0,Siberia
13,12.0,12.0,12.0,7,56.0,56.0,Croatia,Croatia


## rename

In [875]:
df1.columns

Index(['x', 'y', 'x*y', 's', 'ones', 'no', 'na', 'Country'], dtype='object')

In [877]:
df1.rename(columns={
    'x': 'A',
    'y': 'B',
    'x*y':'AB',
    's': 'C',
    'ones': 'D',
    'no': 'E',
    'na': 'F' 
    # column not adressed stays the same
},
           inplace=True)
           
df1

Unnamed: 0,A,B,AB,C,D,E,F,Country
8,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
9,12.0,12.0,12.0,,7.0,21.0,2.0,USA
10,,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
11,12.0,5.0,4.0,Steam,,,7.0,Bosnia
12,90.0,5.0,,Steam,,4.0,6.0,Siberia
13,12.0,,12.0,7,,56.0,,Croatia


In [879]:
# mass renaming of index, works also columnwise
df1.rename(index=lambda x: x + 10, inplace=True)
df1

Unnamed: 0,A,B,AB,C,D,E,F,Country
18,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
19,12.0,12.0,12.0,,7.0,21.0,2.0,USA
20,,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
21,12.0,5.0,4.0,Steam,,,7.0,Bosnia
22,90.0,5.0,,Steam,,4.0,6.0,Siberia
23,12.0,,12.0,7,,56.0,,Croatia


## reset_index()

In [880]:
# use the drop parameter to avoid the old index being added as a column
df1.reset_index(drop=True, inplace=True)
df1

Unnamed: 0,A,B,AB,C,D,E,F,Country
0,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0,Canada
1,12.0,12.0,12.0,,7.0,21.0,2.0,USA
2,,12.0,100.0,Dampf,7.0,21.0,2.0,Germany
3,12.0,5.0,4.0,Steam,,,7.0,Bosnia
4,90.0,5.0,,Steam,,4.0,6.0,Siberia
5,12.0,,12.0,7,,56.0,,Croatia


## swap columns

In [881]:
df1.loc[:, ['F', 'A']] = df1[['A', 'F']].to_numpy()
df1

Unnamed: 0,A,B,AB,C,D,E,F,Country
0,2.0,19.0,1710.0,Dampf,7.0,21.0,90.0,Canada
1,2.0,12.0,12.0,,7.0,21.0,12.0,USA
2,2.0,12.0,100.0,Dampf,7.0,21.0,,Germany
3,7.0,5.0,4.0,Steam,,,12.0,Bosnia
4,6.0,5.0,,Steam,,4.0,90.0,Siberia
5,,,12.0,7,,56.0,12.0,Croatia


In [883]:
# fix column names
df1.columns = ['F', 'B', 'AB', 'C', 'D', 'E', 'A', 'Country']
df1

Unnamed: 0,F,B,AB,C,D,E,A,Country
0,2.0,19.0,1710.0,Dampf,7.0,21.0,90.0,Canada
1,2.0,12.0,12.0,,7.0,21.0,12.0,USA
2,2.0,12.0,100.0,Dampf,7.0,21.0,,Germany
3,7.0,5.0,4.0,Steam,,,12.0,Bosnia
4,6.0,5.0,,Steam,,4.0,90.0,Siberia
5,,,12.0,7,,56.0,12.0,Croatia


## rearrange columns

In [884]:
df1 = df1[['Country' ,'A' ,'B' ,'AB' ,'C' ,'D' ,'E' ,'F']]
df1

Unnamed: 0,Country,A,B,AB,C,D,E,F
0,Canada,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0
1,USA,12.0,12.0,12.0,,7.0,21.0,2.0
2,Germany,,12.0,100.0,Dampf,7.0,21.0,2.0
3,Bosnia,12.0,5.0,4.0,Steam,,,7.0
4,Siberia,90.0,5.0,,Steam,,4.0,6.0
5,Croatia,12.0,,12.0,7,,56.0,


## sort_values

In [885]:
# sorting row, with possibility to sort by multiple columns and descending
df1.sort_values(by=['A', 'B'], ascending=False, na_position='last', inplace=True)
df1.reset_index(drop=True) # if you don't like the mixed up index

Unnamed: 0,Country,A,B,AB,C,D,E,F
0,Canada,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0
1,Siberia,90.0,5.0,,Steam,,4.0,6.0
2,USA,12.0,12.0,12.0,,7.0,21.0,2.0
3,Bosnia,12.0,5.0,4.0,Steam,,,7.0
4,Croatia,12.0,,12.0,7,,56.0,
5,Germany,,12.0,100.0,Dampf,7.0,21.0,2.0


## set_index()

In [886]:
df1 = df1.set_index('Country', drop=True)
df1

Unnamed: 0_level_0,A,B,AB,C,D,E,F
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Canada,90.0,19.0,1710.0,Dampf,7.0,21.0,2.0
Siberia,90.0,5.0,,Steam,,4.0,6.0
USA,12.0,12.0,12.0,,7.0,21.0,2.0
Bosnia,12.0,5.0,4.0,Steam,,,7.0
Croatia,12.0,,12.0,7,,56.0,
Germany,,12.0,100.0,Dampf,7.0,21.0,2.0


# Operations on columns

In [888]:
df1['A'] = df1['A'] / 2
df1['A-C'] = df1['A'] - df1['E']
df1['G'] = ((df1['A-C'] / df1['E']) + df1['B']).round(1)
df1

Unnamed: 0_level_0,A,B,AB,C,D,E,F,A-C,G
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Canada,22.5,19.0,1710.0,Dampf,7.0,21.0,2.0,1.5,19.1
Siberia,22.5,5.0,,Steam,,4.0,6.0,18.5,9.6
USA,3.0,12.0,12.0,,7.0,21.0,2.0,-18.0,11.1
Bosnia,3.0,5.0,4.0,Steam,,,7.0,,
Croatia,3.0,,12.0,7,,56.0,,-53.0,
Germany,,12.0,100.0,Dampf,7.0,21.0,2.0,,


## If / Then
- In Excel this is done with a conditional formula: =IF(A2 < 10, "low", "high")
- np.where(condition, [x, y, ]/) - Return elements chosen from x or y depending on condition.

In [890]:
df1['E_level'] = np.where(df1['E']>20, 'high', 'low')
df1

Unnamed: 0_level_0,A,B,AB,C,D,E,F,A-C,G,E_level
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Canada,22.5,19.0,1710.0,Dampf,7.0,21.0,2.0,1.5,19.1,high
Siberia,22.5,5.0,,Steam,,4.0,6.0,18.5,9.6,low
USA,3.0,12.0,12.0,,7.0,21.0,2.0,-18.0,11.1,high
Bosnia,3.0,5.0,4.0,Steam,,,7.0,,,low
Croatia,3.0,,12.0,7,,56.0,,-53.0,,high
Germany,,12.0,100.0,Dampf,7.0,21.0,2.0,,,high


# Strings

In [1050]:
df1.C.str.len() # length of strings in column C
df1.C.str.rstrip().str.len()  # use rstrip() to exclude trailing blanks
df1.C.str.find( 'ea' )  # returns the position of the substring (0-based, like Python's str.find); -1 if not found
df1.C.str[1:4]  # extracting substring by position
# NOTE: 'Canda' is not an existing index label, so .at adds a new row
# (visible in the output below) -- presumably 'Canada' was intended
df1.at['Canda', 'C'] = 'Dampf auf dem Kessel'
df1.at['USA', 'C'] = 'Too much is less than nothing'
df1.C.str.split(" ", expand=True)[3]  # extract a word by position: column 3 holds the fourth word
df1.C.str.upper()
df1.C.str.title()
df1.C.str.lower()
df1.C.str.replace(' ', '_')  # returns a new Series; column C itself is unchanged
df1.C.str.split('_')  # C still contains blanks rather than '_', so nothing is split here

Country
Canada                             [Dampf]
Siberia                            [Steam]
USA        [Too much is less than nothing]
Bosnia                             [Steam]
Croatia                                NaN
Germany                            [Dampf]
Canda               [Dampf auf dem Kessel]
Name: C, dtype: object

# read data
- **pd.read_csv(filename)**                              -- From a CSV file
- **pd.read_table(filename)**                          -- From a delimited text file (like TSV)
- **pd.read_excel(filename)**                          -- From an Excel file
- **pd.read_sql(query, connection_object)**        -- Read from a SQL table/database
- **pd.read_json(json_string)**                        -- Read from a JSON formatted string, URL or file.
- **pd.read_html(url)**                                      -- Parses an html URL, string or file and extracts tables to a list of dataframes
- **pd.read_clipboard()**                                  -- Takes the contents of your clipboard and passes it to read_csv()
- **pd.DataFrame(dict)**                                   -- From a dict, keys for columns names, values for data as lists

# Exporting Data
Use these commands to export a DataFrame to CSV, .xlsx, SQL, or JSON.
- **df.to_csv(filename)**                            --  Write to a CSV file
- **df.to_excel(filename)**                          --  Write to an Excel file
- **df.to_sql(table_name, connection_object)**       --  Write to a SQL table
- **df.to_json(filename)**                           --  Write to a file in JSON format

# Showing Data
- option_context() - show only within the context manager scope
- set_option() - Permanently changes the pandas settings
- pd.reset_option('all') - resets all the changes.

In [None]:
# By default display.max_rows is 60; longer frames are printed in truncated form.
# Set it to None to display all rows in the dataframe
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# All dataframes hereafter reflect these changes.
# NOTE: `who` is loaded by the pd.read_csv cell in "Inspecting Data" below; run that cell first.
who

In [None]:
# Resets the options
pd.reset_option('all')

# Inspecting Data
Use these commands to take a look at specific sections of your pandas DataFrame or Series.
- **df.head(3)**                        -- First n rows of the DataFrame
- **df.tail(3)**                       -- Last n rows of the DataFrame
- **df.shape**                           -- Number of rows and columns
- **df.info()**                         -- Index, Datatype and Memory information
- **df.describe()**                      -- Summary statistics for numerical columns
- **df.value_counts(dropna=False)**       -- View unique values and counts
- **df.apply(pd.Series.value_counts)**   -- Unique values and counts for all columns

In [938]:
who = pd.read_csv('data/WHO.csv')
who.shape  # dimensionality of the DataFrame

(202, 358)

In [1156]:
who.index

RangeIndex(start=0, stop=202, step=1)

In [1026]:
selection = who[[
    'Country', 'Continent', 'Total_CO2_emissions', 'Income_per_person', 'Population_total',
    'Life expectancy at birth (years) both sexes',
    'Personal_computers_per_100_people', 'Patent_applications',
    'Military_expenditure', 'Inequality_index'
]]

In [1027]:
# Use the country names as row labels, then order from highest to lowest income.
selection = (
    selection
    .set_index('Country')
    .sort_values(by='Income_per_person', ascending=False)
)

## show overview

In [None]:
selection[::10] # show every 10th row
selection[::-20] # show every 20th row, starting from the bottom

## filtering -- boolean indexing

In [1011]:
selection[selection.Inequality_index>60]

Unnamed: 0_level_0,Total_CO2_emissions,Income_per_person,Population_total,Life expectancy at birth (years) both sexes,Personal_computers_per_100_people,Patent_applications,Military_expenditure,Inequality_index
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Botswana,4554.35,12057.0,1640115.0,52.0,4.7,10.0,3.0,60.51
Namibia,2553.81,4547.0,2030692.0,61.0,10.9,133.0,3.01,74.33
Bolivia,9251.6,3618.0,8857870.0,66.0,2.3,123.0,1.62,60.05
Lesotho,,1415.0,2031348.0,42.0,,177309.0,2.36,63.2
Central African Republic,252.82,675.0,,48.0,0.3,,1.12,61.33


# Statistics

In [None]:
# summary statistics computed separately for each column (rounded)
who.describe().round()

Unnamed: 0,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total,Population annual growth rate (%),Population in urban areas (%),...,Total_CO2_emissions,Total_income,Total_reserves,Trade_balance_goods_and_services,Under_five_mortality_from_CME,Under_five_mortality_from_IHME,Under_five_mortality_rate,Urban_population,Urban_population_growth,Urban_population_pct_of_total
count,202.0,202.0,177.0,131.0,178.0,179.0,179.0,193.0,193.0,193.0,...,186.0,178.0,128.0,171.0,181.0,170.0,181.0,188.0,188.0,188.0
mean,102.0,4.0,59.0,79.0,11250.0,84.0,86.0,34098.0,1.0,55.0,...,148360.0,201556700000.0,57.0,342401200.0,57.0,54.0,57.0,16657627.0,2.0,55.0
std,58.0,2.0,49.0,20.0,12587.0,18.0,15.0,130496.0,1.0,24.0,...,613309.0,940068900000.0,139.0,59430430000.0,60.0,61.0,60.0,50948666.0,2.0,24.0
min,1.0,1.0,0.0,24.0,260.0,6.0,11.0,2.0,-2.0,10.0,...,26.0,51900000.0,1.0,-714000000000.0,3.0,3.0,3.0,15456.0,-1.0,10.0
25%,51.0,2.0,19.0,68.0,2112.0,79.0,80.0,1340.0,0.0,36.0,...,1673.0,3317500000.0,16.0,-1210000000.0,12.0,8.0,12.0,917162.0,1.0,36.0
50%,102.0,3.0,46.0,86.0,6175.0,90.0,90.0,6762.0,1.0,57.0,...,10212.0,11450000000.0,29.0,-224000000.0,30.0,28.0,30.0,3427661.0,2.0,57.0
75%,152.0,5.0,91.0,95.0,14502.0,96.0,96.0,21732.0,2.0,73.0,...,65492.0,86800000000.0,55.0,1024000000.0,89.0,83.0,89.0,9837113.0,3.0,73.0
max,202.0,7.0,199.0,100.0,60870.0,100.0,100.0,1328474.0,4.0,100.0,...,5776432.0,11000000000000.0,1335.0,139000000000.0,267.0,254.0,267.0,527000000.0,8.0,100.0


In [1048]:
selection.loc[:,selection.columns !='Country'].min() # min of each column, exclude string columns
selection.mean().round() # mean of each column
selection.count()  # number of non-null values in each column
selection.median()  # median in each column
selection.std()  # std. each column


Continent                                      1.808263e+00
Total_CO2_emissions                            6.133091e+05
Income_per_person                              1.379571e+04
Population_total                               1.316014e+08
Life expectancy at birth (years) both sexes    1.081280e+01
Personal_computers_per_100_people              2.011835e+01
Patent_applications                            8.925360e+04
Military_expenditure                           2.544794e+00
Inequality_index                               9.436038e+00
dtype: float64

In [1037]:
# .corr() -- returns the pairwise correlation between columns
selection.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,Continent,Total_CO2_emissions,Income_per_person,Population_total,Life expectancy at birth (years) both sexes,Personal_computers_per_100_people,Patent_applications,Military_expenditure,Inequality_index
Continent,1.0,0.117823,-0.144025,0.254264,-0.001525,-0.1496,-0.26663,-0.192057,0.333504
Total_CO2_emissions,0.117823,1.0,0.136232,0.726063,0.152386,0.215314,0.345873,0.037848,-0.022499
Income_per_person,-0.144025,0.136232,1.0,-0.066067,0.640906,0.758896,0.406131,-0.032544,-0.427825
Population_total,0.254264,0.726063,-0.066067,1.0,0.031341,-0.037336,0.131642,-0.006375,0.004683
Life expectancy at birth (years) both sexes,-0.001525,0.152386,0.640906,0.031341,1.0,0.61878,0.21119,-0.065709,-0.392824
Personal_computers_per_100_people,-0.1496,0.215314,0.758896,-0.037336,0.61878,1.0,0.522174,-0.026727,-0.426863
Patent_applications,-0.26663,0.345873,0.406131,0.131642,0.21119,0.522174,1.0,-0.109318,-0.464967
Military_expenditure,-0.192057,0.037848,-0.032544,-0.006375,-0.065709,-0.026727,-0.109318,1.0,-0.030972
Inequality_index,0.333504,-0.022499,-0.427825,0.004683,-0.392824,-0.426863,-0.464967,-0.030972,1.0


In [1031]:
selection.groupby('Continent').mean()

Unnamed: 0_level_0,Total_CO2_emissions,Income_per_person,Population_total,Life expectancy at birth (years) both sexes,Personal_computers_per_100_people,Patent_applications,Military_expenditure,Inequality_index
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,81536.911905,15227.095238,9900521.0,69.75,12.956,32930.307692,4.650556,39.79
2,142921.648043,19384.76087,16378520.0,74.882353,30.0625,154830.413043,1.765435,32.825455
3,17422.001087,2922.625833,16554070.0,53.125,2.597073,91299.625,2.544673,46.030313
4,962841.088571,30203.255714,72535490.0,75.833333,39.07,119807.333333,1.424,39.806667
5,32523.007,8224.390323,15843170.0,71.655172,8.543704,32415.711538,1.318421,50.893182
6,135223.585556,14636.055862,30505190.0,69.733333,18.93875,77726.388889,1.706471,38.870714
7,793353.285556,2623.0,305553500.0,66.0,2.533333,60731.5,2.358571,39.59


# merge
In Excel, merging of tables can be done through a VLOOKUP.<br>
Pandas DataFrames have a merge() method, which provides similar functionality. <br>
The data does not have to be sorted ahead of time, and different join types are <br>
accomplished via the **how** keyword.<br>

Merge has a number of advantages over VLOOKUP:

- The lookup value doesn’t need to be the first column of the lookup table
- If multiple rows are matched, there will be one row for each match, instead of just the first
- It will include all columns from the lookup table, instead of just a single specified column
- It supports more complex join operations


In [1051]:
# Two small tables sharing a "key" column; the values are random,
# so they differ on every run.
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})
df1  # display the left table (the cell previously ended on an assignment and showed nothing)

Unnamed: 0,key,value
0,A,-0.519436
1,B,0.205313
2,C,0.245516
3,D,-0.33816


In [1053]:
df2

Unnamed: 0,key,value
0,B,2.224028
1,D,-0.907464
2,D,0.803461
3,E,-0.084423


In [1052]:
inner_join = df1.merge(df2, on=["key"], how="inner") # only where both match
inner_join

Unnamed: 0,key,value_x,value_y
0,B,0.205313,2.224028
1,D,-0.33816,-0.907464
2,D,-0.33816,0.803461


In [1054]:
left_join = df1.merge(df2, on=["key"], how="left") # all rows from the left table
left_join

Unnamed: 0,key,value_x,value_y
0,A,-0.519436,
1,B,0.205313,2.224028
2,C,0.245516,
3,D,-0.33816,-0.907464
4,D,-0.33816,0.803461


In [1055]:
right_join = df1.merge(df2, on=["key"], how="right") # all from the right
right_join

Unnamed: 0,key,value_x,value_y
0,B,0.205313,2.224028
1,D,-0.33816,-0.907464
2,D,-0.33816,0.803461
3,E,,-0.084423


In [1056]:
outer_join = df1.merge(df2, on=["key"], how="outer") # all rows from both tables
outer_join

Unnamed: 0,key,value_x,value_y
0,A,-0.519436,
1,B,0.205313,2.224028
2,C,0.245516,
3,D,-0.33816,-0.907464
4,D,-0.33816,0.803461
5,E,,-0.084423


# fill handling
Create a series of numbers following a set pattern in a certain set of cells. <br>
In a spreadsheet, this would be done by shift+drag after entering the first number <br>
or by entering the first two or three values and then dragging.<br>

This can be achieved by creating a series and assigning it to the desired cells.

In [1058]:
df = pd.DataFrame({"AAA": [1] * 8, "BBB": list(range(0, 8))})
df

Unnamed: 0,AAA,BBB
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
5,1,5
6,1,6
7,1,7


In [1062]:
series = list(range(11, 15))
df.loc[2:5, "AAA"] = series
df

Unnamed: 0,AAA,BBB
0,1,0
1,1,1
2,11,2
3,12,3
4,13,4
5,14,5
6,1,6
7,1,7


# pivot tables

!['pivot'](data/images/reshaping_pivot.png)

In [None]:
# function for grouping countries by inequality index
def inequality_groups(x):
    """Return the 20-point inequality band that ``x`` falls into.

    Parameters
    ----------
    x : number
        Inequality index value.

    Returns
    -------
    str or float
        One of '0-20', '20-40', '40-60', '60-80', '80-100'. Every value up to
        and including 20 maps to '0-20', and each upper bound is inclusive.
        ``np.nan`` is returned for NaN input and for values above 100.
    """
    if x <= 20:
        return '0-20'
    elif x <= 40:
        return '20-40'
    elif x <= 60:
        return '40-60'
    elif x <= 80:
        return '60-80'
    elif x <= 100:
        return '80-100'
    # NaN fails every comparison above and ends up here, as do values > 100.
    # The previous `else: np.nan` evaluated the expression without returning
    # it, so the function silently returned None instead of NaN.
    return np.nan

# label every value of the inequality index with its bucket
selection['inequality_group'] = selection['Inequality_index'].apply(inequality_groups)

# rearrange columns
selection = selection[[
    'Continent',
    'Total_CO2_emissions',
    'Income_per_person',
    'Inequality_index',
    'inequality_group',
    'Population_total',
    'Life expectancy at birth (years) both sexes',
    'Personal_computers_per_100_people',
    'Patent_applications',
    'Military_expenditure',
]]

# rename columns (same order as the selection above)
selection.columns = [
    'Continent_number',
    'Total_CO2_emissions',
    'Income_pc',
    'Inequality_index',
    'inequality_group',
    'pop_total',
    'life_expectancy',
    'computer_per_100_people',
    'Patent_applications',
    'Military_expenditure',
]


# dict mapping continent numbers to actual names
Continents = {
    1: "Eastern Mediterranean",
    2: "Europe",
    3: "Africa",
    4: "Americas",
    5: "South America",
    6: "South-East Asia",
    7: "South & Eastern Asia",
}

# map numbers to names and insert the result as the second column (position 1)
selection.insert(1, 'Continent', selection['Continent_number'].map(Continents))
selection.head(3)

In [1101]:
piv = pd.pivot_table(selection,
               values=["Income_pc"],
               index=["Continent"],
               columns='inequality_group',
               aggfunc=np.average)


Unnamed: 0_level_0,Income_pc,Income_pc,Income_pc
inequality_group,20-40,40-60,60-80
Continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Africa,1585.333333,1641.372632,4673.5
Americas,35078.0,26495.5,
Eastern Mediterranean,8023.571429,2831.0,
Europe,19903.853659,5179.333333,
South & Eastern Asia,1930.0,2526.5,
South America,15352.0,6674.8,3618.0
South-East Asia,14851.75,16695.5,


In [None]:
pd.melt()

# Dates
- In pandas, you need to explicitly convert plain text to datetime objects, <br>
either while reading from a CSV or once in a DataFrame.

In [1155]:
# Six consecutive calendar days starting on 1 January 2013
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

## duplicated

In [434]:
# Checks and clean-up for repeated index / column labels.
df1.index.duplicated()  # boolean array: True where an index label repeats an earlier one
df.loc[~df.index.duplicated(), :] # keep only the first row for each index label (note: this line uses df, not df1)
df1.reindex()  # NOTE(review): called without arguments, so no new labels are applied - confirm what was intended
df1.loc[:, ~df1.columns.duplicated(keep='first')]  # keep only the first column for each repeated column name

## is_unique
df1.index.is_unique


## Selection, filter, sort & group
# Use these commands to select a specific subset of your data.

df.groupby(["A","B"])            # Returns a groupby object keyed on the values of columns A and B
df.groupby("A")["B"].mean()      # Returns the mean of column B, grouped by the values in column A (mean can be replaced with other aggregations such as sum, median, min, max or count)



# Quick reference of top-level pandas helpers. These are name reminders only:
# apart from interval_range the calls are written without arguments.
pd.melt()  # unpivot a DataFrame from wide to long format
pd.cut()  #  separate the array elements into different bins
pd.qcut()  # quantile-based binning: bins hold roughly the same number of values
pd.get_dummies()  # convert categorical values into dummy/indicator columns
pd.factorize( )  #get the numeric representation of an array by identifying distinct values
pd.unique()  #  get unique values of the Series
pd.to_datetime()#   convert the argument to datetime
pd.to_timedelta( )  # convert the argument to timedelta (absolute differences in times, expressed in different units)
pd.date_range() # get a fixed frequency DatetimeIndex
pd.interval_range( start=0, end=5 )  # get a fixed frequency IntervalIndex
pd.Index()  # immutable sequence of labels used for indexing and alignment
pd.crosstab()  #cross-tabulation of two (or more) factors


array([False, False, False, False, False, False])

In [None]:

# Create a pivot table that groups by A and calculates the mean of B and C
# (the "mean" alias uses pandas' own implementation instead of the slower,
# pure-Python statistics.mean)
print(df.pivot_table(index="A", values=["B", "C"], aggfunc="mean"))

# Find the average of every other column for each unique value of A
# (the string alias avoids the warning recent pandas versions raise for np.mean)
print(df.groupby("A").agg("mean"))

print(df)
print(df.apply(np.mean))  # Apply the function np.mean() across each column
print(df.apply(np.max, axis=1))  # Apply the function np.max() across each row

df01 = pd.DataFrame(np.random.randint(0, 11, size=(4, 5)))
df02 = pd.DataFrame(np.random.randint(11, 21, size=(4, 5)))


In [11]:
df1=pd.DataFrame({'a':[3,5,6], 'b':[4,5,7]})
df2=pd.DataFrame([[3,4], [3,5], [3,8]], index=[10, 11, 1], columns=['a', 'b'])
df2

Unnamed: 0,a,b
10,3,4
11,3,5
1,3,8


## concat
- When we concatenate DataFrames we simply add them to each other, <br>
i.e. we stack them either vertically or side by side.

In [48]:
# df1 / df2 are the small a/b frames from the cell above; the output shown below belongs to the last line
pd.concat([df1, df2], axis=0, ignore_index=True)  # stack rows; with ignore_index=True the result is renumbered 0..n-1
pd.concat([df1, df2], axis=0)  # stack the rows of df2 below df1, keeping the original index labels (columns should be identical)
pd.concat([df1, df2], axis=1)  # place side by side, aligned on the index; labels missing from one frame give NaN

Unnamed: 0,a,b,a.1,b.1
0,3.0,4.0,,
1,5.0,5.0,3.0,8.0
2,6.0,7.0,,
10,,,3.0,4.0
11,,,3.0,5.0


##  join 
- Join the columns of another DataFrame on its **index**.
- By default the pandas **join() method joins on the index, not on columns**, <br>
but you can join on a column by first converting that column to the index.
- To join on columns, **the better approach is to use merge()**.
- 'how' can be one of 'left', 'right', 'outer', 'inner'

In [None]:
df1 = pd.DataFrame({'name': ['Paul', 'Sarah', 'Lisa'], 'income': [4000, 5000, 7000]})
df2 = pd.DataFrame([['Sarah', 25], ['Lisa', 39], ['Paul', 28], ['Jean', 30]], columns=['name', 'age'])

In [63]:
df1.set_index('name').join( df2.set_index('name'))

Unnamed: 0_level_0,income,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Paul,4000,28
Sarah,5000,25
Lisa,7000,39


## merge
- The join is done on columns or indexes. 
- If joining columns on columns, the DataFrame indexes will be ignored. 
- Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. 
- When performing a cross merge, no column specifications to merge on are allowed.

In [None]:
merged_df = pd.merge(df,df1) # Merge on common columns
df2 = pd.merge(df, df1, on=['Courses','Fee']) # Use pandas.merge() on multiple columns
df2 = pd.merge(df, df1,  how='left', left_on=['Courses','Fee'], right_on = ['Courses','Fee']) # Use pandas.merge() on multiple columns
merged_df = pd.merge(df, df1, left_on="Courses", right_on="Courses") # Merge Pandas DataFrames using left_on and right_on
merged_df = pd.merge(df, df1, on="Courses") # Set value of on parameter to specify the key value for merge in pandas

In [42]:
df1.merge(df2, on='name' ,how='outer')

Unnamed: 0,name,income,age
0,Paul,4000.0,28
1,Sarah,5000.0,25
2,Lisa,7000.0,39
3,Jean,,30


In [None]:
df.dtypes    # shows the datatypes in the df also df.info, df.columns

Country                            object
CountryID                           int64
Continent                           int64
Adolescent fertility rate (%)     float64
Adult literacy rate (%)           float64
                                   ...   
Under_five_mortality_from_IHME    float64
Under_five_mortality_rate         float64
Urban_population                  float64
Urban_population_growth           float64
Urban_population_pct_of_total     float64
Length: 358, dtype: object

### filter columns

In [None]:
df.Continent.unique() # uniques in a col

array([1, 2, 3, 4, 5, 6, 7])

## Reshaping Data – Change the layout of a data set

### melt

In [None]:
pd.melt(df2)  # Gather columns into rows.

Unnamed: 0,variable,value
0,a,1
1,a,4
2,a,7
3,b,2
4,b,5
5,b,8
6,c,3
7,c,6
8,c,9


### concat

In [None]:
df4 = pd.concat([df1,df2])    # Append rows of DataFrames
df4

Unnamed: 0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9
1,1,2,3
2,4,5,6
3,7,8,9


### sort_values

### drop_duplicates

In [None]:
df5.drop_duplicates()

Unnamed: 0,a,col2,c
1,1,2,3
2,4,5,6
3,7,8,9


In [None]:
state2 = state.copy()
state2

Unnamed: 0_level_0,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total,Population annual growth rate (%),Population in urban areas (%),...,Total_income,Total_reserves,Trade_balance_goods_and_services,Under_five_mortality_from_CME,Under_five_mortality_from_IHME,Under_five_mortality_rate,Urban_population,Urban_population_growth,Urban_population_pct_of_total,Average poulation growth rate in %
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1,1,151.0,28.0,,,,26088.0,4.0,23.0,...,,,,257.00,231.9,257.00,5740436.0,5.44,22.9,
Albania,2,2,27.0,98.7,6000.0,93.0,94.0,3172.0,0.6,46.0,...,4.790000e+09,78.14,-2.040000e+09,18.47,15.5,18.47,1431793.9,2.21,45.4,
Algeria,3,3,6.0,69.9,5940.0,94.0,96.0,33351.0,1.5,64.0,...,6.970000e+10,351.36,4.700000e+09,40.00,31.2,40.00,20800000.0,2.61,63.3,
Andorra,4,2,,,,83.0,83.0,74.0,1.0,93.0,...,,,,,,,,,,
Angola,5,3,146.0,67.4,3890.0,49.0,51.0,16557.0,2.8,54.0,...,1.490000e+10,27.13,9.140000e+09,164.10,242.5,164.10,8578749.0,4.14,53.3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,198,6,25.0,90.3,2310.0,91.0,96.0,86206.0,1.4,27.0,...,4.480000e+10,47.11,-1.940000e+09,20.20,23.4,20.20,21900000.0,2.90,26.4,
West Bank and Gaza,199,1,,,,,,,,,...,3.780000e+09,,,28.00,25.8,28.00,2596216.0,3.33,71.6,
Yemen,200,1,83.0,54.1,2090.0,65.0,85.0,21732.0,3.0,28.0,...,1.150000e+10,114.52,8.310000e+08,82.40,87.9,82.40,5759120.5,4.37,27.3,
Zambia,201,3,161.0,68.0,1140.0,94.0,90.0,11696.0,1.9,35.0,...,4.090000e+09,10.41,-4.470000e+08,175.30,163.8,175.30,4017411.0,1.95,35.0,


In [None]:
state2["Continent"].unique()

array([1, 2, 3, 4, 5, 6, 7])

### group_by

In [None]:
# Mean annual population growth rate for each continent code.
# Group on the frame first: "Continent" is a column of state2, so it is not
# available as a grouping key once a single other column has been selected.
pop_growth_continent = (
    state2.groupby("Continent")["Population annual growth rate (%)"]
    .mean()
    .to_frame()
)
pop_growth_continent
# todo rename Continents

Unnamed: 0_level_0,Population annual growth rate (%)
Continent,Unnamed: 1_level_1
1,2.19
2,0.282353
3,2.277083
4,0.95
5,1.134483
6,1.033333
7,1.488889


## date range

In [None]:
import pandas as pd

# Ten consecutive days starting on 27 February 2022, printed as a one-column DataFrame
dates = pd.date_range(start="20220227", periods=10)
print(pd.DataFrame(dates))

           0
0 2022-02-27
1 2022-02-28
2 2022-03-01
3 2022-03-02
4 2022-03-03
5 2022-03-04
6 2022-03-05
7 2022-03-06
8 2022-03-07
9 2022-03-08


## to_numpy

In [None]:
dates.to_numpy()

array(['2022-02-27T00:00:00.000000000', '2022-02-28T00:00:00.000000000',
       '2022-03-01T00:00:00.000000000', '2022-03-02T00:00:00.000000000',
       '2022-03-03T00:00:00.000000000', '2022-03-04T00:00:00.000000000',
       '2022-03-05T00:00:00.000000000', '2022-03-06T00:00:00.000000000',
       '2022-03-07T00:00:00.000000000', '2022-03-08T00:00:00.000000000'],
      dtype='datetime64[ns]')