In [1]:
import numpy as np
import pandas as pd

In [2]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [6]:
df = pd.DataFrame({"A":np.arange(10), "B":np.arange(10)[::-1], "C":np.random.random(10)*10})
df

Unnamed: 0,A,B,C
0,0,9,6.211753
1,1,8,0.360443
2,2,7,9.801098
3,3,6,0.267708
4,4,5,3.946636
5,5,4,4.248331
6,6,3,8.507027
7,7,2,7.827999
8,8,1,2.130766
9,9,0,8.031605


### SELECT ROWS OF A DF WHERE A COLUMN VALUE IS BETWEEN A AND B : 


In [None]:
df.loc[ (df['column_name'] >= A) & (df['column_name'] <= B)  ]

### USAGE OF MASK TO SELECT IN A DF 

In [7]:
mask = df['A'] == 1  # MASK  is a BOOLEAN CONDITION

In [8]:
# we use LOC to see the df with the mask
df.loc[mask]


Unnamed: 0,A,B,C
1,1,8,0.360443


In [10]:
# We can set all the values in the mask equals to 0
df.loc[mask] = 0
df.loc[mask]

Unnamed: 0,A,B,C
1,0,0,0.0


In [12]:
# We can create a mask to see all the values between 0 and 5:
mask = df['A'].isin(range(5))
df.loc[mask]

Unnamed: 0,A,B,C
0,0,9,6.211753
1,0,0,0.0
2,2,7,9.801098
3,3,6,0.267708
4,4,5,3.946636


### Use COPY 

In [14]:
df1 = pd.DataFrame({ 'A':[0]*3, 'B': [1]*3})
df2 = df1
df2

Unnamed: 0,A,B
0,0,1
1,0,1
2,0,1


We see that the changes were applied to df1 too, because we said that df2 = df1. To AVOID this, we need to COPY

In [15]:
df2['A'] = df2['A'] + 5
df1

Unnamed: 0,A,B
0,5,1
1,5,1
2,5,1


In [None]:
df2 = df1.copy()
# OR 
from copy import deepcopy
df2 = deepcopy(df1)

### Use MAP : easy change of old values in a df to new values

In [21]:
df = pd.DataFrame({ 'A':[0]*3, 'B': [1]*3})
level_map = {0:'low', 1:'high'}    ### the format of the level map is {old value : new value}
# We apply the map to the df by creating a new col called 'levels'
df_levels = df.copy()
df_levels['levels'] = df_levels['A'].map(level_map)
df_levels

Unnamed: 0,A,B,levels
0,0,1,low
1,0,1,low
2,0,1,low


### Use APPLY to apply a function to several cols to create new col 

In [30]:
def fonction(x,y):
    return x**2+y**2+5
df

Unnamed: 0,A,B,new
0,0,1,1
1,0,1,1
2,0,1,1


In [33]:
df['new'] = df.apply(lambda x:fonction(x['A'], x['B']), axis = 1)
df

Unnamed: 0,A,B,new
0,0,1,6
1,0,1,6
2,0,1,6


### Check value distribution using VALUE COUNTS


In [35]:
df = pd.DataFrame({"A":np.arange(10), "B":np.arange(10)[::-1]})
df['A'].value_counts()

9    1
8    1
7    1
6    1
5    1
4    1
3    1
2    1
1    1
0    1
Name: A, dtype: int64

In [36]:
df['A'].value_counts(normalize=True, dropna=True)

9    0.1
8    0.1
7    0.1
6    0.1
5    0.1
4    0.1
3    0.1
2    0.1
1    0.1
0    0.1
Name: A, dtype: float64

### Drop columns in DataFrame

In [37]:
df = pd.DataFrame({"A":np.arange(10), "B":np.arange(10)[::-1], 'C':np.random.random(10)*10})
df

Unnamed: 0,A,B,C
0,0,9,4.993613
1,1,8,1.541286
2,2,7,4.719102
3,3,6,7.294585
4,4,5,8.842433
5,5,4,8.344256
6,6,3,7.261773
7,7,2,5.793598
8,8,1,0.638671
9,9,0,0.067508


In [38]:
to_drop = ['A', 'C']
df.drop(to_drop, inplace=True, axis=1)   
# inplace = True because we want the changes to be made directly in our object 
# axis = 1 because we look for the values to be dropped in the columns 
df

Unnamed: 0,B
0,9
1,8
2,7
3,6
4,5
5,4
6,3
7,2
8,1
9,0


In [41]:
df = pd.DataFrame({"A":np.arange(10), "B":np.arange(10)[::-1], 'C':np.random.random(10)*10})
# We can drop directly columns : 
df.drop(columns=to_drop, inplace=True)
df

Unnamed: 0,B
0,9
1,8
2,7
3,6
4,5
5,4
6,3
7,2
8,1
9,0


### Index work on dataframe

In [52]:
df = pd.DataFrame({"Index":list("abcdefghij"), "B":np.arange(10)[::-1], 'C':np.random.random(10)*10})
df['Index'].is_unique

True

In [53]:
df

Unnamed: 0,Index,B,C
0,a,9,7.818764
1,b,8,1.746344
2,c,7,6.310981
3,d,6,1.036345
4,e,5,5.801473
5,f,4,1.36059
6,g,3,7.143132
7,h,2,1.431883
8,i,1,1.657983
9,j,0,1.77307


In [54]:
# We now replace the existing index with this column
df = df.set_index('Index')
df.head()
## Now we see that the Index is now indexing rows, and it's no more a column

Unnamed: 0_level_0,B,C
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
a,9,7.818764
b,8,1.746344
c,7,6.310981
d,6,1.036345
e,5,5.801473


In [56]:
df.loc['c',:]

B    7.000000
C    6.310981
Name: c, dtype: float64

In [57]:
df.iloc[2,:]

B    7.000000
C    6.310981
Name: c, dtype: float64

### Basic DF infos

In [58]:
df.shape

(10, 2)

In [59]:
df.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object', name='Index')

In [60]:
df.columns

Index(['B', 'C'], dtype='object')

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 2 columns):
B    10 non-null int64
C    10 non-null float64
dtypes: float64(1), int64(1)
memory usage: 560.0+ bytes


In [62]:
df.count()

B    10
C    10
dtype: int64

In [63]:
df.sum()

B    45.000000
C    36.080565
dtype: float64

In [64]:
df.describe()

Unnamed: 0,B,C
count,10.0,10.0
mean,4.5,3.608056
std,3.02765,2.776492
min,0.0,1.036345
25%,2.25,1.488408
50%,4.5,1.759707
75%,6.75,6.183604
max,9.0,7.818764
