In [1]:
import numpy as np
import pandas as pd

**Constructing DataFrame**
- A DataFrame is a two-dimensional, labeled data structure (rows and columns) provided by the pandas library

In [2]:
# Build a 5x5 frame holding the integers 0..24, with explicit row and column labels.
df = pd.DataFrame(
    np.arange(25).reshape(5, 5),
    index=["Row 1", "Row 2", "Row 3", "Row 4", "Row 5"],
    columns=["Col 1", "Col 2", "Col 3", "Col 4", "Col 5"],
)
df

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 1,0,1,2,3,4
Row 2,5,6,7,8,9
Row 3,10,11,12,13,14
Row 4,15,16,17,18,19
Row 5,20,21,22,23,24


In [3]:
# With no argument, head() shows the first 5 rows (this frame has exactly 5)
df.head()

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 1,0,1,2,3,4
Row 2,5,6,7,8,9
Row 3,10,11,12,13,14
Row 4,15,16,17,18,19
Row 5,20,21,22,23,24


In [4]:
# Show only the last two rows of the frame
df.tail(n=2)

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 4,15,16,17,18,19
Row 5,20,21,22,23,24


In [5]:
# Confirm the object's class: a pandas DataFrame
type(df)

pandas.core.frame.DataFrame

In [6]:
# Print a structural summary: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Row 1 to Row 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Col 1   5 non-null      int32
 1   Col 2   5 non-null      int32
 2   Col 3   5 non-null      int32
 3   Col 4   5 non-null      int32
 4   Col 5   5 non-null      int32
dtypes: int32(5)
memory usage: 140.0+ bytes


In [7]:
# Per-column summary statistics: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
count,5.0,5.0,5.0,5.0,5.0
mean,10.0,11.0,12.0,13.0,14.0
std,7.905694,7.905694,7.905694,7.905694,7.905694
min,0.0,1.0,2.0,3.0,4.0
25%,5.0,6.0,7.0,8.0,9.0
50%,10.0,11.0,12.0,13.0,14.0
75%,15.0,16.0,17.0,18.0,19.0
max,20.0,21.0,22.0,23.0,24.0


**Indexing**
- by column name: `df['Col 1']`
- by row label: `df.loc[...]`
- by integer row and column position: `df.iloc[row, column]`

In [8]:
# Redisplay the frame for reference before the indexing examples below
df.head()

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 1,0,1,2,3,4
Row 2,5,6,7,8,9
Row 3,10,11,12,13,14
Row 4,15,16,17,18,19
Row 5,20,21,22,23,24


**Using column name**

In [9]:
# Passing a list of column names selects those columns and returns a DataFrame
df[['Col 1', 'Col 2', 'Col 3']]

Unnamed: 0,Col 1,Col 2,Col 3
Row 1,0,1,2
Row 2,5,6,7
Row 3,10,11,12
Row 4,15,16,17
Row 5,20,21,22


In [10]:
# A single column name (not wrapped in a list) returns that column as a Series
df['Col 1']

Row 1     0
Row 2     5
Row 3    10
Row 4    15
Row 5    20
Name: Col 1, dtype: int32

In [11]:
# Verify that selecting one column by name yields a pandas Series
type(df['Col 1'])

pandas.core.series.Series

In [12]:
# Selecting with a list of labels or a slice (even a single-item list) => DataFrame
# Selecting a single row or a single column with a scalar label => Series

**Using loc**

In [13]:
# Select one row by its index label; the result is a Series indexed by column name
df.loc['Row 1']

Col 1    0
Col 2    1
Col 3    2
Col 4    3
Col 5    4
Name: Row 1, dtype: int32

In [14]:
# A list of row labels selects several rows and returns a DataFrame
df.loc[['Row 1', 'Row 2']]

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 1,0,1,2,3,4
Row 2,5,6,7,8,9


**Using iloc**
- `df.iloc[row, column]` selects by zero-based integer position; the end of a slice is excluded

In [15]:
# Rows at positions 2 and 3, columns at positions 0 and 1 (slice ends are exclusive)
df.iloc[2:4, 0:2]

Unnamed: 0,Col 1,Col 2
Row 3,10,11
Row 4,15,16


In [16]:
# Bare slices on both axes select every row and every column
df.iloc[:, :]

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 1,0,1,2,3,4
Row 2,5,6,7,8,9
Row 3,10,11,12,13,14
Row 4,15,16,17,18,19
Row 5,20,21,22,23,24


In [17]:
# First two rows (positions 0 and 1) across all columns
df.iloc[0:2, :]

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 1,0,1,2,3,4
Row 2,5,6,7,8,9


In [18]:
# Convert the selection to a NumPy array; to_numpy() is the accessor the
# pandas docs recommend in place of the older .values attribute
df.iloc[:, :].to_numpy()

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

**Operations**

In [19]:
# Boolean frame of the same shape: True wherever a value is missing
df.isnull()

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 1,False,False,False,False,False
Row 2,False,False,False,False,False
Row 3,False,False,False,False,False
Row 4,False,False,False,False,False
Row 5,False,False,False,False,False


In [20]:
# Summing the boolean mask counts the missing values in each column
df.isnull().sum()

Col 1    0
Col 2    0
Col 3    0
Col 4    0
Col 5    0
dtype: int64

In [21]:
# Small 2x3 frame containing one missing value (NaN) in "col 2",
# used to demonstrate the null-handling helpers
df1 = pd.DataFrame(
    [[1, np.nan, 2], [1, 3, 4]],
    index=["row 1", "row2"],
    columns=["col 1", "col 2", "col 3"],
)
df1

Unnamed: 0,col 1,col 2,col 3
row 1,1,,2
row2,1,3.0,4


In [22]:
# Count the missing values per column of df1
df1.isnull().sum()

col 1    0
col 2    1
col 3    0
dtype: int64

In [23]:
# True for each column that contains no missing values
df1.isnull().sum() == 0

col 1     True
col 2    False
col 3     True
dtype: bool

In [24]:
# Frequency of each distinct value in the column
df1['col 1'].value_counts()

1    2
Name: col 1, dtype: int64

In [25]:
# Distinct values of the column, returned as a NumPy array
df1['col 3'].unique()

array([2, 4], dtype=int64)

In [26]:
# Distinct values of 'Col 1' in the original frame, returned as a NumPy array
df['Col 1'].unique()

array([ 0,  5, 10, 15, 20])

In [27]:
# Element-wise comparison produces a boolean Series (one True/False per row)
df['Col 2'] > 2

Row 1    False
Row 2     True
Row 3     True
Row 4     True
Row 5     True
Name: Col 2, dtype: bool

In [28]:
# Conditional (boolean-mask) indexing: keep only the rows whose 'Col 2' value exceeds 2
df[df['Col 2'] > 2]

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5
Row 2,5,6,7,8,9
Row 3,10,11,12,13,14
Row 4,15,16,17,18,19
Row 5,20,21,22,23,24
