In [52]:
import pandas as pd
import numpy as np

# CREATING DATAFRAME

### 1] Using List

In [2]:
# Build a one-column DataFrame from a flat list of strings.
# No column name or index is given, so pandas uses its defaults.
lst = 'Anushka Bala Cat Dog'.split()
df = pd.DataFrame(data=lst)

In [3]:
# Show the frame: a single column labelled 0 and a default 0-3 row index.
df

Unnamed: 0,0
0,Anushka
1,Bala
2,Cat
3,Dog


### 2] Using a dict of ndarrays / lists

In [5]:
# Column-oriented data: each key is a column name, each list holds that column's values.
dic = dict(
    Names=['Naveen', 'Kumar', 'Anu', 'Renu'],
    Age=[12, 13, 10, 12],
)

In [6]:
# Turn the dict of lists into a DataFrame (keys -> columns, list items -> rows).
df_from_dic = pd.DataFrame(data=dic)

In [7]:
# Show the frame: the dict keys (Names, Age) became the column names.
df_from_dic

Unnamed: 0,Names,Age
0,Naveen,12
1,Kumar,13
2,Anu,10
3,Renu,12


# Indexing

### 1] Character indexing

In [21]:
# Single-column data used to demonstrate custom string row labels.
dic = {'Names': 'Naveen Kumar Anu Renu'.split()}

In [22]:
# Label the four rows 'A'..'D' instead of the default 0..3.
df = pd.DataFrame(dic, index=list('ABCD'))

In [23]:
# Show the frame: rows are now labelled A-D.
df

Unnamed: 0,Names
A,Naveen
B,Kumar
C,Anu
D,Renu


### 2] Number indexing

In [38]:
# Single-column data used to demonstrate custom integer row labels.
dic = dict(Names=['Naveen', 'Kumar', 'Anu', 'Renu'])

In [39]:
# Use explicit integer labels 1..4 for the rows (a plain list, not the default 0-based index).
df = pd.DataFrame(dic, index=list(range(1, 5)))

In [40]:
# Show the frame: rows are labelled 1-4.
df

Unnamed: 0,Names
1,Naveen
2,Kumar
3,Anu
4,Renu


### 3] Boolean indexing (boolean values used as index labels)

In [41]:
# Single-column data used to demonstrate boolean row labels.
dic = dict([('Names', ['Naveen', 'Kumar', 'Anu', 'Renu'])])

In [42]:
# Use True/False as the row *labels* (alternating True, False, True, False).
# NOTE: this only labels the rows; it does not filter them. All four rows are kept.
df = pd.DataFrame(dic, index=[True, False] * 2)

In [43]:
# Show the frame: all four rows remain; True/False appear only as row labels.
df

Unnamed: 0,Names
True,Naveen
False,Kumar
True,Anu
False,Renu


### 4] set_index()

In [44]:
# Three-column sample data; 'Names' will be promoted to the index with set_index().
dic = dict(
    Names=['Naveen', 'Kumar', 'Anu', 'Renu'],
    Age=[12, 13, 10, 12],
    Gender=list('MMFF'),
)

In [48]:
# Build the frame with the default 0..3 index; all three dict keys become columns.
df = pd.DataFrame.from_dict(dic)

In [49]:
# Promote the 'Names' column to the row index.
# set_index() returns a new frame, so assign the result instead of using
# inplace=True. Rebuilding from `dic` keeps this cell idempotent: with the
# in-place version a second run raises KeyError because 'Names' is no longer
# a column.
df = pd.DataFrame(dic).set_index('Names')

In [50]:
# Show the frame: Names is now the index; Age and Gender are the remaining columns.
df

Unnamed: 0_level_0,Age,Gender
Names,Unnamed: 1_level_1,Unnamed: 2_level_1
Naveen,12,M
Kumar,13,M
Anu,10,F
Renu,12,F


# Dealing with rows and columns

### 1] Column selection using their names

In [10]:
# Three-column sample data used for the column/row selection examples.
dic = dict(zip(
    ('Names', 'Age', 'Gender'),
    (['Naveen', 'Kumar', 'Anu', 'Renu'], [12, 13, 10, 12], ['M', 'M', 'F', 'F']),
))

In [11]:
# Frame with the default 0..3 index, used by the selection cells that follow.
df = pd.DataFrame(data=dic)

In [12]:
# Show the full frame (Names, Age, Gender) before selecting parts of it.
df

Unnamed: 0,Names,Age,Gender
0,Naveen,12,M
1,Kumar,13,M
2,Anu,10,F
3,Renu,12,F


In [16]:
# Select several columns at once by passing a list of column names;
# the result is a DataFrame holding just those columns, in the order listed.
df[['Names','Age']]

Unnamed: 0,Names,Age
0,Naveen,12
1,Kumar,13
2,Anu,10
3,Renu,12


In [17]:
# Select a single column. The double brackets pass a one-item list, so the
# result is still a DataFrame (df['Age'] would give a Series instead).
df[['Age']]

Unnamed: 0,Age
0,12
1,13
2,10
3,12


### 2] Row selection

In [19]:
# .loc selects by index *label*. df has the default 0-3 index here, so the
# label 2 exists; the row comes back as a Series keyed by column name.
df.loc[2]

Names     Anu
Age        10
Gender      F
Name: 2, dtype: object

In [20]:
# .iloc selects by integer *position* (third row). With the default 0-3 index
# the label 2 and the position 2 are the same row, so this matches df.loc[2].
df.iloc[2]

Names     Anu
Age        10
Gender      F
Name: 2, dtype: object

# loc vs iloc

### 1] loc

In [25]:
# Sample data: columns A, B, C hold four consecutive integers starting at
# 1, 5 and 9; rows are labelled 'a'..'d' so labels and positions differ.
data = {col: list(range(start, start + 4)) for col, start in zip('ABC', (1, 5, 9))}
df = pd.DataFrame(data, index=list('abcd'))

In [28]:
# Select a single row by label. Passing the label inside a list (['a'])
# returns a one-row DataFrame; a bare label would return a Series.
df.loc[['a']]

Unnamed: 0,A,B,C
a,1,5,9


In [29]:
# Select multiple rows by label; rows come back in the order the labels are listed.
df.loc[['a','b','d']]

Unnamed: 0,A,B,C
a,1,5,9
b,2,6,10
d,4,8,12


In [30]:
# Select a label range of rows and columns.
# Label slices with .loc include BOTH endpoints: rows 'a' through 'c', columns 'B' through 'C'.
df.loc['a':'c','B':'C']

Unnamed: 0,B,C
a,5,9
b,6,10
c,7,11


### 2] iloc


In [32]:
# The .iloc examples below reuse the sample df built for the .loc section
# (rows labelled 'a'-'d', columns A-C).

In [33]:
# Select a single row by integer position (0-based, so 1 is the second row, 'b').
# The list form ([1]) returns a one-row DataFrame rather than a Series.
df.iloc[[1]]

Unnamed: 0,A,B,C
b,2,6,10


In [35]:
# Select multiple rows by integer position: positions 0, 2, 3 are rows 'a', 'c', 'd'.
df.iloc[[0,2,3]]

Unnamed: 0,A,B,C
a,1,5,9
c,3,7,11
d,4,8,12


In [37]:
# Select a positional range of rows and columns.
# Positional slices with .iloc EXCLUDE the stop: 0:2 means positions 0 and 1
# (rows 'a','b' and columns A,B).
df.iloc[0:2,0:2]

Unnamed: 0,A,B
a,1,5
b,2,6


# Working with missing data

### Checking for missing values

#### 1] isnull()

In [53]:
# Three score columns, each with exactly one missing entry (np.nan) in a different row.
dic = dict(zip(
    ('First Score', 'Second Score', 'Third Score'),
    ([100, 90, np.nan, 95], [30, 45, 56, np.nan], [np.nan, 40, 80, 98]),
))

In [54]:
# Build the score table that the isnull()/notnull() cells inspect.
df = pd.DataFrame(data=dic)

In [57]:
# Element-wise missing-value mask: True where a cell is NaN, False otherwise.
df.isnull()

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [63]:
# Count missing values per column: summing the boolean mask counts its True entries.
df.isnull().sum()

First Score     1
Second Score    1
Third Score     1
dtype: int64

#### 2] notnull()

In [64]:
# Inverse of isnull(): True where a cell holds a value, False where it is NaN.
df.notnull()

Unnamed: 0,First Score,Second Score,Third Score
0,True,True,False
1,True,True,True
2,False,True,True
3,True,False,True


In [65]:
# Count the non-missing values per column (3 of the 4 rows in each column here).
df.notnull().sum()

First Score     3
Second Score    3
Third Score     3
dtype: int64

### Filling the missing values

In [66]:
# Same score data as before: one np.nan per column, each in a different row.
dic = dict([
    ('First Score', [100, 90, np.nan, 95]),
    ('Second Score', [30, 45, 56, np.nan]),
    ('Third Score', [np.nan, 40, 80, 98]),
])

In [67]:
# Fresh copy of the score table; every fill/drop example below starts from this frame.
df = pd.DataFrame.from_dict(dic)

In [68]:
# Show the frame: one NaN per column; the scores display as floats (100.0, ...)
# because each column contains a NaN.
df

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


#### 1] fillna() with specified value

In [69]:
# Replace every NaN with the constant 0. fillna() returns a new frame, so df
# itself keeps its NaN values for the examples that follow.
df_0=df.fillna(0)

In [70]:
# Show the result: each previously missing cell now holds 0.0.
df_0

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


#### 2] Backward fill (bfill): each gap takes the next valid value below it

In [71]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated
# (FutureWarning since pandas 2.1); the result is the same.
# A NaN in the last row has nothing below it and therefore stays NaN.
df_back=df.bfill()

In [72]:
# Show the result: gaps took the value from the row below; Second Score row 3
# is still NaN because no later row exists to copy from.
df_back

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,40.0
1,90.0,45.0,40.0
2,95.0,56.0,80.0
3,95.0,,98.0


#### 3] Forward fill (ffill): each gap takes the last valid value above it

In [73]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
# (FutureWarning since pandas 2.1); the result is the same.
# A NaN in the first row has nothing above it and therefore stays NaN.
df_front=df.ffill()

In [74]:
# Show the result: gaps took the value from the row above; Third Score row 0
# is still NaN because no earlier row exists to copy from.
df_front

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,
1,90.0,45.0,40.0
2,90.0,56.0,80.0
3,95.0,56.0,98.0


#### 4] dropna() to drop the records that contain missing values

In [85]:
# Drop every row that contains at least one NaN (dropna's default behaviour).
# Returns a new frame; df itself is left unchanged.
df_dropped=df.dropna()

In [86]:
# Show the result: only row 1 survives, as it is the only row with no missing value.
df_dropped

Unnamed: 0,First Score,Second Score,Third Score
1,90.0,45.0,40.0


### Mean, median, and mode methods for filling missing values

In [76]:
# Mean method: fill each column's NaN with that column's own mean.

In [77]:
# Fill each column's NaN with the mean of that column's non-missing values.
# df.mean() yields one mean per column (NaN skipped) and fillna() matches that
# Series to the frame by column name, so no per-column apply/lambda is needed.
df_mean=df.fillna(df.mean())

In [78]:
# Show the result: each gap holds its column mean (95.0, 43.666667, 72.666667).
df_mean

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,72.666667
1,90.0,45.0,40.0
2,95.0,56.0,80.0
3,95.0,43.666667,98.0


In [79]:
# Median method: fill each column's NaN with that column's own median.

In [80]:
# Fill each column's NaN with the median of that column's non-missing values.
# df.median() yields one median per column (NaN skipped) and fillna() matches
# that Series to the frame by column name, so no per-column apply/lambda is needed.
df_median=df.fillna(df.median())

In [81]:
# Show the result: each gap holds its column median (95.0, 45.0, 80.0).
df_median

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,80.0
1,90.0,45.0,40.0
2,95.0,56.0,80.0
3,95.0,45.0,98.0


In [82]:
# Mode method: fill each column's NaN with that column's most frequent value.

In [83]:
# Fill each column's NaN with that column's mode (most frequent value).
# df.mode() lists every column's modes in ascending order; .iloc[0] keeps the
# first (smallest) mode per column and fillna() matches it by column name.
# Unlike x.mode()[0] inside apply, this does not raise when one column is
# entirely NaN: that column simply has no fill value and is left as is.
df_mode=df.fillna(df.mode().iloc[0])

In [84]:
# Show the result: no value repeats in any column, so every value ties as a
# mode and the smallest one is used as the fill (90.0, 30.0, 40.0).
df_mode

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,40.0
1,90.0,45.0,40.0
2,90.0,56.0,80.0
3,95.0,30.0,98.0
