# Pandas DataFrame

## Creating a Pandas DataFrame

在实际使用中，DataFrame 通常是从已有的数据存储（如 SQL 数据库、CSV 文件或 Excel 文件）中加载数据来创建的。

### Creating a dataframe using List

In [1]:
import pandas as pd

# Seven words; each one becomes a row of the resulting frame
lst = "Geeks For Geeks is portal for Geeks".split()

# A flat list yields a single column (labelled 0) with a default integer index
df = pd.DataFrame(data=lst)
print(df)

        0
0   Geeks
1     For
2   Geeks
3      is
4  portal
5     for
6   Geeks


### Creating DataFrame from dict of ndarray/lists

In [2]:
import pandas as pd

# Column-oriented input: every key is a column label, every list its values
data = {
    'Name': ['Tom', 'nick', 'krish', 'jack'],
    'Age': [20, 21, 19, 18],
}

# Build the frame; rows receive a default 0..3 integer index
df = pd.DataFrame(data=data)

# Show the result as plain text
print(df)

    Name  Age
0    Tom   20
1   nick   21
2  krish   19
3   jack   18


## Dealing with Rows and Columns

### Column Selection

In [12]:
import pandas as pd

# Employee records, one dictionary key per column
data = {
    'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
    'Age': [27, 24, 22, 32],
    'Address': ['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd'],
}

df = pd.DataFrame(data)

# Whole frame first, followed by a blank separator line
print(df, "\n")

# Indexing with a list of labels keeps only those columns
selected = ['Name', 'Qualification']
print(df[selected])

     Name  Age    Address Qualification
0     Jai   27      Delhi           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannauj           Phd 

     Name Qualification
0     Jai           Msc
1  Princi            MA
2  Gaurav           MCA
3    Anuj           Phd


### Row Selection

In [11]:
import pandas as pd

# Load the CSV and use its "Name" column as the row index
data = pd.read_csv("nba.csv", index_col="Name")

# DataFrame.loc[] with an index label retrieves the matching row
first, second = (
    data.loc[player] for player in ("Avery Bradley", "R.J. Hunter")
)

print(first, "\n\n\n", second)

Team        Boston Celtics
Number                   0
Position                PG
Age                     25
Height                 6-2
Weight                 180
College              Texas
Salary         7.73034e+06
Name: Avery Bradley, dtype: object 


 Team        Boston Celtics
Number                  28
Position                SG
Age                     22
Height                 6-5
Weight                 185
College      Georgia State
Salary         1.14864e+06
Name: R.J. Hunter, dtype: object


## Working with Missing Data

### Checking for missing values using isnull() and notnull()

In [18]:
import pandas as pd
import numpy as np

# Three score columns, each containing one missing entry (np.nan).
# Named `scores` rather than `dict` so the built-in dict type is not shadowed
# for the rest of the kernel session.
scores = {'First Score': [100, 90, np.nan, 95],
          'Second Score': [30, 45, 56, np.nan],
          'Third Score': [np.nan, 40, 80, 98]}

# creating a dataframe from the dictionary
df = pd.DataFrame(scores)

# notnull() gives a boolean frame: False where the value is NaN, True elsewhere
df.notnull()

Unnamed: 0,First Score,Second Score,Third Score
0,True,True,False
1,True,True,True
2,False,True,True
3,True,False,True


In [20]:
df.isnull()      # the opposite of notnull(): True where the value is NaN, False elsewhere

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


### Filling missing values using fillna(), replace() and interpolate()

In [21]:
import pandas as pd
import numpy as np

# Three score columns, each containing one missing entry (np.nan).
# Named `scores` rather than `dict` so the built-in dict type is not shadowed
# for the rest of the kernel session.
scores = {'First Score': [100, 90, np.nan, 95],
          'Second Score': [30, 45, 56, np.nan],
          'Third Score': [np.nan, 40, 80, 98]}

# creating a dataframe from dictionary
df = pd.DataFrame(scores)

# fillna(0) returns a new frame with every NaN replaced by 0; df itself is unchanged
df.fillna(0)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


### Dropping missing values using dropna() 

In [26]:
import pandas as pd
import numpy as np

# Four score columns; only the last row (index 3) has no missing value.
# Named `scores` rather than `dict` so the built-in dict type is not shadowed
# for the rest of the kernel session.
scores = {'First Score': [100, 90, np.nan, 95],
          'Second Score': [30, np.nan, 45, 56],
          'Third Score': [52, 40, 80, 98],
          'Fourth Score': [np.nan, np.nan, np.nan, 65]}

# creating a dataframe from dictionary
df = pd.DataFrame(scores)

print(df, '\n')
df.dropna()      # every row that contains a NaN is dropped

   First Score  Second Score  Third Score  Fourth Score
0        100.0          30.0           52           NaN
1         90.0           NaN           40           NaN
2          NaN          45.0           80           NaN
3         95.0          56.0           98          65.0 



Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


## Iterating over rows and columns

### Iterating over rows

In [27]:
import pandas as pd

# Student records, one key per column.
# Named `students` rather than `dict` so the built-in dict type is not shadowed
# for the rest of the kernel session.
students = {'name': ["aparna", "pankaj", "sudhir", "Geeku"],
            'degree': ["MBA", "BCA", "M.Tech", "MBA"],
            'score': [90, 40, 80, 98]}

# creating a dataframe from a dictionary
df = pd.DataFrame(students)

df

Unnamed: 0,name,degree,score
0,aparna,MBA,90
1,pankaj,BCA,40
2,sudhir,M.Tech,80
3,Geeku,MBA,98


In [29]:
# iterrows() yields one (index label, row) pair per row of the frame
for label, row in df.iterrows():
    print(label, row, '\n')

0 name      aparna
degree       MBA
score         90
Name: 0, dtype: object 

1 name      pankaj
degree       BCA
score         40
Name: 1, dtype: object 

2 name      sudhir
degree    M.Tech
score         80
Name: 2, dtype: object 

3 name      Geeku
degree      MBA
score        98
Name: 3, dtype: object 



### Iterating over Columns

In [30]:
import pandas as pd

# Student records, one key per column.
# Named `students` rather than `dict` so the built-in dict type is not shadowed
# for the rest of the kernel session.
students = {'name': ["aparna", "pankaj", "sudhir", "Geeku"],
            'degree': ["MBA", "BCA", "M.Tech", "MBA"],
            'score': [90, 40, 80, 98]}

# creating a dataframe from a dictionary
df = pd.DataFrame(students)

df

Unnamed: 0,name,degree,score
0,aparna,MBA,90
1,pankaj,BCA,40
2,sudhir,M.Tech,80
3,Geeku,MBA,98


In [32]:
# Iterating a DataFrame yields its column labels, so list(df) is the column list
columns = list(df)
print("columns = ", columns)

for i in columns:
    # Print the third element of each column, i.e. the third row.
    # .iloc selects by position, so the result does not depend on the
    # row index happening to carry the label 2.
    print(df[i].iloc[2])

columns =  ['name', 'degree', 'score']
sudhir
M.Tech
80
