In [2]:
import numpy as np
import pandas as pd

### Creating a Series using Pandas

You can convert a list, a NumPy array, or a dictionary to a Series in the following manner:

In [2]:
# Sample inputs used by the Series examples: index labels plus the same four
# values held as a Python list, a NumPy array and a dictionary.
# NOTE(review): `list` and `dict` shadow the Python builtins of the same name for
# the rest of the kernel session (e.g. `list(x)` / `dict(...)` will no longer work);
# consider renaming them together with every later use.
labels = ['w','x','y','z']
list = [10,20,30,40]
array = np.array([10,20,30,40])
dict = {'w':10,'x':20,'y':30,'z':40}

In [3]:
# With no index supplied, the Series gets a default integer index starting at 0
pd.Series(data=list)

0    10
1    20
2    30
3    40
dtype: int64

In [4]:
# Pass index= to label each value explicitly instead of using 0..n-1
pd.Series(data=list,index=labels)

w    10
x    20
y    30
z    40
dtype: int64

In [5]:
# data and index can also be passed positionally, in that order
pd.Series(list,labels)

w    10
x    20
y    30
z    40
dtype: int64

**Using NumPy Arrays to create Series**

In [6]:
# The saved output shows dtype int32 here, whereas the list-based Series showed int64
# (presumably the array's dtype, i.e. this platform's default NumPy integer, is kept — verify)
pd.Series(array)


0    10
1    20
2    30
3    40
dtype: int32

In [7]:
# NumPy array as the data, `labels` as the index (both passed positionally)
pd.Series(array,labels)

w    10
x    20
y    30
z    40
dtype: int32

**Using a Dictionary to create a Series**

In [9]:
# Dictionary keys become the index and the dictionary values become the data
pd.Series(dict)

w    10
x    20
y    30
z    40
dtype: int64

## Using an Index

In [10]:
# Series indexed by sport name instead of the default 0..n-1 integers
sports1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['Cricket', 'Football', 'Basketball', 'Golf'],
)

In [11]:
sports1

Cricket       1
Football      2
Basketball    3
Golf          4
dtype: int64

# DataFrames

The DataFrame concept in pandas is similar to the data frame in the R programming language. A DataFrame is a collection of Series combined together so that they share the same index.

In [14]:
from numpy.random import randn

In [15]:
# Seed NumPy's global RNG so the random scores are identical on every
# Restart & Run All. (Tables saved in this notebook came from an earlier,
# unseeded run, so their numbers will differ until the cells are re-executed.)
np.random.seed(101)

# 10 rows labelled A-J and 5 columns Score1-Score5, filled with standard-normal samples
dataframe = pd.DataFrame(
    randn(10, 5),
    index='A B C D E F G H I J'.split(),
    columns='Score1 Score2 Score3 Score4 Score5'.split(),
)

In [16]:
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,1.013975,0.723518,-0.978742,0.45438,-1.544132
B,0.820591,1.747594,-0.044519,0.009111,1.359611
C,-0.802503,0.502048,0.409312,-0.78387,0.529024
D,-0.065241,-0.981119,-0.068256,-1.240137,-0.552371
E,-0.210426,-1.118642,2.271213,1.355725,-0.478774
F,1.254411,-0.314247,1.071787,-0.065529,-1.488274
G,-0.701579,-0.971646,0.620635,0.028203,0.469608
H,0.641931,-0.078682,0.316877,0.533879,2.888693
I,0.14726,0.691469,0.248621,0.979603,0.240132
J,-2.141768,0.399261,-1.478876,1.168039,-0.748601


## Selection and Indexing

Ways in which we can grab data from a DataFrame

In [17]:
# Selecting a single column with [] returns it as a Series named after the column
dataframe['Score3']

A   -0.978742
B   -0.044519
C    0.409312
D   -0.068256
E    2.271213
F    1.071787
G    0.620635
H    0.316877
I    0.248621
J   -1.478876
Name: Score3, dtype: float64

In [18]:
# Pass a list of column names, in whatever order is needed, to get a DataFrame
# containing just those columns in that order
dataframe[['Score2','Score1']]

Unnamed: 0,Score2,Score1
A,0.723518,1.013975
B,1.747594,0.820591
C,0.502048,-0.802503
D,-0.981119,-0.065241
E,-1.118642,-0.210426
F,-0.314247,1.254411
G,-0.971646,-0.701579
H,-0.078682,0.641931
I,0.691469,0.14726
J,0.399261,-2.141768


**Adding a new column to the DataFrame**

In [25]:
# Assigning to a new column name adds the column; here it is the element-wise
# sum of Score1 and Score2 for each row
dataframe['Score6']=dataframe['Score1'] + dataframe['Score2'] 

In [26]:
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5,Score6
A,1.013975,0.723518,-0.978742,0.45438,-1.544132,1.737493
B,0.820591,1.747594,-0.044519,0.009111,1.359611,2.568186
C,-0.802503,0.502048,0.409312,-0.78387,0.529024,-0.300454
D,-0.065241,-0.981119,-0.068256,-1.240137,-0.552371,-1.046359
E,-0.210426,-1.118642,2.271213,1.355725,-0.478774,-1.329068
F,1.254411,-0.314247,1.071787,-0.065529,-1.488274,0.940164
G,-0.701579,-0.971646,0.620635,0.028203,0.469608,-1.673225
H,0.641931,-0.078682,0.316877,0.533879,2.888693,0.56325
I,0.14726,0.691469,0.248621,0.979603,0.240132,0.838729
J,-2.141768,0.399261,-1.478876,1.168039,-0.748601,-1.742507


**Removing Columns from a DataFrame**

In [21]:
# drop() returns a new DataFrame without the column; `dataframe` itself is left unchanged
dataframe.drop('Score6',axis=1)              # Use axis=0 for dropping rows and axis=1 for dropping columns

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,1.013975,0.723518,-0.978742,0.45438,-1.544132
B,0.820591,1.747594,-0.044519,0.009111,1.359611
C,-0.802503,0.502048,0.409312,-0.78387,0.529024
D,-0.065241,-0.981119,-0.068256,-1.240137,-0.552371
E,-0.210426,-1.118642,2.271213,1.355725,-0.478774
F,1.254411,-0.314247,1.071787,-0.065529,-1.488274
G,-0.701579,-0.971646,0.620635,0.028203,0.469608
H,0.641931,-0.078682,0.316877,0.533879,2.888693
I,0.14726,0.691469,0.248621,0.979603,0.240132
J,-2.141768,0.399261,-1.478876,1.168039,-0.748601


In [22]:
# The column is not removed from `dataframe` unless inplace=True is passed to drop()
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5,Score6
A,1.013975,0.723518,-0.978742,0.45438,-1.544132,1.737493
B,0.820591,1.747594,-0.044519,0.009111,1.359611,2.568186
C,-0.802503,0.502048,0.409312,-0.78387,0.529024,-0.300454
D,-0.065241,-0.981119,-0.068256,-1.240137,-0.552371,-1.046359
E,-0.210426,-1.118642,2.271213,1.355725,-0.478774,-1.329068
F,1.254411,-0.314247,1.071787,-0.065529,-1.488274,0.940164
G,-0.701579,-0.971646,0.620635,0.028203,0.469608,-1.673225
H,0.641931,-0.078682,0.316877,0.533879,2.888693,0.56325
I,0.14726,0.691469,0.248621,0.979603,0.240132,0.838729
J,-2.141768,0.399261,-1.478876,1.168039,-0.748601,-1.742507


In [27]:
# inplace=True removes the column from `dataframe` itself instead of returning a copy.
# NOTE: this cell is not idempotent — running it a second time raises KeyError
# because 'Score6' has already been removed.
dataframe.drop('Score6',axis=1,inplace=True)
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,1.013975,0.723518,-0.978742,0.45438,-1.544132
B,0.820591,1.747594,-0.044519,0.009111,1.359611
C,-0.802503,0.502048,0.409312,-0.78387,0.529024
D,-0.065241,-0.981119,-0.068256,-1.240137,-0.552371
E,-0.210426,-1.118642,2.271213,1.355725,-0.478774
F,1.254411,-0.314247,1.071787,-0.065529,-1.488274
G,-0.701579,-0.971646,0.620635,0.028203,0.469608
H,0.641931,-0.078682,0.316877,0.533879,2.888693
I,0.14726,0.691469,0.248621,0.979603,0.240132
J,-2.141768,0.399261,-1.478876,1.168039,-0.748601


In [28]:
# Dropping rows using axis=0
dataframe.drop('A',axis=0)     
# As with columns, the row is removed from `dataframe` itself only if inplace=True is passed

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
B,0.820591,1.747594,-0.044519,0.009111,1.359611
C,-0.802503,0.502048,0.409312,-0.78387,0.529024
D,-0.065241,-0.981119,-0.068256,-1.240137,-0.552371
E,-0.210426,-1.118642,2.271213,1.355725,-0.478774
F,1.254411,-0.314247,1.071787,-0.065529,-1.488274
G,-0.701579,-0.971646,0.620635,0.028203,0.469608
H,0.641931,-0.078682,0.316877,0.533879,2.888693
I,0.14726,0.691469,0.248621,0.979603,0.240132
J,-2.141768,0.399261,-1.478876,1.168039,-0.748601


**Selecting Rows**

In [29]:
# .loc selects a row by its index label and returns it as a Series
dataframe.loc['F']

Score1    1.254411
Score2   -0.314247
Score3    1.071787
Score4   -0.065529
Score5   -1.488274
Name: F, dtype: float64

In [31]:
# .iloc selects by integer position (0-based), so 2 is the third row, labelled 'C'
dataframe.iloc[2]

Score1   -0.802503
Score2    0.502048
Score3    0.409312
Score4   -0.783870
Score5    0.529024
Name: C, dtype: float64

In [32]:
# .loc[row_label, column_label] returns the single value at that cell
dataframe.loc['A','Score1']

1.0139751492215852

In [33]:
# Lists of row labels and column labels select the corresponding sub-DataFrame
dataframe.loc[['A','B'],['Score1','Score2']]

Unnamed: 0,Score1,Score2
A,1.013975,0.723518
B,0.820591,1.747594


### Conditional Selection

Similar to NumPy, we can make conditional selections using bracket notation.

In [34]:
# The comparison is applied element-wise and yields a DataFrame of booleans
dataframe>0.5

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,True,True,False,False,False
B,True,True,False,False,True
C,False,True,False,False,True
D,False,False,False,False,False
E,False,False,True,True,False
F,True,False,True,False,False
G,False,False,True,False,False
H,True,False,False,True,True
I,False,True,False,True,False
J,False,False,False,True,False


In [35]:
# Indexing with a boolean DataFrame keeps values where the mask is True and shows NaN elsewhere
dataframe[dataframe>0.5]

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,1.013975,0.723518,,,
B,0.820591,1.747594,,,1.359611
C,,0.502048,,,0.529024
D,,,,,
E,,,2.271213,1.355725,
F,1.254411,,1.071787,,
G,,,0.620635,,
H,0.641931,,,0.533879,2.888693
I,,0.691469,,0.979603,
J,,,,1.168039,


In [36]:
# Indexing with a boolean Series keeps only the rows where the condition is True
dataframe[dataframe['Score1']>0.5]

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,1.013975,0.723518,-0.978742,0.45438,-1.544132
B,0.820591,1.747594,-0.044519,0.009111,1.359611
F,1.254411,-0.314247,1.071787,-0.065529,-1.488274
H,0.641931,-0.078682,0.316877,0.533879,2.888693


# Missing Data

Methods to deal with missing data in Pandas

In [37]:
# Small frame with NaNs in 'Cricket' and 'Baseball' for the missing-data examples.
# Note: this rebinds `dataframe`, replacing the random-score frame used so far.
dataframe = pd.DataFrame(
    data={
        'Cricket': [1, 2, np.nan, 4, 6, 7, 2, np.nan],
        'Baseball': [5, np.nan, np.nan, 5, 7, 2, 4, 5],
        'Tennis': [1, 2, 3, 4, 5, 6, 7, 8],
    }
)

In [38]:
dataframe

Unnamed: 0,Cricket,Baseball,Tennis
0,1.0,5.0,1
1,2.0,,2
2,,,3
3,4.0,5.0,4
4,6.0,7.0,5
5,7.0,2.0,6
6,2.0,4.0,7
7,,5.0,8


In [39]:
# With no arguments, dropna() drops every row that contains at least one NaN
dataframe.dropna()

Unnamed: 0,Cricket,Baseball,Tennis
0,1.0,5.0,1
3,4.0,5.0,4
4,6.0,7.0,5
5,7.0,2.0,6
6,2.0,4.0,7


In [40]:
dataframe.dropna(axis=1)       # axis=1 drops every column that contains a NaN value

Unnamed: 0,Tennis
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8


In [41]:
# thresh=2 keeps only the rows that have at least 2 non-NaN values
# (row 2, which has a value only in 'Tennis', is dropped)
dataframe.dropna(thresh=2)

Unnamed: 0,Cricket,Baseball,Tennis
0,1.0,5.0,1
1,2.0,,2
3,4.0,5.0,4
4,6.0,7.0,5
5,7.0,2.0,6
6,2.0,4.0,7
7,,5.0,8


In [42]:
# Replace every NaN with the given value (0 here)
dataframe.fillna(value=0)

Unnamed: 0,Cricket,Baseball,Tennis
0,1.0,5.0,1
1,2.0,0.0,2
2,0.0,0.0,3
3,4.0,5.0,4
4,6.0,7.0,5
5,7.0,2.0,6
6,2.0,4.0,7
7,0.0,5.0,8


# Data Input and Output

Reading DataFrames from external sources using the `pd.read_*` functions (for example `pd.read_csv`)

In [3]:
# Load a CSV file into a DataFrame; the relative path means the file must sit
# in the notebook's working directory
dataframe = pd.read_csv('pandas-train.csv')

In [4]:
# Write the DataFrame back out as CSV; index=False leaves the row index out of the file
dataframe.to_csv('train2.csv',index=False) 