# Pandas Input/Output

In [1]:
# !pip install numpy
# !pip install pandas

In [2]:
import numpy as np
import pandas as pd

**read_csv()**: reads data from the csv files and creates a DataFrame object.

In [3]:
# Load the countries table directly from its raw GitHub URL.
path = 'https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv'
df = pd.read_csv(filepath_or_buffer=path)
print(df.shape)  # (rows, columns)
df.head(n=2)

(194, 2)


Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [4]:
# index=True is the default: the row index is written as an extra, unnamed first column.
df.to_csv('countries.csv', index=True)

In [5]:
# Read the local file back into a new frame and inspect its size and first rows.
df1 = pd.read_csv(filepath_or_buffer='countries.csv')
print(df1.shape)  # (rows, columns)
df1.head(n=2)

(194, 3)


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [6]:
# Show the original frame and the round-tripped copy one after the other.
display(df.head(n=2), df1.head(n=2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


The Unnamed column is the index column that `to_csv()` wrote to the file. To avoid it:

In [7]:
# index=False leaves the row index out of the file.
df.to_csv('countries1.csv', index=False)

In [8]:
# Load 'countries1.csv' into its own frame and preview the first two rows.
df2 = pd.read_csv(filepath_or_buffer='countries1.csv')
df2.head(n=2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [9]:
# Show the original frame next to the copy re-read from 'countries.csv'.
# NOTE(review): this repeats the earlier df/df1 comparison; df2 (read from
# 'countries1.csv') may have been the intended second frame -- confirm.
display(df.head(n=2), df1.head(n=2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [10]:
#!pip install xlsxwriter

In [11]:
import xlsxwriter

ModuleNotFoundError: No module named 'xlsxwriter'

In [12]:
# Write both frames to separate sheets of a single workbook.
# The context manager saves and closes the file on exit, even if one of the
# to_excel() calls raises. It replaces the explicit writer.save() call, which
# was deprecated and then removed from pandas.
# Requires the optional 'xlsxwriter' package to be installed.
with pd.ExcelWriter('countries.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='WithoutIndex')
    df1.to_excel(writer, sheet_name='WithIndex')

ModuleNotFoundError: No module named 'xlsxwriter'

In [13]:
# Serialize the frame to a JSON file in the working directory.
df.to_json(path_or_buf='countries.json')

In [14]:
# Load the JSON file into a new frame and preview the first two rows.
dfj = pd.read_json(path_or_buf='countries.json')
dfj.head(n=2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


## Indexing and selecting data

The Python and NumPy indexing operators **`[ ]`** and the attribute operator **`.`** provide quick and easy access to Pandas data structures across a wide range of use cases. However, since the type of the data to be accessed isn’t known in advance, directly using the standard operators has some optimization limits. For explicit, label-based selection Pandas provides the **`.loc[ ]`** accessor, used in the examples below.

In [15]:
# 8x4 frame of standard-normal samples with letter labels on both axes.
row_labels = list('abcdefgh')
column_labels = list('ABCD')
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    index=row_labels,
    columns=column_labels,
)
df

Unnamed: 0,A,B,C,D
a,1.288156,-0.243516,1.0352,0.293171
b,-1.88219,1.897106,-0.199146,0.195226
c,-0.834196,-1.699265,-0.633359,0.154093
d,0.155633,-0.369844,-0.923504,-0.685302
e,0.694731,-1.392908,2.764371,-0.171315
f,0.069221,0.834862,0.625122,0.471261
g,-0.806985,-0.925228,1.276008,1.067609
h,-0.122286,-0.760014,-0.41498,-0.981224


Select all rows for a specific column:

In [16]:
# ':' keeps every row; the single label 'A' picks one column, returned as a Series.
df.loc[:, 'A']

a    1.288156
b   -1.882190
c   -0.834196
d    0.155633
e    0.694731
f    0.069221
g   -0.806985
h   -0.122286
Name: A, dtype: float64

Another way:

In [17]:
# Plain bracket indexing with a column label returns that column as a Series.
df["A"]

a    1.288156
b   -1.882190
c   -0.834196
d    0.155633
e    0.694731
f    0.069221
g   -0.806985
h   -0.122286
Name: A, dtype: float64

Select all rows for two specific columns:

In [18]:
# A list of column labels returns a DataFrame holding just those columns.
df.loc[:, ['A', 'D']]

Unnamed: 0,A,D
a,1.288156,0.293171
b,-1.88219,0.195226
c,-0.834196,0.154093
d,0.155633,-0.685302
e,0.694731,-0.171315
f,0.069221,0.471261
g,-0.806985,1.067609
h,-0.122286,-0.981224


Another way:

In [19]:
# Bracket indexing with a list of labels selects several columns at once.
df[["A", "D"]]

Unnamed: 0,A,D
a,1.288156,0.293171
b,-1.88219,0.195226
c,-0.834196,0.154093
d,0.155633,-0.685302
e,0.694731,-0.171315
f,0.069221,0.471261
g,-0.806985,1.067609
h,-0.122286,-0.981224


Select a few rows for multiple columns:

In [20]:
# First list: row labels to keep; second list: column labels to keep.
df.loc[['a', 'b', 'f', 'h'], ['A', 'D']]

Unnamed: 0,A,D
a,1.288156,0.293171
b,-1.88219,0.195226
f,0.069221,0.471261
h,-0.122286,-0.981224


Select a few rows for all columns:

In [21]:
# Keep the listed row labels; ':' keeps every column.
df.loc[['a', 'b', 'f', 'h'], :]

Unnamed: 0,A,B,C,D
a,1.288156,-0.243516,1.0352,0.293171
b,-1.88219,1.897106,-0.199146,0.195226
f,0.069221,0.834862,0.625122,0.471261
h,-0.122286,-0.760014,-0.41498,-0.981224


Select a range of rows for all columns (with `.loc`, both endpoints of a label slice are included):

In [22]:
# Label slice on the rows from 'a' through 'd'; ':' keeps every column.
df.loc['a':'d', :]

Unnamed: 0,A,B,C,D
a,1.288156,-0.243516,1.0352,0.293171
b,-1.88219,1.897106,-0.199146,0.195226
c,-0.834196,-1.699265,-0.633359,0.154093
d,0.155633,-0.369844,-0.923504,-0.685302


Getting all values with index 'a':

In [23]:
# One row label with every column: the row comes back as a Series.
df.loc['a', :]

A    1.288156
B   -0.243516
C    1.035200
D    0.293171
Name: a, dtype: float64

Building a boolean array from a row (the comparison returns a boolean Series):

In [24]:
# Element-wise "greater than zero" test on row 'a'; yields a boolean Series.
df.loc['a'].gt(0)

A     True
B    False
C     True
D     True
Name: a, dtype: bool

`.iloc[]`: purely integer-position-based indexing. Remember that positions are 0-based.

It can only index by position, using an integer, an integer slice (START point is INCLUDED, END point is EXCLUDED), a list-like of integers, or a boolean array.

In [25]:
# New 8x4 frame of standard-normal samples; no index given, so rows get the default integer index.
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    columns=list('ABCD'),
)

Select all columns for the first three rows:

In [26]:
# Row positions 0, 1 and 2 (the stop position 3 is excluded); all columns.
df.iloc[:3, :]

Unnamed: 0,A,B,C,D
0,1.128326,-1.016426,2.176676,-1.001841
1,-0.547147,0.647736,2.061027,-0.633981
2,-1.124233,-0.920553,1.094191,-0.566675


Select all columns for a range of rows (START point is INCLUDED, END point is EXCLUDED):

In [27]:
# Row positions 2, 3 and 4; all columns.
df.iloc[2:5, :]

Unnamed: 0,A,B,C,D
2,-1.124233,-0.920553,1.094191,-0.566675
3,-1.212471,0.329929,-1.098116,1.406701
4,0.908864,0.164289,0.007481,1.663929


In [28]:
# Row position 5 through the last row; all columns.
df.iloc[5:, :]

Unnamed: 0,A,B,C,D
5,1.373222,-0.101277,-0.403698,1.079366
6,-0.654525,0.206636,-1.760025,-0.898749
7,0.504036,-0.518746,1.231002,-0.380985


In [29]:
# Integer slicing on both axes: rows from position 5 on, columns from position 2 on.
df.iloc[5:, 2:]

Unnamed: 0,C,D
5,-0.403698,1.079366
6,-1.760025,-0.898749
7,1.231002,-0.380985


In [30]:
# Row positions 2-3 and column positions 1-2 (stop positions are excluded).
df.iloc[2:4, 1:3]

Unnamed: 0,B,C
2,-0.920553,1.094191
3,0.329929,-1.098116


In [31]:
# Selecting with explicit lists of integer positions:
# rows 2, 4 and 6; columns 1 and 3.
df.iloc[[2, 4, 6], [1, 3]]

Unnamed: 0,B,D
2,-0.920553,-0.566675
4,0.164289,1.663929
6,0.206636,-0.898749


In [32]:
# Integer slice on the rows (positions 1 and 2), with ':' keeping every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
1,-0.547147,0.647736,2.061027,-0.633981
2,-1.124233,-0.920553,1.094191,-0.566675


In [33]:
# Integer slice on the rows only (positions 1 and 2); with no column
# indexer given, every column is returned.
df.iloc[1:3]

Unnamed: 0,A,B,C,D
1,-0.547147,0.647736,2.061027,-0.633981
2,-1.124233,-0.920553,1.094191,-0.566675


In [34]:
# Every row, with an integer slice on the columns (positions 1 and 2).
df.iloc[:, 1:3]

Unnamed: 0,B,C
0,-1.016426,2.176676
1,0.647736,2.061027
2,-0.920553,1.094191
3,0.329929,-1.098116
4,0.164289,0.007481
5,-0.101277,-0.403698
6,0.206636,-1.760025
7,-0.518746,1.231002


In [35]:
# Every row; columns picked by a list of integer positions (1 and 3).
df.iloc[:, [1, 3]]

Unnamed: 0,B,D
0,-1.016426,-1.001841
1,0.647736,-0.633981
2,-0.920553,-0.566675
3,0.329929,1.406701
4,0.164289,1.663929
5,-0.101277,1.079366
6,0.206636,-0.898749
7,-0.518746,-0.380985


Reference:
- VanderPlas, J. (2017) Python Data Science Handbook: Essential Tools for Working with Data. USA: O’Reilly Media, Inc. chapter 3