# Pandas Input/Output

In [1]:
# Uncomment to install the dependencies. The %pip magic installs into the
# environment of the running kernel, which a plain !pip call may not do.
# %pip install numpy
# %pip install pandas

In [2]:
import numpy as np
import pandas as pd

In [3]:
# read_csv: fetch a CSV file (here over HTTP) and parse it into a DataFrame.
countries_url = 'https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv'
df = pd.read_csv(countries_url)
print(df.shape)  # (number of rows, number of columns)
df.head(2)

(194, 2)


Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [4]:
# Save the frame to disk. index=True is the default, so the row index is
# written as an extra, unnamed first column.
df.to_csv('countries.csv', index=True)

In [5]:
# Read the saved file back: the index written earlier returns as an
# ordinary data column, so the frame gains one column.
saved_csv = 'countries.csv'
df1 = pd.read_csv(saved_csv)
print(df1.shape)
df1.head(2)

(194, 3)


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [6]:
# Show the original frame and the round-tripped one, one after the other.
display(df.head(2), df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [7]:
# The 'Unnamed: 0' column is the saved row index.
# To avoid it, leave the index out when writing:
df.to_csv('countries1.csv', index=False)

In [8]:
# Reload the file written without the index and preview its first two rows.
df2 = pd.read_csv('countries1.csv')
df2.head(2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [9]:
# Show the original frame next to the one reloaded with the index column.
# NOTE(review): this repeats the earlier df/df1 comparison; comparing df with
# df2 (the frame saved with index=False) was probably intended - confirm.
display(df.head(2), df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [10]:
# Uncomment to install the Excel writer engine into the running kernel's environment.
# %pip install xlsxwriter

In [11]:
import xlsxwriter

In [12]:
# Write both frames into a single workbook, one sheet per frame.
# The context manager saves and closes the file on exit, even if a write
# raises; the explicit writer.save() call was deprecated in pandas 1.5 and
# removed in pandas 2.0.
with pd.ExcelWriter('countries.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='WithoutIndex')
    df1.to_excel(writer, sheet_name='WithIndex')

In [13]:
# Serialise the frame to a JSON file. 'columns' is the default layout for a
# DataFrame: {column -> {index -> value}}.
df.to_json('countries.json', orient='columns')

In [14]:
# Load the JSON file back into a DataFrame and preview the first two rows.
json_path = 'countries.json'
dfj = pd.read_json(json_path)
dfj.head(2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


## Indexing and selecting data

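The Python and NumPy indexing operators **`[ ]`** and attribute operator **`.`** provide quick and easy access to Pandas data structures across a wide range of use cases. However, since the type of the data to be accessed isn’t known in advance, directly using standard operators has some optimization limits. For anything beyond quick interactive work, prefer the dedicated accessors shown below: **`.loc`** for label-based selection and **`.iloc`** for position-based selection.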

In [15]:
# Build an 8x4 frame of random normal samples with string row labels
# ('a'..'h') and column labels ('A'..'D') for the label-based examples.
row_labels = list('abcdefgh')
col_labels = list('ABCD')
df = pd.DataFrame(np.random.randn(8, 4), index=row_labels, columns=col_labels)
df

Unnamed: 0,A,B,C,D
a,1.126803,0.863477,0.298589,-0.343339
b,1.164549,-0.944366,1.444286,1.156773
c,-0.846982,0.026966,0.42399,0.525681
d,1.405735,0.521341,-0.219109,0.858832
e,-0.141277,-0.680227,-1.151565,-0.134687
f,-0.472646,1.579302,1.294815,-0.432016
g,1.542131,-0.080047,0.404716,-1.023734
h,1.957213,0.624003,0.495357,-0.812024


In [16]:
# Select every row (':') of the single column 'A'; the result is a Series
df.loc[:, 'A']

a    1.126803
b    1.164549
c   -0.846982
d    1.405735
e   -0.141277
f   -0.472646
g    1.542131
h    1.957213
Name: A, dtype: float64

In [17]:
# Other way: plain [] indexing with a column label returns the same Series
df['A']

a    1.126803
b    1.164549
c   -0.846982
d    1.405735
e   -0.141277
f   -0.472646
g    1.542131
h    1.957213
Name: A, dtype: float64

In [18]:
# Select every row of two specific columns; a list of labels returns a DataFrame
df.loc[:, ['A', 'D']]

Unnamed: 0,A,D
a,1.126803,-0.343339
b,1.164549,1.156773
c,-0.846982,0.525681
d,1.405735,0.858832
e,-0.141277,-0.134687
f,-0.472646,-0.432016
g,1.542131,-1.023734
h,1.957213,-0.812024


In [19]:
# Other way: [] indexing with a list of column labels gives the same DataFrame
df[['A', 'D']]

Unnamed: 0,A,D
a,1.126803,-0.343339
b,1.164549,1.156773
c,-0.846982,0.525681
d,1.405735,0.858832
e,-0.141277,-0.134687
f,-0.472646,-0.432016
g,1.542131,-1.023734
h,1.957213,-0.812024


In [20]:
# Select a few rows (by label) and a few columns (by label) at once
df.loc[['a', 'b', 'f', 'h'], ['A', 'D']]

Unnamed: 0,A,D
a,1.126803,-0.343339
b,1.164549,1.156773
f,-0.472646,-0.432016
h,1.957213,-0.812024


In [21]:
# Select a few rows by label; leaving out the column indexer keeps all columns
df.loc[['a', 'b', 'f', 'h']]

Unnamed: 0,A,B,C,D
a,1.126803,0.863477,0.298589,-0.343339
b,1.164549,-0.944366,1.444286,1.156773
f,-0.472646,1.579302,1.294815,-0.432016
h,1.957213,0.624003,0.495357,-0.812024


In [22]:
# Select a range of rows by label, for all columns.
# Unlike a positional slice, the end label ('d') is included in the result.
df.loc['a':'d']

Unnamed: 0,A,B,C,D
a,1.126803,0.863477,0.298589,-0.343339
b,1.164549,-0.944366,1.444286,1.156773
c,-0.846982,0.026966,0.42399,0.525681
d,1.405735,0.521341,-0.219109,0.858832


In [23]:
# Get the single row labelled 'a'; the result is a Series indexed by column name
df.loc['a']

A    1.126803
B    0.863477
C    0.298589
D   -0.343339
Name: a, dtype: float64

In [24]:
# Element-wise comparison on row 'a' produces a boolean Series (one True/False per column)
df.loc['a'] > 0

A     True
B     True
C     True
D    False
Name: a, dtype: bool

`.iloc[]`: purely integer-position-based indexing. Remember that positions are 0-based.

It can only index by location, using an integer, an integer slice (the START point is INCLUDED, the END point is EXCLUDED), a list-like of integers, or a boolean array.

In [25]:
# Fresh 8x4 random frame with the default integer row index (0..7),
# used for the position-based .iloc examples. Note that this rebinds `df`.
df = pd.DataFrame(np.random.randn(8, 4), columns=list('ABCD'))

In [26]:
# Select the first three rows (positions 0, 1 and 2) with all columns
df.iloc[:3]

Unnamed: 0,A,B,C,D
0,-0.50799,1.601479,-2.113351,0.97618
1,-0.280544,-0.57995,-1.115814,-0.120316
2,-1.515031,-1.409341,0.522388,0.981942


In [27]:
# Select the rows at positions 2 to 4 with all columns (START point is INCLUDED, END point is EXCLUDED)
df.iloc[2:5]

Unnamed: 0,A,B,C,D
2,-1.515031,-1.409341,0.522388,0.981942
3,0.509661,1.268201,-2.535409,0.61758
4,-0.055191,-1.759767,-0.289831,0.373842


In [28]:
# Select the rows from position 5 to the end, with all columns
df.iloc[5:]

Unnamed: 0,A,B,C,D
5,1.145577,0.281784,-0.373822,-1.296222
6,-2.225834,-0.02547,-0.647116,0.517882
7,1.198498,0.427752,0.928001,0.251673


In [29]:
# Integer slicing on both axes: rows from position 5 on, columns from position 2 on
df.iloc[5:, 2:]

Unnamed: 0,C,D
5,-0.373822,-1.296222
6,-0.647116,0.517882
7,0.928001,0.251673


In [30]:
# Rows at positions 2-3 and columns at positions 1-2 (end points excluded)
df.iloc[2:4, 1:3]

Unnamed: 0,B,C
2,-1.409341,0.522388
3,1.268201,-2.535409


In [31]:
# Select by lists of positions: rows 2, 4 and 6, columns 1 and 3
df.iloc[[2, 4, 6], [1, 3]]

Unnamed: 0,B,D
2,-1.409341,0.981942
4,-1.759767,0.373842
6,-0.02547,0.517882


In [32]:
# Row slice with an explicit ':' for the columns: rows at positions 1-2, every column
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
1,-0.280544,-0.57995,-1.115814,-0.120316
2,-1.515031,-1.409341,0.522388,0.981942


In [33]:
# Row slice only: leaving out the column indexer keeps every column
df.iloc[1:3]

Unnamed: 0,A,B,C,D
1,-0.280544,-0.57995,-1.115814,-0.120316
2,-1.515031,-1.409341,0.522388,0.981942


In [34]:
# Column slice: every row, columns at positions 1-2 (end point excluded)
df.iloc[:, 1:3]

Unnamed: 0,B,C
0,1.601479,-2.113351
1,-0.57995,-1.115814
2,-1.409341,0.522388
3,1.268201,-2.535409
4,-1.759767,-0.289831
5,0.281784,-0.373822
6,-0.02547,-0.647116
7,0.427752,0.928001


In [35]:
# Every row, columns picked by a list of positions (1 and 3)
df.iloc[:, [1, 3]]

Unnamed: 0,B,D
0,1.601479,0.97618
1,-0.57995,-0.120316
2,-1.409341,0.981942
3,1.268201,0.61758
4,-1.759767,0.373842
5,0.281784,-1.296222
6,-0.02547,0.517882
7,0.427752,0.251673
