# Pandas Input/Output

In [1]:
import numpy as np
import pandas as pd

In [2]:
# read_csv: reads data from the csv files and creates a DataFrame object.
df = pd.read_csv('https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv')
print(df.shape)
df.head(2)

(194, 2)


Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [3]:
df.to_csv('countries.csv')

In [4]:
df1 = pd.read_csv('countries.csv')
print(df1.shape)
df1.head(2)

(194, 3)


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [5]:
display(df.head(2))
display(df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [6]:
# The Unnamed column is the index column. To avoit it:
df.to_csv('countries1.csv',index=False)

In [7]:
df2 = pd.read_csv('countries1.csv')
df2.head(2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [8]:
display(df.head(2))
display(df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [9]:
#pip install xlsxwriter

In [10]:
import xlsxwriter

In [11]:
writer = pd.ExcelWriter('countries.xlsx', engine='xlsxwriter')
df.to_excel(writer,sheet_name='WithoutIndex')
df1.to_excel(writer,sheet_name='WithIndex')
writer.save()

In [12]:
df.to_json('countries.json')

In [13]:
dfj = pd.read_json('countries.json')
dfj.head(2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


## Indexing and selecting data

The Python and NumPy indexing operators **`[ ]`** and attribute operator **`.`** provide quick and easy access to Pandas data structures across a wide range of use cases. However, since the type of the data to be accessed isnâ€™t known in advance, directly using standard operators has some optimization limits. 

In [14]:
df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
a,-0.824068,0.277242,0.957826,1.029719
b,1.390744,-0.053485,0.524142,0.613801
c,0.256386,0.650569,0.299202,-0.58156
d,0.147646,-1.309497,-0.763611,0.152258
e,0.775165,-0.020633,0.041341,-0.70304
f,-0.28262,0.678125,0.177785,0.883451
g,0.854318,-0.120538,0.247675,2.003215
h,-1.05635,-0.848882,0.247997,-1.660345


In [15]:
# select all rows for a specific column
df.loc[:,'A']

a   -0.824068
b    1.390744
c    0.256386
d    0.147646
e    0.775165
f   -0.282620
g    0.854318
h   -1.056350
Name: A, dtype: float64

In [16]:
# Other way
df['A']

a   -0.824068
b    1.390744
c    0.256386
d    0.147646
e    0.775165
f   -0.282620
g    0.854318
h   -1.056350
Name: A, dtype: float64

In [17]:
# select all rows for two specific column
df.loc[:,['A','D']]

Unnamed: 0,A,D
a,-0.824068,1.029719
b,1.390744,0.613801
c,0.256386,-0.58156
d,0.147646,0.152258
e,0.775165,-0.70304
f,-0.28262,0.883451
g,0.854318,2.003215
h,-1.05635,-1.660345


In [18]:
# other way
df[['A','D']]

Unnamed: 0,A,D
a,-0.824068,1.029719
b,1.390744,0.613801
c,0.256386,-0.58156
d,0.147646,0.152258
e,0.775165,-0.70304
f,-0.28262,0.883451
g,0.854318,2.003215
h,-1.05635,-1.660345


In [19]:
# Select few rows for multiple columns
df.loc[['a','b','f','h'],['A','D']]

Unnamed: 0,A,D
a,-0.824068,1.029719
b,1.390744,0.613801
f,-0.28262,0.883451
h,-1.05635,-1.660345


In [20]:
# Select few rows for all columns
df.loc[['a','b','f','h']]

Unnamed: 0,A,B,C,D
a,-0.824068,0.277242,0.957826,1.029719
b,1.390744,-0.053485,0.524142,0.613801
f,-0.28262,0.678125,0.177785,0.883451
h,-1.05635,-0.848882,0.247997,-1.660345


In [21]:
# Select range of rows for all columns
df.loc['a':'d']

Unnamed: 0,A,B,C,D
a,-0.824068,0.277242,0.957826,1.029719
b,1.390744,-0.053485,0.524142,0.613801
c,0.256386,0.650569,0.299202,-0.58156
d,0.147646,-1.309497,-0.763611,0.152258


In [22]:
# getting all values with index 'a'
df.loc['a']

A   -0.824068
B    0.277242
C    0.957826
D    1.029719
Name: a, dtype: float64

In [23]:
# getting values with a boolean array
df.loc['a'] > 0

A    False
B     True
C     True
D     True
Name: a, dtype: bool

`.iloc()`: get purely integer based indexing. Remember these are 0-based indexing.

It can only index by location with a integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array

In [24]:
df = pd.DataFrame(np.random.randn(8, 4),columns = ['A', 'B', 'C', 'D'])

In [25]:
# select all columns for a specific rows
df.iloc[:3]

Unnamed: 0,A,B,C,D
0,-1.071311,0.625029,0.056439,-1.314934
1,-1.069284,1.018209,-1.287638,0.469411
2,0.737471,0.339346,0.695109,0.989799


In [26]:
# select all columns for a specific rows (START point is INCLUDED, END point is EXCLUDED)
df.iloc[2:5]

Unnamed: 0,A,B,C,D
2,0.737471,0.339346,0.695109,0.989799
3,-0.132159,-0.862307,-0.715084,-0.220501
4,0.963155,0.593142,0.25304,-0.92004


In [27]:
df.iloc[5:]

Unnamed: 0,A,B,C,D
5,-0.772041,0.768257,1.066481,-1.022398
6,2.01012,-1.116248,-0.196649,0.015375
7,-0.651416,-0.715123,1.457635,1.074149


In [28]:
# Integer Slicing
df.iloc[5:,2:]

Unnamed: 0,C,D
5,1.066481,-1.022398
6,-0.196649,0.015375
7,1.457635,1.074149


In [29]:
df.iloc[2:4,1:3]

Unnamed: 0,B,C
2,0.339346,0.695109
3,-0.862307,-0.715084


In [30]:
# Slicing through list of values
df.iloc[[2, 4, 6], [1, 3]]

Unnamed: 0,B,D
2,0.339346,0.989799
4,0.593142,-0.92004
6,-1.116248,0.015375


In [31]:
# Slicing through list of values
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
1,-1.069284,1.018209,-1.287638,0.469411
2,0.737471,0.339346,0.695109,0.989799


In [32]:
# Slicing through list of values
df.iloc[1:3]

Unnamed: 0,A,B,C,D
1,-1.069284,1.018209,-1.287638,0.469411
2,0.737471,0.339346,0.695109,0.989799


In [33]:
# Slicing through list of values
df.iloc[:,1:3]

Unnamed: 0,B,C
0,0.625029,0.056439
1,1.018209,-1.287638
2,0.339346,0.695109
3,-0.862307,-0.715084
4,0.593142,0.25304
5,0.768257,1.066481
6,-1.116248,-0.196649
7,-0.715123,1.457635


In [34]:
df.iloc[:,[1,3]]

Unnamed: 0,B,D
0,0.625029,-1.314934
1,1.018209,0.469411
2,0.339346,0.989799
3,-0.862307,-0.220501
4,0.593142,-0.92004
5,0.768257,-1.022398
6,-1.116248,0.015375
7,-0.715123,1.074149
