# Pandas Input/Output

In [1]:
# !pip install numpy
# !pip install pandas

In [2]:
import numpy as np
import pandas as pd

**read_csv()**: reads data from the csv files and creates a DataFrame object.

In [3]:
# Load the countries table straight from its raw GitHub URL (needs network access).
path = 'https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv'
df = pd.read_csv(path)
# print() is needed for the shape: only the cell's last expression is echoed automatically.
print(df.shape)
df.head(2)

(194, 2)


Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [4]:
# With default arguments to_csv() also writes the row index, as a first column with an empty header.
df.to_csv('countries.csv')

In [5]:
# Read the file back: the saved index returns as an ordinary 'Unnamed: 0' column,
# so df1 ends up with one more column than df.
df1 = pd.read_csv('countries.csv')
print(df1.shape)
df1.head(2)

(194, 3)


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [6]:
# Compare the original frame with the CSV round trip that kept the index column.
display(df.head(2), df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


The Unnamed column is the index column. To avoid it:

In [7]:
# index=False leaves the row index out of the written file.
df.to_csv('countries1.csv', index=False)

In [8]:
# Re-read the index-free file: no extra 'Unnamed: 0' column appears this time.
df2 = pd.read_csv('countries1.csv')
df2.head(2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [9]:
# NOTE(review): this repeats the earlier df / df1 comparison unchanged; df2 (the frame
# written with index=False) was presumably the intended second argument - confirm.
display(df.head(2), df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [10]:
#!pip install xlsxwriter

In [11]:
import xlsxwriter

In [12]:
# Write both frames into one workbook, one sheet per frame.
# The context manager saves and closes the file when the block exits, even if a
# to_excel() call raises. The previous explicit writer.save() was deprecated in
# pandas 1.5 and removed in pandas 2.0, where it raises AttributeError.
with pd.ExcelWriter('countries.xlsx', engine='xlsxwriter') as writer:
    # df is the original frame; df1 is the CSV round trip that carries the
    # saved index as an extra 'Unnamed: 0' column.
    df.to_excel(writer, sheet_name='WithoutIndex')
    df1.to_excel(writer, sheet_name='WithIndex')

In [13]:
# Serialize the frame to a JSON file using to_json()'s default layout.
df.to_json('countries.json')

In [14]:
# Load the JSON file back into a DataFrame and preview the first two rows.
dfj = pd.read_json('countries.json')
dfj.head(2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


## Indexing and selecting data

The Python and NumPy indexing operators **`[ ]`** and attribute operator **`.`** provide quick and easy access to Pandas data structures across a wide range of use cases. However, since the type of the data to be accessed isn’t known in advance, directly using standard operators has some optimization limits. 

In [15]:
# 8x4 frame of standard-normal draws; rows are labelled 'a'..'h', columns 'A'..'D'.
# No seed is set, so the values differ on every run.
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
a,-0.416605,2.523586,-0.091028,0.260955
b,-1.913254,0.323198,1.026196,0.197835
c,1.83567,0.058204,0.373246,-0.577293
d,0.300189,-2.5292,1.517771,-1.074663
e,2.104924,-0.348908,1.143179,-1.46498
f,-0.767297,-0.232218,1.347255,0.21228
g,0.290155,-0.048612,-0.138592,-0.71435
h,-1.298088,-0.939013,-0.308361,0.569989


Select all rows for a specific column:

In [16]:
# Label-based selection: every row (":") of column 'A'; the result is a Series.
df.loc[:, 'A']

a   -0.416605
b   -1.913254
c    1.835670
d    0.300189
e    2.104924
f   -0.767297
g    0.290155
h   -1.298088
Name: A, dtype: float64

Another way:

In [17]:
# Plain [] with a single column label returns that column as a Series.
df['A']

a   -0.416605
b   -1.913254
c    1.835670
d    0.300189
e    2.104924
f   -0.767297
g    0.290155
h   -1.298088
Name: A, dtype: float64

Select all rows for two specific columns:

In [18]:
# A list of column labels keeps the result a DataFrame.
df.loc[:, ['A', 'D']]

Unnamed: 0,A,D
a,-0.416605,0.260955
b,-1.913254,0.197835
c,1.83567,-0.577293
d,0.300189,-1.074663
e,2.104924,-1.46498
f,-0.767297,0.21228
g,0.290155,-0.71435
h,-1.298088,0.569989


Another way:

In [19]:
# Plain [] with a list of labels selects those columns as a DataFrame.
df[['A', 'D']]

Unnamed: 0,A,D
a,-0.416605,0.260955
b,-1.913254,0.197835
c,1.83567,-0.577293
d,0.300189,-1.074663
e,2.104924,-1.46498
f,-0.767297,0.21228
g,0.290155,-0.71435
h,-1.298088,0.569989


Select a few rows for multiple columns:

In [20]:
# First list picks row labels, second list picks column labels.
df.loc[['a', 'b', 'f', 'h'], ['A', 'D']]

Unnamed: 0,A,D
a,-0.416605,0.260955
b,-1.913254,0.197835
f,-0.767297,0.21228
h,-1.298088,0.569989


Select a few rows for all columns:

In [21]:
# With only a row indexer, all columns are returned.
df.loc[['a', 'b', 'f', 'h']]

Unnamed: 0,A,B,C,D
a,-0.416605,2.523586,-0.091028,0.260955
b,-1.913254,0.323198,1.026196,0.197835
f,-0.767297,-0.232218,1.347255,0.21228
h,-1.298088,-0.939013,-0.308361,0.569989


Select range of rows for all columns:

In [22]:
# Label slices include BOTH endpoints: rows 'a' through 'd' are returned.
df.loc['a':'d']

Unnamed: 0,A,B,C,D
a,-0.416605,2.523586,-0.091028,0.260955
b,-1.913254,0.323198,1.026196,0.197835
c,1.83567,0.058204,0.373246,-0.577293
d,0.300189,-2.5292,1.517771,-1.074663


Getting all values with index 'a':

In [23]:
# A single row label returns that row as a Series indexed by the column names.
df.loc['a']

A   -0.416605
B    2.523586
C   -0.091028
D    0.260955
Name: a, dtype: float64

Building a boolean array by testing which values in row 'a' are positive:

In [24]:
# Element-wise comparison on row 'a': yields a boolean Series with one flag per column.
df.loc['a'] > 0

A    False
B     True
C    False
D     True
Name: a, dtype: bool

`.iloc[]`: purely integer-position-based indexing. Remember that positions are 0-based.

It can only index by position, using an integer, an integer slice (START point is INCLUDED, END point is EXCLUDED), a list-like of integers, or a boolean array.

In [25]:
# Fresh 8x4 random frame for the .iloc examples; no index is passed, so rows get 0..7.
df = pd.DataFrame(np.random.randn(8, 4), columns=list('ABCD'))

Select all columns for specific rows:

In [26]:
# Rows at positions 0, 1 and 2 (the end position 3 is excluded), all columns.
df.iloc[:3]

Unnamed: 0,A,B,C,D
0,0.563025,-1.242159,0.628724,-2.4411
1,-0.788893,2.024489,0.528358,-1.092104
2,1.119425,-0.722647,0.41359,-0.217465


Select all columns for a range of rows: (START point is INCLUDED, END point is EXCLUDED)

In [27]:
# Rows at positions 2, 3 and 4, all columns.
df.iloc[2:5]

Unnamed: 0,A,B,C,D
2,1.119425,-0.722647,0.41359,-0.217465
3,-0.181219,0.419951,-0.046761,1.851334
4,-0.346059,2.014616,-0.22893,-0.396133


In [28]:
# Open-ended slice: rows from position 5 to the end, all columns.
df.iloc[5:]

Unnamed: 0,A,B,C,D
5,1.052242,-1.483241,0.083687,-0.951692
6,-0.613545,-0.152512,-1.442896,0.083793
7,-1.330718,0.096765,-0.466274,-0.155365


In [29]:
# Integer slicing on both axes: rows from position 5 onward, columns from position 2 onward.
df.iloc[5:, 2:]

Unnamed: 0,C,D
5,0.083687,-0.951692
6,-1.442896,0.083793
7,-0.466274,-0.155365


In [30]:
# Bounded slices on both axes: rows 2-3 and columns 1-2 (end positions excluded).
df.iloc[2:4, 1:3]

Unnamed: 0,B,C
2,-0.722647,0.41359
3,0.419951,-0.046761


In [31]:
# Selecting with explicit lists of positions: rows 2, 4, 6 and columns 1, 3.
df.iloc[[2, 4, 6], [1, 3]]

Unnamed: 0,B,D
2,-0.722647,-0.217465
4,2.014616,-0.396133
6,-0.152512,0.083793


In [32]:
# Integer slice on rows (positions 1 and 2; end excluded), explicit ":" for all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
1,-0.788893,2.024489,0.528358,-1.092104
2,1.119425,-0.722647,0.41359,-0.217465


In [33]:
# Row slice only: with the column indexer omitted, every column is kept.
df.iloc[1:3]

Unnamed: 0,A,B,C,D
1,-0.788893,2.024489,0.528358,-1.092104
2,1.119425,-0.722647,0.41359,-0.217465


In [34]:
# Integer slice on columns (positions 1 and 2; end excluded), all rows.
df.iloc[:, 1:3]

Unnamed: 0,B,C
0,-1.242159,0.628724
1,2.024489,0.528358
2,-0.722647,0.41359
3,0.419951,-0.046761
4,2.014616,-0.22893
5,-1.483241,0.083687
6,-0.152512,-1.442896
7,0.096765,-0.466274


In [35]:
# All rows, columns picked by an explicit list of positions (1 and 3).
df.iloc[:, [1, 3]]

Unnamed: 0,B,D
0,-1.242159,-2.4411
1,2.024489,-1.092104
2,-0.722647,-0.217465
3,0.419951,1.851334
4,2.014616,-0.396133
5,-1.483241,-0.951692
6,-0.152512,0.083793
7,0.096765,-0.155365
