# Pandas Input/Output

In [1]:
# !pip install numpy
# !pip install pandas

In [1]:
import numpy as np
import pandas as pd

**read_csv()**: reads data from a CSV file (local path or URL) and creates a DataFrame object.

In [2]:
# Remote CSV with one row per country and the region it belongs to.
path = 'https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv'
df = pd.read_csv(filepath_or_buffer=path)

# Report (rows, columns), then preview the first two records.
print(df.shape)
df.head(n=2)

(194, 2)


Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [3]:
# Save to a local CSV; with default options the row index is written as the first column.
df.to_csv(path_or_buf='countries.csv')

In [4]:
# Read the saved file back; the index written by to_csv() comes back as an extra column.
df1 = pd.read_csv(filepath_or_buffer='countries.csv')

print(df1.shape)
df1.head(n=2)

(194, 3)


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [5]:
# Show the original frame and the round-tripped one back to back.
display(df.head(2), df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


The `Unnamed: 0` column in `df1` is the old row index, which `to_csv()` writes to the file by default. To avoid it, pass `index=False`:

In [6]:
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='countries1.csv', index=False)

In [7]:
# Reload the index-free file and preview it.
df2 = pd.read_csv(filepath_or_buffer='countries1.csv')
df2.head(n=2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


In [8]:
# Show the original frame next to the copy that was saved with its index.
# NOTE(review): this repeats the earlier df/df1 comparison; df2 (saved with
# index=False) may have been the intended second frame -- confirm.
display(df.head(2), df1.head(2))

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA


In [9]:
#!pip install xlsxwriter

In [10]:
import xlsxwriter

In [11]:
# Write both frames into one workbook, one sheet per frame.
# The context manager saves and closes the file when the block exits (even if a
# write raises). The explicit writer.save() call was deprecated in pandas 1.5
# and removed in pandas 2.0, so it fails on current versions.
with pd.ExcelWriter('countries.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='WithoutIndex')
    df1.to_excel(writer, sheet_name='WithIndex')

In [12]:
# Serialize the frame to a JSON file in the working directory.
df.to_json(path_or_buf='countries.json')

In [13]:
# Load the JSON file back into a DataFrame and preview it.
dfj = pd.read_json(path_or_buf='countries.json')
dfj.head(n=2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


## Indexing and selecting data

The Python and NumPy indexing operators **`[ ]`** and attribute operator **`.`** provide quick and easy access to Pandas data structures across a wide range of use cases. However, since the type of the data to be accessed isn’t known in advance, directly using standard operators has some optimization limits. 

In [14]:
# 8x4 frame of standard-normal samples: rows labelled 'a'..'h', columns 'A'..'D'.
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
a,0.37102,0.138565,-0.48167,0.620125
b,1.062209,-1.571751,-0.028899,-1.369734
c,0.989922,-1.131,1.291188,0.336638
d,0.134473,0.314459,-1.23086,-0.183485
e,-0.439398,0.48632,0.208711,-0.002978
f,0.00776,0.046532,0.781376,0.391646
g,-1.817916,0.671497,0.449494,-1.522704
h,-1.260155,0.12614,-0.121182,-1.678111


Select all rows for a specific column:

In [15]:
# .loc[rows, columns]: ':' keeps every row and 'A' picks one column, giving a Series.
df.loc[:,'A']

a    0.371020
b    1.062209
c    0.989922
d    0.134473
e   -0.439398
f    0.007760
g   -1.817916
h   -1.260155
Name: A, dtype: float64

Another way:

In [16]:
# Plain [] with a single column label returns the same Series as df.loc[:, 'A'].
df['A']

a    0.371020
b    1.062209
c    0.989922
d    0.134473
e   -0.439398
f    0.007760
g   -1.817916
h   -1.260155
Name: A, dtype: float64

Select all rows for two specific columns:

In [17]:
# A list of column labels returns a DataFrame restricted to those columns.
df.loc[:,['A','D']]

Unnamed: 0,A,D
a,0.37102,0.620125
b,1.062209,-1.369734
c,0.989922,0.336638
d,0.134473,-0.183485
e,-0.439398,-0.002978
f,0.00776,0.391646
g,-1.817916,-1.522704
h,-1.260155,-1.678111


Another way:

In [18]:
# [] with a list of labels selects the same columns as df.loc[:, ['A', 'D']].
df[['A','D']]

Unnamed: 0,A,D
a,0.37102,0.620125
b,1.062209,-1.369734
c,0.989922,0.336638
d,0.134473,-0.183485
e,-0.439398,-0.002978
f,0.00776,0.391646
g,-1.817916,-1.522704
h,-1.260155,-1.678111


Select a few rows for multiple columns:

In [19]:
# Label lists on both axes: rows a, b, f, h restricted to columns A and D.
df.loc[['a','b','f','h'],['A','D']]

Unnamed: 0,A,D
a,0.37102,0.620125
b,1.062209,-1.369734
f,0.00776,0.391646
h,-1.260155,-1.678111


Select a few rows for all columns:

In [20]:
# With only a row selector, every column is kept.
df.loc[['a','b','f','h']]

Unnamed: 0,A,B,C,D
a,0.37102,0.138565,-0.48167,0.620125
b,1.062209,-1.571751,-0.028899,-1.369734
f,0.00776,0.046532,0.781376,0.391646
h,-1.260155,0.12614,-0.121182,-1.678111


Select a range of rows for all columns (with `.loc`, both the START and the END label are INCLUDED):

In [21]:
# Label slices with .loc include both endpoints: rows 'a' through 'd'.
df.loc['a':'d']

Unnamed: 0,A,B,C,D
a,0.37102,0.138565,-0.48167,0.620125
b,1.062209,-1.571751,-0.028899,-1.369734
c,0.989922,-1.131,1.291188,0.336638
d,0.134473,0.314459,-1.23086,-0.183485


Getting all values with index 'a':

In [22]:
# A single row label returns that row as a Series indexed by column name.
df.loc['a']

A    0.371020
B    0.138565
C   -0.481670
D    0.620125
Name: a, dtype: float64

Getting a boolean Series by comparing the values of a row with a threshold:

In [23]:
# Element-wise comparison: True where the value in row 'a' is greater than zero.
df.loc['a'] > 0

A     True
B     True
C    False
D     True
Name: a, dtype: bool

`.iloc[]`: purely integer-position-based indexing. Remember that positions are 0-based.

It can only index by location, using an integer, an integer slice (START point is INCLUDED, END point is EXCLUDED), a list-like of integers, or a boolean array.

In [24]:
# Fresh 8x4 frame of standard-normal samples; no index is given, so rows get the default labels 0..7.
df = pd.DataFrame(np.random.randn(8, 4), columns=list('ABCD'))

Select all columns for specific rows (here, the first three):

In [25]:
# First three rows by position (0, 1, 2); the stop position 3 is excluded.
df.iloc[:3]

Unnamed: 0,A,B,C,D
0,0.144474,0.819853,0.349376,-1.068588
1,-1.732468,-0.049538,-2.036552,0.406665
2,0.879394,-0.147031,-0.65466,-0.586319


Select all columns for a range of rows (START point is INCLUDED, END point is EXCLUDED):

In [26]:
# Rows at positions 2, 3 and 4; position 5 is excluded.
df.iloc[2:5]

Unnamed: 0,A,B,C,D
2,0.879394,-0.147031,-0.65466,-0.586319
3,-0.55411,-1.369475,1.495081,2.037702
4,-0.010589,-1.933997,-0.340986,-0.454538


In [27]:
# Rows from position 5 through the end of the frame.
df.iloc[5:]

Unnamed: 0,A,B,C,D
5,0.059615,0.233289,1.245236,0.484754
6,0.267965,0.421298,0.500368,0.003211
7,0.869325,1.584976,-0.891913,2.613731


In [28]:
# Integer slicing on both axes: rows from position 5 on, columns from position 2 on (C and D).
df.iloc[5:,2:]

Unnamed: 0,C,D
5,1.245236,0.484754
6,0.500368,0.003211
7,-0.891913,2.613731


In [29]:
# Rows at positions 2-3 and columns at positions 1-2 (B and C); both stop positions are excluded.
df.iloc[2:4,1:3]

Unnamed: 0,B,C
2,-0.147031,-0.65466
3,-1.369475,1.495081


In [30]:
# Selecting with lists of integer positions: rows 2, 4, 6 and columns 1, 3 (B and D).
df.iloc[[2, 4, 6], [1, 3]]

Unnamed: 0,B,D
2,-0.147031,-0.586319
4,-1.933997,-0.454538
6,0.421298,0.003211


In [31]:
# Integer slice for rows at positions 1-2; ':' keeps every column.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
1,-1.732468,-0.049538,-2.036552,0.406665
2,0.879394,-0.147031,-0.65466,-0.586319


In [32]:
# Row slice only: leaving out the column selector also keeps every column.
df.iloc[1:3]

Unnamed: 0,A,B,C,D
1,-1.732468,-0.049538,-2.036552,0.406665
2,0.879394,-0.147031,-0.65466,-0.586319


In [33]:
# ':' keeps every row; the column slice picks positions 1-2 (B and C).
df.iloc[:,1:3]

Unnamed: 0,B,C
0,0.819853,0.349376
1,-0.049538,-2.036552
2,-0.147031,-0.65466
3,-1.369475,1.495081
4,-1.933997,-0.340986
5,0.233289,1.245236
6,0.421298,0.500368
7,1.584976,-0.891913


In [34]:
# All rows, with columns given as a list of positions: 1 and 3 (B and D).
df.iloc[:,[1,3]]

Unnamed: 0,B,D
0,0.819853,-1.068588
1,-0.049538,0.406665
2,-0.147031,-0.586319
3,-1.369475,2.037702
4,-1.933997,-0.454538
5,0.233289,0.484754
6,0.421298,0.003211
7,1.584976,2.613731
