# Pandas Input/Output

In [1]:
# Uncomment to install the dependencies into the active environment
# (inside Jupyter, `%pip install ...` targets the running kernel).
# !pip install numpy
# !pip install pandas

In [2]:
#import numpy as np
import pandas as pd

## Reading data from `.csv` file

`read_csv()` reads data from a CSV file (a local path or a URL) and returns a DataFrame object.

In [3]:
# Remote CSV that lists countries together with their region.
path = 'https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv'

# Parse the file into a DataFrame, report its (rows, columns) shape,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=path)
print(df.shape)
df.head(n=5)

(194, 2)


Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA


In [4]:
# Write the DataFrame to a local CSV file. The row index is written as well
# (index=True is the default), which adds an extra unnamed first column.
df.to_csv(path_or_buf='countries.csv', index=True)

In [5]:
# Read the local CSV file back, print its shape and preview the first five rows
df2 = pd.read_csv(filepath_or_buffer='countries.csv')
print(df2.shape)
df2.head(n=5)

(194, 3)


Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA
2,2,Benin,AFRICA
3,3,Botswana,AFRICA
4,4,Burkina,AFRICA


The two DataFrames are slightly different: the one read back from the local file has an extra column. Let's compare them.

In [6]:
# First DataFrame (loaded from the URL): preview of the first five rows
df.head(n=5)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA


In [7]:
# Second DataFrame (read back from countries.csv): preview of the first five rows
df2.head(n=5)

Unnamed: 0.1,Unnamed: 0,Country,Region
0,0,Algeria,AFRICA
1,1,Angola,AFRICA
2,2,Benin,AFRICA
3,3,Botswana,AFRICA
4,4,Burkina,AFRICA


The `Unnamed: 0` column is the index column that `to_csv()` wrote to the file. How can we avoid it?

In [8]:
# Write the DataFrame to a CSV file, this time leaving the index column out
df.to_csv(path_or_buf='countries1.csv', index=False)

In [9]:
# Read countries1.csv, which was saved without the index column,
# and preview the first five rows
df3 = pd.read_csv(filepath_or_buffer='countries1.csv')
df3.head(n=5)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA


## Reading data from UCI Machine Learning Repository

We will use the [Automobile Data Set](https://archive.ics.uci.edu/ml/datasets/automobile) from the [UCI Machine Learning Repository](https://archive-beta.ics.uci.edu/).

In [10]:
# Defining the headers: one column name per field, in file order
headers = [
    "symboling",
    "normalized_losses",
    "make",
    "fuel_type",
    "aspiration",
    "num_doors",
    "body_style",
    "drive_wheels",
    "engine_location",
    "wheel_base",
    "length",
    "width",
    "height",
    "curb_weight",
    "engine_type",
    "num_cylinders",
    "engine_size",
    "fuel_system",
    "bore",
    "stroke",
    "compression_ratio",
    "horsepower",
    "peak_rpm",
    "city_mpg",
    "highway_mpg",
    "price",
]

In [11]:
# The file is read without a header row (header=None), so the column names
# are supplied explicitly; "?" entries are parsed as missing values (NaN).
dfcar = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    names=headers,
    header=None,
    na_values="?",
)
dfcar.head(n=5)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


## Reading data from `.arff` file

Student Academics Performance Dataset

In [12]:
from io import StringIO
import urllib.request
from scipy.io.arff import loadarff

In [13]:
# Address of the ARFF file, and an open HTTP response for it
# (the body is read in a later cell).
stAcademic_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00467/Sapfile1.arff"
resp = urllib.request.urlopen(url=stAcademic_url)

In [14]:
# Read the whole response body, decode it as UTF-8 text and parse it as ARFF.
# `loadarff` returns the records (`data`) and the attribute metadata (`meta`).
# The `with` block closes the HTTP response once the body has been consumed,
# so the connection is not left open for the rest of the session.
with resp:
    data, meta = loadarff(StringIO(resp.read().decode('utf-8')))

In [15]:
# `data` holds the records and `meta` holds the metadata returned by loadarff.
# Displaying `meta` lists each attribute with its type and range of values.
meta

Dataset: Sapfile1
	ge's type is nominal, range is ('M', 'F')
	cst's type is nominal, range is ('G', 'ST', 'SC', 'OBC', 'MOBC')
	tnp's type is nominal, range is ('Best', 'Vg', 'Good', 'Pass', 'Fail')
	twp's type is nominal, range is ('Best', 'Vg', 'Good', 'Pass', 'Fail')
	iap's type is nominal, range is ('Best', 'Vg', 'Good', 'Pass', 'Fail')
	esp's type is nominal, range is ('Best', 'Vg', 'Good', 'Pass', 'Fail')
	arr's type is nominal, range is ('Y', 'N')
	ms's type is nominal, range is ('Married', 'Unmarried')
	ls's type is nominal, range is ('T', 'V')
	as's type is nominal, range is ('Free', 'Paid')
	fmi's type is nominal, range is ('Vh', 'High', 'Am', 'Medium', 'Low')
	fs's type is nominal, range is ('Large', 'Average', 'Small')
	fq's type is nominal, range is ('Il', 'Um', '10', '12', 'Degree', 'Pg')
	mq's type is nominal, range is ('Il', 'Um', '10', '12', 'Degree', 'Pg')
	fo's type is nominal, range is ('Service', 'Business', 'Retired', 'Farmer', 'Others')
	mo's type is nominal, ran

In [16]:
# Build a DataFrame from the ARFF records. The column names come from the
# public MetaData.names() accessor rather than the private `_attributes`
# mapping, which is an implementation detail of scipy.
# NOTE: this rebinds `df`, which held the countries table until this cell.
columns_name = meta.names()
df = pd.DataFrame(data, columns=columns_name)
df.head(3)

Unnamed: 0,ge,cst,tnp,twp,iap,esp,arr,ms,ls,as,...,fq,mq,fo,mo,nf,sh,ss,me,tt,atd
0,b'F',b'G',b'Good',b'Good',b'Vg',b'Good',b'Y',b'Unmarried',b'V',b'Paid',...,b'Um',b'10',b'Farmer',b'Housewife',b'Large',b'Poor',b'Govt',b'Asm',b'Small',b'Good'
1,b'M',b'OBC',b'Vg',b'Vg',b'Vg',b'Vg',b'N',b'Unmarried',b'V',b'Paid',...,b'Um',b'Il',b'Service',b'Service',b'Small',b'Poor',b'Govt',b'Asm',b'Average',b'Average'
2,b'F',b'OBC',b'Good',b'Good',b'Vg',b'Good',b'N',b'Unmarried',b'V',b'Paid',...,b'12',b'10',b'Service',b'Housewife',b'Average',b'Average',b'Govt',b'Asm',b'Large',b'Good'


In some cases, values are read as byte strings stored in object columns; for instance, instead of 2, we have b'2'. To solve this problem, we go over the object columns and decode their values into regular strings.

In [17]:
# Decoding the object columns: turn byte strings (b'F') into regular str ('F').
# Only values that are still `bytes` are converted, so re-running this cell
# leaves already-decoded text untouched. A second `.str.decode` pass over
# str values would replace every entry with NaN.
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].map(lambda v: v.decode('UTF-8') if isinstance(v, bytes) else v)
df.head()

Unnamed: 0,ge,cst,tnp,twp,iap,esp,arr,ms,ls,as,...,fq,mq,fo,mo,nf,sh,ss,me,tt,atd
0,F,G,Good,Good,Vg,Good,Y,Unmarried,V,Paid,...,Um,10,Farmer,Housewife,Large,Poor,Govt,Asm,Small,Good
1,M,OBC,Vg,Vg,Vg,Vg,N,Unmarried,V,Paid,...,Um,Il,Service,Service,Small,Poor,Govt,Asm,Average,Average
2,F,OBC,Good,Good,Vg,Good,N,Unmarried,V,Paid,...,12,10,Service,Housewife,Average,Average,Govt,Asm,Large,Good
3,M,MOBC,Pass,Good,Vg,Good,N,Unmarried,V,Paid,...,12,Um,Business,Business,Large,Poor,Govt,Asm,Average,Average
4,M,G,Good,Good,Vg,Vg,N,Unmarried,V,Paid,...,10,12,Service,Housewife,Large,Poor,Private,Asm,Small,Good


In [18]:
# xlsxwriter is required by the Excel export below; uncomment to install it.
#!pip install xlsxwriter

In [19]:
import xlsxwriter

ModuleNotFoundError: No module named 'xlsxwriter'

In [None]:
# Write both country tables into one workbook, one sheet each:
#   - df3 was reloaded from the CSV saved with index=False (no extra column)
#   - df2 was reloaded from the CSV saved with the index (extra 'Unnamed: 0' column)
# `df`/`df1` are not used here: `df1` is never defined in this notebook, and
# `df` is rebound to the student dataset by the ARFF section.
# The `with` block saves and closes the workbook on exit; ExcelWriter.save()
# is no longer available in pandas 2.x.
with pd.ExcelWriter('countries.xlsx', engine='xlsxwriter') as writer:
    df3.to_excel(writer, sheet_name='WithoutIndex')
    df2.to_excel(writer, sheet_name='WithIndex')

ModuleNotFoundError: No module named 'xlsxwriter'

In [None]:
# Export the countries table to JSON. `df3` is used rather than `df` because,
# on a top-to-bottom run, `df` has been rebound to the student dataset by the
# ARFF section and would no longer contain the countries.
df3.to_json('countries.json')

In [None]:
# Load the JSON file back into a DataFrame and preview its first two rows.
dfj = pd.read_json(path_or_buf='countries.json')
dfj.head(n=2)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA


Reference:
- VanderPlas, J. (2017) Python Data Science Handbook: Essential Tools for Working with Data. USA: O’Reilly Media, Inc. chapter 3