# 5 Simple Python Ways to Open and Read Your Dataset

In [1]:
import string

In [2]:
# Input files, given as relative paths (resolved against the working directory).
# file1: text corpus with one "<label>\t<message>" record per line.
# file2: tab-delimited numeric table with a single header row.
file1 = "SMSSpamCollection"
file2 = "sat_water_table.txt"

## Read as a plain text file

In [3]:
# Holds one list of tokens per message read from file1 with the built-in open().
content_lines = []

In [4]:
# Translation table that deletes every ASCII punctuation character. It is built
# once up front because it does not depend on the line being processed.
strip_punctuation = str.maketrans('', '', string.punctuation)

# Each line is "<label>\t<message>": strip punctuation, lowercase, split on
# whitespace, and drop the first token (the label). The list is rebuilt from
# scratch so that re-running this cell does not append duplicate rows.
with open(file1, "r", encoding='utf-8') as content:
    content_lines = [
        line.translate(strip_punctuation).lower().split()[1:]
        for line in content
    ]

In [5]:
# Sanity check: show the tokens of the first message.
print(content_lines[0])

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


## Read file with custom encoding

In [6]:
# Holds one list of tokens per message read from file1 through the codecs module.
content_codecs = []

In [7]:
import codecs

# Translation table that deletes every ASCII punctuation character; built once
# because it is the same for every line.
strip_punctuation = str.maketrans('', '', string.punctuation)

# codecs.open() returns a stream that decodes with the given encoding on read.
# NOTE: codecs.open() is deprecated as of Python 3.14 in favour of the built-in
# open(); it is kept here because this section demonstrates the codecs module.
# Iterating the stream yields the same lines as calling readline() until it
# returns an empty string. The list is rebuilt on every run so re-executing the
# cell does not duplicate rows.
with codecs.open(file1, 'r', encoding='utf-8') as content:
    content_codecs = [
        line.translate(strip_punctuation).lower().split()[1:]
        for line in content
    ]

In [8]:
# Sanity check: show the tokens of the first message.
print(content_codecs[0])

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


## Read with the standard csv reader

In [9]:
# Rows of file1 as parsed by csv.reader, once per delimiter:
# content_csv_space uses a space, content_csv_tab uses a tab.
content_csv_space = []
content_csv_tab = []

In [10]:
import csv

def _read_delimited_rows(path, delimiter):
    """Return every row of the text file at ``path`` as a list of string fields.

    Args:
        path: Path of the file to read (decoded as UTF-8).
        delimiter: Single character that separates the fields of a line.

    Returns:
        A list containing one list of strings per row.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(path, newline='', encoding='utf-8') as csvfile:
        # QUOTE_NONE: the messages are free text, so a double quote is ordinary
        # data. With the default quoting, an unbalanced '"' opens a quoted
        # field and silently merges the following lines into a single row.
        reader = csv.reader(csvfile, delimiter=delimiter, quoting=csv.QUOTE_NONE)
        return list(reader)


# Splitting on spaces leaves the label glued to the first word ('ham\tGo');
# splitting on tabs yields [label, message] rows. Both lists are rebuilt on
# every run, so re-executing the cell does not duplicate rows.
content_csv_space = _read_delimited_rows(file1, ' ')
content_csv_tab = _read_delimited_rows(file1, '\t')

In [11]:
# Compare the first row as parsed with each delimiter (space, then tab).
print(content_csv_space[0])
print(content_csv_tab[0])

['ham\tGo', 'until', 'jurong', 'point,', 'crazy..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'Cine', 'there', 'got', 'amore', 'wat...']
['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']


## Read with pandas

In [12]:
import pandas as pd

# Parse the tab-separated table into a DataFrame; with the default header
# handling, the first line of the file supplies the column names.
table_numbers = pd.read_table(file2, sep='\t')

In [13]:
# Bare expression as the last line of the cell: the notebook displays the DataFrame.
table_numbers

Unnamed: 0,"Temperature, degC","Pressure, bar","hf, kJ/kg","hg, kJ/kg","sf, kJ/(kg K)","sg, kJ/(kg K)","vf, m^3/kg","vg, m^3/kg"
0,0.01,0.006117,0.000612,2500.910995,0.000000,9.155491,0.001000,205.997459
1,4.00,0.008135,16.812717,2508.237465,0.061101,9.050556,0.001000,157.121332
2,5.00,0.008726,21.019356,2510.071717,0.076252,9.024859,0.001000,147.016865
3,6.00,0.009354,25.223663,2511.905130,0.091340,8.999397,0.001000,137.638180
4,8.00,0.010730,33.626017,2515.569328,0.121332,8.949167,0.001000,120.834430
...,...,...,...,...,...,...,...,...
65,300.00,85.877083,1344.771339,2749.573743,3.254741,5.705764,0.001404,0.021663
66,320.00,112.838559,1462.051009,2700.667687,3.449116,5.537319,0.001499,0.015476
67,340.00,146.001811,1594.446570,2622.066748,3.659949,5.335912,0.001638,0.010784
68,360.00,186.663711,1761.491759,2481.000000,3.916358,5.052600,0.001895,0.006945


## Read with numpy

In [14]:
import numpy as np
# Load the table as a float array. skiprows=1 skips the header line, which
# np.loadtxt cannot convert to numbers. The path comes from file2 rather than
# a repeated literal, so this cell and the pandas cell always read the same file.
np_numbers = np.loadtxt(file2, skiprows=1, delimiter='\t')

In [15]:
# suppress=True prints floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Display only the first two rows of the array.
np_numbers[:2]

array([[   0.01      ,    0.00611657,    0.00061178, 2500.91099464,
           0.        ,    9.15549147,    0.00100021,  205.99745949],
       [   4.        ,    0.00813549,   16.8127173 , 2508.23746453,
           0.06110098,    9.05055623,    0.00100007,  157.12133225]])