In [1]:
import csv
import numpy as np
import pandas as pd

In [11]:
fname = "spambase.data"


# Approach 1: the standard-library csv module - least efficient of the loaders

# newline="" is what the csv module documentation asks for when a file object
# is handed to csv.reader, so the reader itself deals with line endings.
with open(fname, "r", newline="") as f:
    data = list(csv.reader(f, delimiter=","))
# csv.reader yields every field as text, so this is a 2-D array of strings.
data = np.array(data)
print(data.shape)
n_samples, n_features = data.shape

# The last column is split off as y; all remaining columns form X.
n_features -= 1
X = data[:, 0:n_features]
y = data[:, n_features]
print(X.shape)
print(y.shape)

(4601, 58)
(4601, 57)
(4601,)


In [12]:
# Approach 2: NumPy's plain-text loaders - better than the csv module

# np.loadtxt: straightforward reader for a fully numeric, delimited file
data = np.loadtxt(fname, delimiter=",")
print(data.shape)

# np.genfromtxt: more options, such as handling of missing data; the values
# are stored as float32 here
data = np.genfromtxt(
    fname,
    dtype=np.float32,
    delimiter=",",
)
print(data.shape)

(4601, 58)
(4601, 58)


In [21]:
# Approach 3: pandas

# With the default header handling, read_csv uses the first line of the file
# as column names, so the resulting array has one row fewer.
df = pd.read_csv(fname, delimiter=",")
data = df.to_numpy()
print(data.shape)

# header=None keeps the first line as data, so every row is retained.
df = pd.read_csv(
    fname,
    header=None,
    delimiter=",",
)
data = df.to_numpy()
print(data.shape)

(4600, 58)
(4601, 58)


In [14]:
# Fix the element type up front, while the file is being read

df = pd.read_csv(
    fname,
    header=None,
    delimiter=",",
    dtype=np.float32,
)
data = df.to_numpy()
print(data.shape)
# Inspect one element to confirm the dtype that was requested.
print(type(data[0, 0]))

(4601, 58)
<class 'numpy.float32'>


In [23]:
# Alternative: let pandas infer the column types, then cast the array afterwards
df = pd.read_csv(fname, header=None, delimiter=",")
data = np.asarray(df.to_numpy(), dtype=np.float32)
print(data.shape)
# Inspect one element to confirm the cast took effect.
print(type(data[0, 0]))

(4601, 58)
<class 'numpy.float32'>


In [16]:
# When the file starts with a header line, skip that line while loading.
# Both calls below assume the first row is the header and drop it.
data = np.genfromtxt(
    fname,
    dtype=np.float32,
    delimiter=",",
    skip_header=1,
)
df = pd.read_csv(
    fname,
    dtype=np.float32,
    delimiter=",",
    header=None,
    skiprows=1,
)

In [22]:
# Data set with missing values
# Cells containing the placeholder text are treated as missing. Left alone
# they would come back as 'nan'; here both loaders replace them with the same
# sentinel so the NumPy array and the DataFrame agree.

MISSING_TOKEN = "Hello"  # text that marks a missing cell in the file
FILL_VALUE = 9999.0  # single sentinel shared by both loaders
# NOTE(review): choose a sentinel that cannot occur as a real value in the data.

# The first line is skipped in both calls (assumed to be a header row).
data = np.genfromtxt(
    fname,
    delimiter=",",
    dtype=np.float32,
    skip_header=1,
    missing_values=MISSING_TOKEN,
    filling_values=FILL_VALUE,
)

# pandas marks the placeholder as NaN on read; fillna then applies the sentinel.
df = pd.read_csv(
    fname,
    delimiter=",",
    header=None,
    dtype=np.float32,
    skiprows=1,
    na_values=[MISSING_TOKEN],
).fillna(FILL_VALUE)