In [1]:
import pandas as pd

## Import

### CSV

In [None]:
# basic load of a tab-separated file (pandas defaults for everything else)
data_load = pd.read_csv(
    "file.csv",
    sep='\t',
)

In [None]:
# load only a subset of the columns

# ... selected by column label
data_load = pd.read_csv(
    "file.csv",
    sep='\t',
    usecols=['col1', 'col2'],
)

# ... or selected by column position (this load replaces the one above)
data_load = pd.read_csv(
    "file.csv",
    sep='\t',
    usecols=[0, 1],
)

In [None]:
# limit which rows are read; header=None means no line is used as column names

# read only the first 1000 lines
data_load = pd.read_csv(
    "file.csv",
    sep='\t',
    nrows=1000,
    header=None,
)

# skip the first 1000 lines and read the rest (replaces the load above)
data_load = pd.read_csv(
    "file.csv",
    sep='\t',
    skiprows=1000,
    header=None,
)

In [None]:
# add column names
# The names are defined explicitly so this cell runs on a fresh kernel.
# To reuse the labels of a frame that is already loaded, use
# `col_names = list(other_frame)` instead (iterating a DataFrame yields its
# column labels).
col_names = ['col1', 'col2']  # placeholder labels: one entry per column in the file

# header=None: the file has no header line, so `names` supplies the labels
data_load = pd.read_csv("file.csv", sep='\t', header=None, names=col_names)

In [None]:
# check data type
# Inspect the dtypes pandas inferred for the frame loaded so far. This is
# printed explicitly because only the last expression of a cell is displayed.
print(data_load.dtypes)

# reload, forcing col1 to be read as text instead of the inferred type
data_load = pd.read_csv("file.csv", dtype={"col1": str})

# confirm the override took effect (last expression -> rendered as cell output)
data_load.dtypes

In [None]:
# declare extra missing-value markers per column: a 0 in col1 is read as NaN
data_load = pd.read_csv(
    "file.csv",
    na_values={'col1': 0},
)

# show the rows where col1 ended up missing
col1_missing = data_load.col1.isna()
print(data_load[col1_missing])

In [None]:
# error lines
# Skip malformed lines (e.g. too many fields) but emit a warning for each one.
# `on_bad_lines='warn'` replaces the old pair
# `error_bad_lines=False, warn_bad_lines=True`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
data_load = pd.read_csv("file.csv", on_bad_lines='warn')

### Excel

In [None]:
# basic Excel loads; each call below replaces the previous result

# no sheet given -> pandas' default sheet is read
data_load = pd.read_excel("file.xlsx")

# pick the sheet by its name
data_load = pd.read_excel(
    "file.xlsx",
    sheet_name='name',
)

# pick the sheet by its zero-based position (1 = second sheet)
data_load = pd.read_excel(
    "file.xlsx",
    sheet_name=1,
)

In [None]:
# columns
# An .xlsx workbook must be read with read_excel (read_csv cannot parse it).

# select columns by label
data_load = pd.read_excel("file.xlsx", usecols=['col1', 'col2'])

# select columns by zero-based position
data_load = pd.read_excel("file.xlsx", usecols=[0, 1])

# select columns by Excel letters/ranges: this must be a single string,
# because a list of strings is interpreted as column labels
data_load = pd.read_excel("file.xlsx", usecols="A:P,R")

In [None]:
# limit which rows of the sheet are read

# read only the first 1000 rows
data_load = pd.read_excel(
    "file.xlsx",
    nrows=1000,
)

# skip the first 1000 lines of the sheet (replaces the load above)
data_load = pd.read_excel(
    "file.xlsx",
    skiprows=1000,
)

In [None]:
# boolean values
# Map the spreadsheet's 'yes' / 'no' markers to True / False while loading.
data_load = pd.read_excel("file.xlsx", true_values=['yes'], false_values=['no'])

# Column sums of the frame that was just loaded; for boolean columns the sum
# is the number of True values.
print(data_load.sum())

In [None]:
# date parsing, variant 1: let the reader convert the listed columns on load
data_load = pd.read_excel(
    "file.xlsx",
    parse_dates=['col1', 'col2'],
)

In [None]:
# parsing dates (option 2)
# Convert a non-standard date column after loading, using an explicit format.
# The result is assigned back to the column; assigning it to `data_load`
# would replace the whole DataFrame with a single Series.
data_load['col_date'] = pd.to_datetime(data_load['col_date'], format='%m%d%Y %H:%M:%S')

### Databases

SQLAlchemy is a library that provides tools for working with many major relational databases; its engine object manages the connections to a database.

In [None]:
# step 1: connect to database
from sqlalchemy import create_engine

# create database engine to manage connections
engine = create_engine('sqlite:///filename.db')

# pd.read_sql takes the table name or query as `sql` and the connection as
# `con`; it has no `query` or `engine` keyword arguments.

# load entire table by table name
table = pd.read_sql(sql='weather', con=engine)

# load the same table with an explicit SQL query (replaces the result above)
table = pd.read_sql(sql='SELECT * FROM weather', con=engine)

In [None]:
# step 2: query database

### JSON

In [None]:
# standard
# read_json lives in the pandas namespace; orient='split' tells pandas which
# JSON layout to expect in the file
data_load = pd.read_json("file.json", orient='split')

### API

In [None]:
import requests
from pandas.io.json import json_normalize

In [None]:
# get data from URL
# General form of the request: requests.get(url, params=..., headers=...).
# `headers` must be passed by keyword: requests.get accepts only the URL and
# `params` positionally.
# Only the endpoint is stored here; the request itself is sent further down,
# once `params` and `headers` have been defined.
api_url = url_string  # `url_string` is a placeholder for the API endpoint URL

In [None]:
# query-string parameters and HTTP headers for the API request, as dicts
params = {
    'term': 'bookstore',
    'location': 'Rotterdam',
}
headers = {
    'authorization': f'EMC {api_key}',
}

In [None]:
# Query the API with the search parameters and auth headers defined above
response = requests.get(api_url, params=params, headers=headers)

# Fail early with a clear HTTP error instead of a confusing KeyError later
# when the request was rejected (bad key, bad parameters, ...)
response.raise_for_status()

# Extract JSON data from the response - returns dictionary
data = response.json()

In [None]:
# Build a dataframe from the 'businesses' entry of the parsed response
bookstores = pd.DataFrame(data=data['businesses'])

In [None]:
# Flatten data and load into dataframe
# pd.json_normalize is the supported entry point; the pandas.io.json import
# path is deprecated. Nested dict keys become columns joined with '_'.
bookstores = pd.json_normalize(data['businesses'], sep='_')

print(list(bookstores))
print(bookstores.categories)  # still nested (=deeply nested)

# Flatten the nested 'categories' records as well: one row per category
# record, with the listed business-level fields carried along as
# 'biz_'-prefixed columns. After this step there is no 'categories' column
# any more, which is why it is inspected above.
bookstores = pd.json_normalize(
    data['businesses'],
    sep='_',
    record_path='categories',
    meta=[
        'name',
        'alias',
        'rating',
        ['coordinates', 'latitude'],
        ['coordinates', 'longitude'],
    ],
    meta_prefix='biz_',
)

print(list(bookstores))

# Pagination: this is a request parameter, not a json_normalize argument
# (inside the call it was a syntax error). It only takes effect on the next
# requests.get(...) call that is sent with `params`.
params['offset'] = 20