In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import date

In [2]:
pd.__version__

'1.4.1'

In [3]:
df = pd.DataFrame()

In [4]:
df = pd.read_csv("house_price.csv", nrows=100)

## Reading data

useful `read_csv` parameters

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

In [None]:
pd.read_csv("data.csv")

In [None]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
pd.read_csv("data.csv", usecols=["date", "price"])

In [None]:
pd.read_csv("data.csv", parse_dates=["date"])

In [None]:
pd.read_csv("data.csv", dtype={"house_type": "category"})

In [None]:
pd.read_csv("data.csv", index_col="date")

In [None]:
# Three independent ways to use skiprows (line numbers are 0-indexed; line 0 is the header row)
pd.read_csv("data.csv", skiprows=[1, 5])  # skips file lines 1 and 5, i.e. the 1st and 5th data rows
pd.read_csv("data.csv", skiprows=100)  # skips the first 100 lines of the file, including the original header line
pd.read_csv("data.csv", skiprows=lambda x: x > 0 and np.random.rand() > 0.1) # keeps line 0 (header) and randomly skips ~90% of the other lines


In [None]:
pd.read_csv("data.csv", nrows=100)

In [None]:
pd.read_csv("data.csv", na_values=["?"])

In [None]:
pd.read_csv("data.csv", true_values=["yes"], false_values=["no"])

Read from multiple files

In [None]:
import glob
import os

# glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the row order of the combined frame reproducible between runs
files = sorted(glob.glob("file_*.csv"))

# pd.concat raises ValueError when given an empty list, so fall back to an
# empty frame when no file matches the pattern
frames = [pd.read_csv(f) for f in files]
result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


copy from excel, read into pandas data frame

In [None]:
df = pd.read_clipboard() 

read tables from pdf

https://github.com/chezou/tabula-py

In [None]:
# %pip install tabula-py

from tabula import read_pdf
# Read every page of the PDF into a list of DataFrames.
# NOTE(review): despite its name, `df` presumably holds a list here, not a
# single DataFrame - confirm against the tabula-py version in use.
df = read_pdf('test.pdf', pages='all')

## EDA cheat

In [None]:
# %pip install pandas-profiling

import pandas_profiling

df = pd.read_csv("data.csv")
profile = df.profile_report(title="Pandas Profiling Report")
profile.to_file(output_file="output.html")

## Data Types

filtering

In [None]:
# selecting: keep only the columns whose dtype matches
df.select_dtypes(include="number")
df.select_dtypes(include=["category", "datetime"])

# excluding: keep every column except those of the given dtype
df.select_dtypes(exclude="object")

conversions

In [None]:
df.infer_objects().dtypes

In [None]:
pd.to_numeric(df.numeric_col, downcast="integer") # smallest signed int dtype
pd.to_numeric(df.numeric_col, downcast="float")  # smallest float dtype

In [None]:
# apply to whole data frame
df = df.apply(pd.to_numeric, errors="coerce")

# apply to specific columns
pd.to_numeric(df.numeric_column, errors="coerce")

# filling NA values with zero
pd.to_numeric(df.numeric_column, errors="coerce").fillna(0)


converting all columns at once

In [None]:
df = df.astype(
    {
        "date": "datetime64[ns]",
        "price": "int",
        "is_weekend": "bool",
        "status": "category",
    }
)

## Column operations

rename columns

In [None]:
df = df.rename({"PRICE": "price", "Date (mm/dd/yyyy)": "date"}, axis=1)

In [None]:
df.add_prefix("pre_")
df.add_suffix("_suf")

create new columns

In [None]:
# create a new column of Fahrenheit values from Celsius
# (assign returns a new DataFrame; df itself is left unchanged)
df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)

In [None]:
# one random integer in [0, 10) per row
random_col = np.random.randint(10, size=len(df))
df.insert(3, 'random_col', random_col) # loc is 0-based: inserts as the fourth column, modifying df in place

if-then-else using NumPy’s where()

In [None]:
df["logic"] = np.where(df["price"] > 5, "high", "low")

dropping columns

In [None]:
# Alternative ways to drop columns (each line is an independent example)
df.drop('col1', axis=1, inplace=True)  # single column, in place
df = df.drop(['col1','col2'], axis=1)  # several columns, returns a new frame
s = df.pop('col')  # removes the column and returns it as a Series
del df['col']  # plain Python deletion, in place
# drop by position: axis=1 is required, otherwise pandas looks for a ROW
# labelled like the first column and raises KeyError
df.drop(df.columns[0], axis=1, inplace=True)

## String operations

In [None]:
# normalise the column labels in one pass: lower-case them, then swap
# spaces for underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [None]:
# substring match (the pattern is interpreted as a regex by default)
df['name'].str.contains("John")

# phone numbers shaped like 123-456-7890: \d matches digits only, whereas a
# '.' wildcard would also accept values such as 'abc-def-ghij'
df['phone_num'].str.contains(r'\d{3}-\d{3}-\d{4}', regex=True)  # regex

df['email'].str.contains('gmail')

In [None]:
import re  # required for re.IGNORECASE below; it was not imported anywhere in the notebook

# capture groups: (user)@(domain).(top-level domain)
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

# one list of (user, domain, tld) tuples per row; the pattern is written in
# upper case, so IGNORECASE is needed to match lower-case addresses
df['email'].str.findall(pattern, flags=re.IGNORECASE)

remove non-numeric values throughout df

In [None]:
# Strip every character that is not a digit, a decimal point or a minus sign.
# A pattern like "[^0-9]+.-" only matches a run of non-digits followed by one
# arbitrary character and a literal "-", which leaves most junk in place.
# The regex only touches string values and the results are still strings, so
# follow up with pd.to_numeric when numeric dtypes are needed.
for col in df.columns.values:
    df[col] = df[col].replace(r"[^0-9.\-]+", "", regex=True)

## Missing values

In [5]:
def missing_vals(df):
    """Report the share of missing values per column, worst column first.

    Prints a header line with the number of affected columns, followed by one
    ``name => pct%`` line for every column that has at least one missing
    value, ordered by descending percentage.

    Returns the string "no missing values" (without printing anything) when
    no column has missing values; otherwise returns None.
    """
    # percentage of NaN entries per column, keyed by column label
    pct_missing = df.isna().mean() * 100

    affected = [(name, pct) for name, pct in pct_missing.items() if pct > 0]
    if not affected:
        return "no missing values"

    # highest percentage first
    affected = sorted(affected, key=lambda pair: pair[1], reverse=True)

    print(f"There are a total of {len(affected)} variables with missing values\n")

    for name, pct in affected:
        print(f"{name:<20} => {round(pct, 3)}%")


missing_vals(df)

There are a total of 16 variables with missing values

PoolQC               => 100.0%
Alley                => 94.0%
MiscFeature          => 91.0%
Fence                => 77.0%
FireplaceQu          => 54.0%
LotFrontage          => 14.0%
GarageType           => 6.0%
GarageYrBlt          => 6.0%
GarageFinish         => 6.0%
GarageQual           => 6.0%
GarageCond           => 6.0%
BsmtQual             => 3.0%
BsmtCond             => 3.0%
BsmtExposure         => 3.0%
BsmtFinType1         => 3.0%
BsmtFinType2         => 3.0%


In [6]:
def missing_cols(df):
    """Print, for each column with missing values, their count and percentage.

    Columns are reported in frame order as ``name => count [pct%]``. When no
    column has any missing value, prints "no missing values" instead.
    """
    any_missing = False
    for col in df.columns:
        null_mask = df[col].isna()
        n_missing = null_mask.sum()
        if n_missing == 0:
            continue

        any_missing = True
        pct = null_mask.mean() * 100
        print('{} => {} [{}%]'.format(col, n_missing, round(pct, 2)))

    if not any_missing:
        print("no missing values")

In [7]:
missing_cols(df)

LotFrontage => 14 [14.0%]
Alley => 94 [94.0%]
BsmtQual => 3 [3.0%]
BsmtCond => 3 [3.0%]
BsmtExposure => 3 [3.0%]
BsmtFinType1 => 3 [3.0%]
BsmtFinType2 => 3 [3.0%]
FireplaceQu => 54 [54.0%]
GarageType => 6 [6.0%]
GarageYrBlt => 6 [6.0%]
GarageFinish => 6 [6.0%]
GarageQual => 6 [6.0%]
GarageCond => 6 [6.0%]
PoolQC => 100 [100.0%]
Fence => 77 [77.0%]
MiscFeature => 91 [91.0%]


dropping

imputing

In [None]:
# drop
df.dropna(axis=0)  # rows containing at least one missing value
df.dropna(axis=1)  # columns containing at least one missing value

# impute
df.fillna(0)
df.ffill()  # carry the last valid value forward; replaces fillna(method="ffill")
df.bfill()  # pull the next valid value backward; replaces fillna(method="bfill")

# replace sentinel values with NaN
df.replace( -999, np.nan)
df.replace("?", np.nan)

# interpolate
df["col"].interpolate() # a single Series, e.g. a time series
df.interpolate() # fill all consecutive values forward
df.interpolate(limit=1) # fill one consecutive value forward
df.interpolate(limit=1, limit_direction="backward")
df.interpolate(limit_direction="both")

## Date operations

In [None]:
df = pd.read_csv("data/tesla.csv", usecols=["Date", "Open", "Close", "Volume"], parse_dates=['Date'])
# https://www.kaggle.com/datasets/rpaguirre/tesla-stock-price

Get X hours/days/weeks from today / ago

In [None]:
# from today
# A date has no time of day: adding a timedelta to a date only uses its
# whole-day part, so an hour offset must start from a datetime instead.
datetime.datetime.now() + datetime.timedelta(hours=30)
date.today() + datetime.timedelta(days=30)
date.today() + datetime.timedelta(weeks=30)

# ago
date.today() - datetime.timedelta(days=365)

datetime.date(2021, 4, 18)

filter between two dates

In [None]:
df[(df["Date"] > "2015-01-01") & (df["Date"] < "2017-01-01")]

Unnamed: 0,Date,Open,Close,Volume
1136,2015-01-02,222.869995,219.309998,4764400
1137,2015-01-05,214.550003,210.089996,5368500
1138,2015-01-06,210.059998,211.279999,6261900
1139,2015-01-07,213.350006,210.949997,2968400
1140,2015-01-08,212.809998,210.619995,3442500
...,...,...,...,...
1635,2016-12-23,208.000000,213.339996,4670500
1636,2016-12-27,214.880005,219.529999,5915700
1637,2016-12-28,221.529999,219.740005,3782500
1638,2016-12-29,218.559998,214.679993,4045000


In [None]:
# filter by single day
df[df["Date"].dt.strftime("%Y-%m-%d") == "2017-03-01"]

# filter by single month
df[df["Date"].dt.strftime("%m") == "12"]

# filter by single year
df[df["Date"].dt.strftime("%Y") == "2017"]

Unnamed: 0,Date,Open,Close,Volume
1679,2017-03-01,254.179993,250.020004,4800300


Check out https://strftime.org/ for date string format in Python

## Styling data frames

https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html

In [None]:
df = df.tail(10)

In [None]:
format_dict = {
    "Date": "{:%d/%m/%y}",
    "Open": "${:.2f}",
    "Close": "${:.2f}",
    "Volume": "{:,}",
}

df.style.format(format_dict)


Unnamed: 0,Date,Open,Close,Volume
1681,03/03/17,$250.74,$251.57,2919400
1686,10/03/17,$246.21,$243.69,3057000
1667,10/02/17,$269.79,$269.23,3619700
1642,05/01/17,$226.42,$226.75,5911700
1645,10/01/17,$232.00,$229.87,3660000
1655,25/01/17,$257.31,$254.47,5142600
1643,06/01/17,$226.93,$229.01,5527900
1666,09/02/17,$266.25,$269.20,7820200
1662,03/02/17,$251.91,$251.33,2186700
1679,01/03/17,$254.18,$250.02,4800300


In [None]:
(
    df.style.format(format_dict)
    # hide(axis="index") replaces hide_index(), which is deprecated since
    # pandas 1.4 and removed in pandas 2.0
    .hide(axis="index")
    # mark the lowest / highest opening price
    .highlight_min(subset=["Open"], color="red")
    .highlight_max(subset=["Open"], color="green")
    # shade closing prices from light (low) to dark (high)
    .background_gradient(subset="Close", cmap="Greens")
    # in-cell bar chart of the traded volume
    .bar(subset='Volume', color='lightblue', align='zero')
    .set_caption('Tesla Stock Prices in 2017')
)

Date,Open,Close,Volume
03/03/17,$250.74,$251.57,2919400
10/03/17,$246.21,$243.69,3057000
10/02/17,$269.79,$269.23,3619700
05/01/17,$226.42,$226.75,5911700
10/01/17,$232.00,$229.87,3660000
25/01/17,$257.31,$254.47,5142600
06/01/17,$226.93,$229.01,5527900
09/02/17,$266.25,$269.20,7820200
03/02/17,$251.91,$251.33,2186700
01/03/17,$254.18,$250.02,4800300


## Misc

In [None]:
df['col'].idxmin()
df['col'].idxmax()

apply function to entire data frame

In [None]:
# applymap calls the function once per cell, so np.log can be passed as-is
df.applymap(np.log)

shuffle data frame

In [None]:
df.sample(frac=1, random_state=7).reset_index(drop=True)

Percent changes

Example: the price of BTC over 3 days
[30000, 33000, 31000] -> [NaN, 0.1, -0.0606]

In [None]:
df['col_name'].pct_change()

Assign ranks to values

In [None]:
df['rank'] = df['column_to_rank'].rank()

Explode list to rows

In [None]:
df.explode("col_name").reset_index(drop=True)

Check Memory

In [8]:
# Sum the per-column byte counts reported by memory_usage() and divide by 1024**2.
# NOTE(review): without deep=True, object (string) columns are presumably
# under-reported - consider df.memory_usage(deep=True) and confirm.
df.memory_usage().sum() / (1024**2) #converting to MB

0.061920166015625

In [None]:
df = pd.read_csv("data/house_price.csv", nrows=100)

In [None]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [None]:
subclass = df.MSSubClass
subclass.value_counts()

20     45
60     18
50      9
120     5
30      4
90      4
190     3
70      3
160     2
45      2
80      2
180     1
75      1
85      1
Name: MSSubClass, dtype: int64

In [None]:
top_five = subclass.value_counts().nlargest(5).index
mssubclass_new = subclass.where(subclass.isin(top_five), other="Other")
mssubclass_new.value_counts()

20       45
Other    19
60       18
50        9
120       5
30        4
Name: MSSubClass, dtype: int64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a605a3e6-1564-47b2-94e7-842290ba7692' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>