# Import

In [1]:
import pandas as pd

# Read Data

## Treasury Yield Data
1. Download treasury yield data from https://www.treasury.gov/resource-center/data-chart-center/interest-rates/pages/textview.aspx?data=yield in `XML` format
2. Import the `XML` file into Excel and save as `raw_daily_treasury_yield.xlsx`
3. Remove irrelevant columns and save as **`treasury_yield.csv`**

In [2]:
# Load the cleaned daily yields and put them in chronological order.
treasury = (
    pd.read_csv("treasury_yield.csv", parse_dates=["date"])
    .sort_values("date")
    .reset_index(drop=True)
)
treasury.head()

Unnamed: 0,date,3month,6month,1year,2year,3year,5year,7year,10year,20year,30year
0,1990-01-02,7.83,7.89,7.81,7.87,7.9,7.87,7.98,7.94,,8.0
1,1990-01-03,7.89,7.94,7.85,7.94,7.96,7.92,8.04,7.99,,8.04
2,1990-01-04,7.84,7.9,7.82,7.92,7.93,7.91,8.02,7.98,,8.04
3,1990-01-05,7.79,7.85,7.79,7.9,7.94,7.92,8.03,7.99,,8.06
4,1990-01-08,7.79,7.88,7.81,7.9,7.95,7.92,8.05,8.02,,8.09


In [3]:
# count the missing observations in each column
treasury.isna().sum()

date        0
3month      4
6month      1
1year       1
2year       1
3year       1
5year       1
7year       1
10year      1
20year    940
30year    995
dtype: int64

## Macro Data
1. Go to the site: https://apps.bea.gov/iTable/iTable.cfm?reqid=19&step=2#reqid=19&step=2&isuri=1&1921=survey
2. From Section 1 to Section 5, download the necessary tables
3. Since the data are quarterly, combine all the tables on `year` and `quarter` into **`macro_data.csv`**

In [4]:
# Load the combined quarterly macro tables, ordered by year then quarter.
macro = (
    pd.read_csv("macro_data.csv")
    .sort_values(["year", "quarter"])
    .reset_index(drop=True)
)
macro.head()

Unnamed: 0,year,quarter,gdp,government_consumption_expenditures_and_gross_investment,personal_income,government_current_receipts,government_current_expenditures,exports_of_goods_and_services,imports_of_goods_and_services
0,1990,1,5872.7,1212.5,4818.9,1677.6,1901.2,538.2,626.8
1,1990,2,5960.0,1230.7,4899.2,1698.4,1938.8,545.9,614.8
2,1990,3,6015.1,1242.6,4958.5,1727.6,1965.6,555.1,630.1
3,1990,4,6004.7,1268.5,4978.6,1749.7,2005.9,568.2,647.3
4,1991,1,6035.2,1284.2,4992.5,1735.0,1975.8,573.2,620.3


# Data Transformation
Based on the null counts above (`20year` and `30year` each have hundreds of missing values, while `3month` has only 4), we decided to use the `3month` rate, which we consider more representative.

## Quarterly Mean of Treasury Yield

In [5]:
# Keep only the date and the 3-month rate; work on a copy so the columns
# added below do not touch `treasury`.
daily_yield = treasury[["date", "3month"]].copy()

# extract year and month from the date with the vectorized datetime accessor
# (requires `date` to be a datetime column, as produced by parse_dates)
daily_yield["year"] = daily_yield["date"].dt.year
daily_yield["month"] = daily_yield["date"].dt.month

In [6]:
# calculate the monthly mean of the 3-month rate.
# The column is selected explicitly so that `date` is never part of the
# aggregation, whatever the installed pandas version does with
# non-numeric columns in groupby().mean().
monthly_yield = (
    daily_yield
    .groupby(["year", "month"])["3month"]
    .mean()
    .reset_index()
)
monthly_yield.head()

Unnamed: 0,year,month,3month
0,1990,1,7.898095
1,1990,2,8.002105
2,1990,3,8.17
3,1990,4,8.0405
4,1990,5,8.006818


In [7]:
def month_quarter(x):
    """
    map month to quarter

    Parameters
    ----------
    x : int
        Calendar month number, 1 through 12.

    Returns
    -------
    int
        1 for months 1-3, 2 for months 4-6, 3 for months 7-9,
        4 for months 10-12.

    Raises
    ------
    ValueError
        If ``x`` is not one of the twelve month numbers.
    """
    months_by_quarter = (
        (1, (1, 2, 3)),
        (2, (4, 5, 6)),
        (3, (7, 8, 9)),
        (4, (10, 11, 12)),
    )
    for quarter, months in months_by_quarter:
        if x in months:
            return quarter
    # ValueError is a subclass of Exception, so callers that catch the
    # broader type keep working.
    raise ValueError("Month value is out of range!")

In [8]:
# label every month with its calendar quarter
monthly_yield["quarter"] = monthly_yield["month"].map(month_quarter)

# calculate the quarterly mean
# (this averages the monthly means, so each month carries equal weight)
quarterly_yield = (
    monthly_yield[["year", "quarter", "3month"]]
    .groupby(["year", "quarter"])
    .mean()
    .reset_index()
)
quarterly_yield.head()

Unnamed: 0,year,quarter,3month
0,1990,1,8.0234
1,1990,2,8.011328
2,1990,3,7.722335
3,1990,4,7.213333
4,1991,1,6.206764


## Merge Treasury Yield with Macro Data

In [9]:
# left join: keep every macro quarter and attach the yield where one exists
data = (
    pd.merge(macro, quarterly_yield, how="left", on=["year", "quarter"])
    .sort_values(["year", "quarter"])
    .reset_index(drop=True)
)
data.head()

Unnamed: 0,year,quarter,gdp,government_consumption_expenditures_and_gross_investment,personal_income,government_current_receipts,government_current_expenditures,exports_of_goods_and_services,imports_of_goods_and_services,3month
0,1990,1,5872.7,1212.5,4818.9,1677.6,1901.2,538.2,626.8,8.0234
1,1990,2,5960.0,1230.7,4899.2,1698.4,1938.8,545.9,614.8,8.011328
2,1990,3,6015.1,1242.6,4958.5,1727.6,1965.6,555.1,630.1,7.722335
3,1990,4,6004.7,1268.5,4978.6,1749.7,2005.9,568.2,647.3,7.213333
4,1991,1,6035.2,1284.2,4992.5,1735.0,1975.8,573.2,620.3,6.206764


In [10]:
# the macro columns of the latest quarter are all null;
# that row is still present after merging in the yield data
data.isna().sum()

year                                                        0
quarter                                                     0
gdp                                                         1
government_consumption_expenditures_and_gross_investment    1
personal_income                                             1
government_current_receipts                                 1
government_current_expenditures                             1
exports_of_goods_and_services                               1
imports_of_goods_and_services                               1
3month                                                      0
dtype: int64

## Shift Treasury Yield Up 1 Row

In [11]:
# by shifting 1 row, the data is structured to use past information to predict the future
data["next_quarter_3mont_yield"] = data["3month"].shift(-1)
final_data = data.dropna().copy()
final_data.isnull().sum()

year                                                        0
quarter                                                     0
gdp                                                         0
government_consumption_expenditures_and_gross_investment    0
personal_income                                             0
government_current_receipts                                 0
government_current_expenditures                             0
exports_of_goods_and_services                               0
imports_of_goods_and_services                               0
3month                                                      0
next_quarter_3mont_yield                                    0
dtype: int64

## Create Appropriate `date` Column

In [12]:
def find_date(x):
    """
    assign the first date of the next quarter

    Parameters
    ----------
    x : mapping or row
        Anything indexable by ``"year"`` and ``"quarter"``; the quarter
        must compare equal to 1, 2, 3 or 4.

    Returns
    -------
    str
        ``"YYYY-MM-DD"`` string for the first day of the quarter after
        ``x``'s quarter; quarter 4 rolls over to January 1 of the next year.

    Raises
    ------
    ValueError
        If the quarter is not 1, 2, 3 or 4.
    """
    # (quarter, years to add, month-day suffix of the next quarter's start)
    next_quarter_starts = (
        (1, 0, "-04-01"),
        (2, 0, "-07-01"),
        (3, 0, "-10-01"),
        (4, 1, "-01-01"),
    )
    quarter = x["quarter"]
    for candidate, year_offset, suffix in next_quarter_starts:
        if quarter == candidate:
            # int() strips the ".0" when the year arrives as a float
            return str(int(x["year"]) + year_offset) + suffix
    # ValueError is a subclass of Exception, so callers that catch the
    # broader type keep working.
    raise ValueError("Quarter value is out of range")

In [13]:
# build the date strings row by row, then parse them into datetimes
final_data["date"] = pd.to_datetime(final_data.apply(find_date, axis=1))

In [14]:
# reorganize the columns: `date` first, followed by every remaining column
# except the helper columns `year`, `quarter` and `3month`.
# Columns are chosen by name rather than by position, so the cell does not
# depend on column order and gives the same result when it is re-run.
final_data = final_data[
    ["date"]
    + [
        col
        for col in final_data.columns
        if col not in ("date", "year", "quarter", "3month")
    ]
]

In [15]:
# preview the first five rows of the finished table
final_data.head(n=5)

Unnamed: 0,date,gdp,government_consumption_expenditures_and_gross_investment,personal_income,government_current_receipts,government_current_expenditures,exports_of_goods_and_services,imports_of_goods_and_services,next_quarter_3mont_yield
0,1990-04-01,5872.7,1212.5,4818.9,1677.6,1901.2,538.2,626.8,8.011328
1,1990-07-01,5960.0,1230.7,4899.2,1698.4,1938.8,545.9,614.8,7.722335
2,1990-10-01,6015.1,1242.6,4958.5,1727.6,1965.6,555.1,630.1,7.213333
3,1991-01-01,6004.7,1268.5,4978.6,1749.7,2005.9,568.2,647.3,6.206764
4,1991-04-01,6035.2,1284.2,4992.5,1735.0,1975.8,573.2,620.3,5.736697


In [16]:
# write the finished table to disk without the row index
final_data.to_csv(path_or_buf="macro_showcase_finalized.csv", index=False)