## Capstone Project
### Subject: US Economic Indicator
### Author: Ja`Mone Bridges
### Notebook Purpose: EDA, Cleaning, and Storing of the National Debt Dataset

In [None]:
import os
import pandas as pd
import numpy as np
import math

# Plotting
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt
# import seaborn as sns

# SQL database
from sqlalchemy import create_engine
import re

In [20]:
# Function to test if a string is a number
def is_float(string):
    """Return True when `string` can be converted to a float that is not NaN.

    Parameters
    ----------
    string : object
        Value to test; normally a str, but any object is accepted.

    Returns
    -------
    bool
        True when ``float(string)`` succeeds and the result is not NaN.
        False for NaN, for text that does not parse, and for objects that
        float() cannot convert at all (for example None or a list).
        Infinite values are accepted.
    """
    try:
        value = float(string)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported object such as None.
        # OverflowError: an int too large to be represented as a float.
        return False
    return not math.isnan(value)

# Function to test if a string is an integer
def is_int(string):
    """Return True when ``int(string)`` succeeds.

    Parameters
    ----------
    string : object
        Value to test; normally a str, but any object is accepted.

    Returns
    -------
    bool
        True when int() can convert the value, False otherwise. Text such
        as "4.2" is rejected, while a float such as 4.2 is accepted because
        int() truncates it. Objects int() cannot handle (None, lists) and
        non-finite floats return False instead of raising.
    """
    try:
        int(string)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported object such as None.
        # OverflowError: int(float("inf")).
        return False
    return True

# Function to get all the files in a directory and put them into a list
def file_list(dir_path, extension=".csv"):
    """List the regular files in `dir_path` whose names end with `extension`.

    Parameters
    ----------
    dir_path : str
        Directory to scan (not recursive).
    extension : str, default ".csv"
        Required file-name suffix, matched case-sensitively. Any length is
        supported (".csv", ".json", ".parquet", ...). An empty string
        matches every file.

    Returns
    -------
    list of str
        Paths built as ``dir_path + "/" + name``, in the order produced by
        os.listdir (which Python documents as arbitrary). Sub-directories
        are skipped even when their name ends with the extension.
    """
    files = []
    # Iterate directory
    for file_name in os.listdir(dir_path):
        # Keep regular files only; endswith() works for suffixes of any length,
        # unlike comparing against the last four characters.
        if os.path.isfile(os.path.join(dir_path, file_name)) and \
                file_name.endswith(extension):
            files.append(dir_path + "/" + file_name)
    return files


In [21]:
# Folder that holds the national debt CSV files
national_debt_path = r'data/National_Debt'

# Paths of every CSV file found in that folder
files = file_list(national_debt_path)

# One dtype name per column: 1 object, 3 floats, 7 int32 (11 entries).
# NOTE(review): not referenced by the visible cells — presumably meant for read_csv; confirm.
data_type = ["object"] + ["float"] * 3 + ["int32"] * 7

# NOTE(review): not referenced by the visible cells.
entry = 0

In [22]:
# Load one national debt CSV into a DataFrame, taking the column names from its first row.
# NOTE(review): files[1] is positional and file_list returns names in os.listdir order,
# which is not guaranteed — confirm this selects the intended file on every machine.
nd_df = pd.read_csv(files[1], header = 0)
# Sanity check
display(nd_df.head())

Unnamed: 0,Record Date,Debt Held by the Public,Intragovernmental Holdings,Total Public Debt Outstanding,Source Line Number,Fiscal Year,Fiscal Quarter Number,Calendar Year,Calendar Quarter Number,Calendar Month Number,Calendar Day Number
0,2024-01-04,26976930000000.0,7029342000000.0,34006270000000.0,1,2024,2,2024,1,1,4
1,2024-01-03,26967950000000.0,7026266000000.0,33994220000000.0,1,2024,2,2024,1,1,3
2,2024-01-02,26966330000000.0,7023794000000.0,33990130000000.0,1,2024,2,2024,1,1,2
3,2023-12-29,26938520000000.0,7062976000000.0,34001490000000.0,1,2024,1,2023,4,12,29
4,2023-12-28,26891920000000.0,7019307000000.0,33911230000000.0,1,2024,1,2023,4,12,28


In [23]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
nd_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7717 entries, 0 to 7716
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Record Date                    7717 non-null   object 
 1   Debt Held by the Public        4759 non-null   float64
 2   Intragovernmental Holdings     4759 non-null   float64
 3   Total Public Debt Outstanding  7717 non-null   float64
 4   Source Line Number             7717 non-null   int64  
 5   Fiscal Year                    7717 non-null   int64  
 6   Fiscal Quarter Number          7717 non-null   int64  
 7   Calendar Year                  7717 non-null   int64  
 8   Calendar Quarter Number        7717 non-null   int64  
 9   Calendar Month Number          7717 non-null   int64  
 10  Calendar Day Number            7717 non-null   int64  
dtypes: float64(3), int64(7), object(1)
memory usage: 663.3+ KB


None

In [24]:
# Parse 'Record Date' into datetimes and use it as the index
nd_df = (
    nd_df
    .assign(**{'Record Date': lambda frame: pd.to_datetime(frame['Record Date'])})
    .set_index('Record Date')
)

# Verify that 'Record Date' is now the datetime index
nd_df.head()

Unnamed: 0_level_0,Debt Held by the Public,Intragovernmental Holdings,Total Public Debt Outstanding,Source Line Number,Fiscal Year,Fiscal Quarter Number,Calendar Year,Calendar Quarter Number,Calendar Month Number,Calendar Day Number
Record Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-01-04,26976930000000.0,7029342000000.0,34006270000000.0,1,2024,2,2024,1,1,4
2024-01-03,26967950000000.0,7026266000000.0,33994220000000.0,1,2024,2,2024,1,1,3
2024-01-02,26966330000000.0,7023794000000.0,33990130000000.0,1,2024,2,2024,1,1,2
2023-12-29,26938520000000.0,7062976000000.0,34001490000000.0,1,2024,1,2023,4,12,29
2023-12-28,26891920000000.0,7019307000000.0,33911230000000.0,1,2024,1,2023,4,12,28


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
display(nd_df.describe())

Unnamed: 0,Debt Held by the Public,Intragovernmental Holdings,Total Public Debt Outstanding,Source Line Number,Fiscal Year,Fiscal Quarter Number,Calendar Year,Calendar Quarter Number,Calendar Month Number,Calendar Day Number
count,4759.0,4759.0,7717.0,7717.0,7717.0,7717.0,7717.0,7717.0,7717.0,7717.0
mean,12979720000000.0,5075082000000.0,13296370000000.0,1.0,2008.385383,2.52028,2008.1371,2.513412,6.54516,15.760658
std,6317983000000.0,983428700000.0,8327878000000.0,0.0,8.888281,1.121799,8.881054,1.106783,3.409717,8.781364
min,3333039000000.0,1623478000000.0,4225874000000.0,1.0,1993.0,1.0,1993.0,1.0,1.0,1.0
25%,7506673000000.0,4370367000000.0,5713302000000.0,1.0,2001.0,2.0,2000.0,2.0,4.0,8.0
50%,12595460000000.0,4997860000000.0,9621537000000.0,1.0,2008.0,3.0,2008.0,3.0,7.0,16.0
75%,16188720000000.0,5841658000000.0,19214130000000.0,1.0,2016.0,4.0,2016.0,3.0,9.0,23.0
max,26976930000000.0,7134005000000.0,34006270000000.0,1.0,2024.0,4.0,2024.0,4.0,12.0,31.0


In [26]:
# Line chart of the three debt series against the record date, with
# titles/labels applied and a range slider enabled on the x axis
plot = (
    px.line(
        nd_df,
        x=nd_df.index,
        y=['Debt Held by the Public', 'Intragovernmental Holdings', 'Total Public Debt Outstanding'],
    )
    .update_layout(
        title='US National Debt',
        yaxis_title='US Dollars in Trillions',
        legend_title='Debt Type',
    )
    .update_xaxes(rangeslider_visible=True)
)

plot.show()

The green line is the total US national debt. An increase in the rate of debt growth can be noticed around each of the last three recessions: first in 2001 after 9/11, then in 2008 during the housing mortgage crisis, and most recently in 2020 during the COVID-19 pandemic. Also, between 2010 and 2020, a slight bump is present each time the US government's annual budget is approved.

In [27]:
# Earliest and latest record dates present in the index
first_day, last_day = nd_df.index.min(), nd_df.index.max()

first_day, last_day

(Timestamp('1993-04-01 00:00:00'), Timestamp('2024-01-04 00:00:00'))

In [28]:
# Every calendar day from the first record to the last
full_range = pd.date_range(first_day, last_day, freq="D")

# Calendar days that have no row in the data
full_range.difference(nd_df.index)

DatetimeIndex(['1993-04-03', '1993-04-04', '1993-04-10', '1993-04-11',
               '1993-04-17', '1993-04-18', '1993-04-24', '1993-04-25',
               '1993-05-01', '1993-05-02',
               ...
               '2023-12-09', '2023-12-10', '2023-12-16', '2023-12-17',
               '2023-12-23', '2023-12-24', '2023-12-25', '2023-12-30',
               '2023-12-31', '2024-01-01'],
              dtype='datetime64[ns]', length=3519, freq=None)

In [29]:
# Give every calendar day a row; days absent from the data become all-NaN rows
nd_df = nd_df.reindex(index=full_range)
# An empty result confirms that no calendar day is missing any more
full_range.difference(nd_df.index)

DatetimeIndex([], dtype='datetime64[ns]', freq='D')

In [30]:
# Dollar-amount columns; only these are interpolated
debt_columns = ['Debt Held by the Public', 'Intragovernmental Holdings', 'Total Public Debt Outstanding']

# Linearly interpolate missing values, filling at most 95 consecutive NaNs per gap.
# The keyword must be lowercase `limit`: the previous `Limit=95` was not a recognised
# argument, so pandas passed it along as an unused extra keyword and no cap was applied.
# With the default forward direction, NaNs before the first valid value stay NaN.
temp_df = nd_df[debt_columns].interpolate(method='linear', limit=95)
nd_df[debt_columns] = temp_df

# Check both ends of the frame after filling
display(nd_df.head())
display(nd_df.tail())

Unnamed: 0,Debt Held by the Public,Intragovernmental Holdings,Total Public Debt Outstanding,Source Line Number,Fiscal Year,Fiscal Quarter Number,Calendar Year,Calendar Quarter Number,Calendar Month Number,Calendar Day Number
1993-04-01,,,4225874000000.0,1.0,1993.0,3.0,1993.0,2.0,4.0,1.0
1993-04-02,,,4233931000000.0,1.0,1993.0,3.0,1993.0,2.0,4.0,2.0
1993-04-03,,,4234253000000.0,,,,,,,
1993-04-04,,,4234575000000.0,,,,,,,
1993-04-05,,,4234896000000.0,1.0,1993.0,3.0,1993.0,2.0,4.0,5.0


Unnamed: 0,Debt Held by the Public,Intragovernmental Holdings,Total Public Debt Outstanding,Source Line Number,Fiscal Year,Fiscal Quarter Number,Calendar Year,Calendar Quarter Number,Calendar Month Number,Calendar Day Number
2023-12-31,26952430000000.0,7043385000000.0,33995810000000.0,,,,,,,
2024-01-01,26959380000000.0,7033589000000.0,33992970000000.0,,,,,,,
2024-01-02,26966330000000.0,7023794000000.0,33990130000000.0,1.0,2024.0,2.0,2024.0,1.0,1.0,2.0
2024-01-03,26967950000000.0,7026266000000.0,33994220000000.0,1.0,2024.0,2.0,2024.0,1.0,1.0,3.0
2024-01-04,26976930000000.0,7029342000000.0,34006270000000.0,1.0,2024.0,2.0,2024.0,1.0,1.0,4.0


In [31]:
# Day-of-month taken from each entry of the datetime index
nd_df.index.day

Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       ...
       26, 27, 28, 29, 30, 31,  1,  2,  3,  4],
      dtype='int32', length=11236)

In [32]:
# 'Source Line Number' is dropped from the frame
nd_df = nd_df.drop(columns='Source Line Number')
# Copy the datetime index into a regular column
nd_df['Record Date'] = nd_df.index
# info() prints its report and returns None, so it is not wrapped in display()
# (doing so only adds a stray "None" to the cell output)
nd_df.info()
display(nd_df.head())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11236 entries, 1993-04-01 to 2024-01-04
Freq: D
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Debt Held by the Public        9593 non-null   float64       
 1   Intragovernmental Holdings     9593 non-null   float64       
 2   Total Public Debt Outstanding  11236 non-null  float64       
 3   Fiscal Year                    7717 non-null   float64       
 4   Fiscal Quarter Number          7717 non-null   float64       
 5   Calendar Year                  7717 non-null   float64       
 6   Calendar Quarter Number        7717 non-null   float64       
 7   Calendar Month Number          7717 non-null   float64       
 8   Calendar Day Number            7717 non-null   float64       
 9   Record Date                    11236 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(9)
memory usage: 1.2 MB


None

Unnamed: 0,Debt Held by the Public,Intragovernmental Holdings,Total Public Debt Outstanding,Fiscal Year,Fiscal Quarter Number,Calendar Year,Calendar Quarter Number,Calendar Month Number,Calendar Day Number,Record Date
1993-04-01,,,4225874000000.0,1993.0,3.0,1993.0,2.0,4.0,1.0,1993-04-01
1993-04-02,,,4233931000000.0,1993.0,3.0,1993.0,2.0,4.0,2.0,1993-04-02
1993-04-03,,,4234253000000.0,,,,,,,1993-04-03
1993-04-04,,,4234575000000.0,,,,,,,1993-04-04
1993-04-05,,,4234896000000.0,1993.0,3.0,1993.0,2.0,4.0,5.0,1993-04-05


In [33]:

# Drop the quarter/fiscal/date columns, then rebuild the calendar columns from the
# datetime index so that the rows added by the daily reindex are populated too.
# 'Fiscal Year' is no longer overwritten with the calendar year before being dropped:
# that value was not a fiscal year and was discarded immediately anyway.
nd_df = (
    nd_df
    .drop(columns=['Calendar Quarter Number', 'Fiscal Quarter Number', 'Record Date', 'Fiscal Year'])
    .assign(**{
        'Calendar Year': nd_df.index.year,
        'Calendar Month Number': nd_df.index.month,
        'Calendar Day Number': nd_df.index.day,
    })
    # Explicit integer dtype for the rebuilt calendar columns
    .astype({'Calendar Year' : 'int32', 'Calendar Month Number' : 'int32', 'Calendar Day Number' : 'int32'})
)

display(nd_df.head())

Unnamed: 0,Debt Held by the Public,Intragovernmental Holdings,Total Public Debt Outstanding,Calendar Year,Calendar Month Number,Calendar Day Number
1993-04-01,,,4225874000000.0,1993,4,1
1993-04-02,,,4233931000000.0,1993,4,2
1993-04-03,,,4234253000000.0,1993,4,3
1993-04-04,,,4234575000000.0,1993,4,4
1993-04-05,,,4234896000000.0,1993,4,5


In [34]:
# Report the remaining columns, their non-null counts and dtypes
nd_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11236 entries, 1993-04-01 to 2024-01-04
Freq: D
Data columns (total 6 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Debt Held by the Public        9593 non-null   float64
 1   Intragovernmental Holdings     9593 non-null   float64
 2   Total Public Debt Outstanding  11236 non-null  float64
 3   Calendar Year                  11236 non-null  int32  
 4   Calendar Month Number          11236 non-null  int32  
 5   Calendar Day Number            11236 non-null  int32  
dtypes: float64(3), int32(3)
memory usage: 740.8 KB


In [35]:

# Number of missing values in each column
display(nd_df.isna().sum(axis=0))

Debt Held by the Public          1643
Intragovernmental Holdings       1643
Total Public Debt Outstanding       0
Calendar Year                       0
Calendar Month Number               0
Calendar Day Number                 0
dtype: int64

In [36]:
# Engine required to read or store data in a MySQL database.
# Connection settings are read from environment variables so real credentials do
# not have to live in the notebook; the fallbacks are the original local values.
# NOTE(review): set CAPSTONE_DB_PASSWORD outside the notebook and remove the
# fallback password before sharing or committing this file.
db_engine = create_engine(
    "mysql://{user}:{pw}@localhost/{db}".format(
        user=os.environ.get("CAPSTONE_DB_USER", "root"),
        pw=os.environ.get("CAPSTONE_DB_PASSWORD", "rootroot"),
        db=os.environ.get("CAPSTONE_DB_NAME", "capstone"),
    )
)

# Store the dataframe in the 'national_debt' table, replacing the table if it exists.
# index=False means the datetime index is not written; the date is carried by the
# Calendar Year / Month Number / Day Number columns.
nd_df.to_sql(con=db_engine, name='national_debt', if_exists='replace', index=False)

11236