## Combined_Log_Excl_Roil_Clean_NoNeg.csv
### Dataset derived from `Combined_Log_Excl_Roil_Clean.csv`
#### All blanks removed; columns with 50% or more negative values dropped, then rows with any remaining negative value dropped

In [3]:
import pandas as pd

# File locations and the column-drop cutoff, named so they are easy to find and tune.
INPUT_CSV = './CSV/Combined_Log_Excl_Roil_Clean.csv'
OUTPUT_CSV = './CSV/Combined_Log_Excl_Roil_Clean_NoNeg.csv'
NEG_FRACTION_CUTOFF = 0.5  # drop a column when at least this share of its values is negative

# read the combined dataset (kept untouched in df_raw so this cell can be re-run safely)
df_raw = pd.read_csv(INPUT_CSV)

# Flag negatives in every numeric column except the first column, which is never checked.
# 'number' is the same selection the negative-count cell uses, so no numeric dtype
# (e.g. int32/float32) can slip past the cleaning step yet be counted by the check.
is_negative = df_raw[df_raw.columns[1:]].select_dtypes(include=['number']) < 0

# 1. Drop columns whose share of negative values reaches the cutoff (50% or more)
neg_share = is_negative.mean()
cols_to_drop = neg_share[neg_share >= NEG_FRACTION_CUTOFF].index

# 2. Drop rows that still contain a negative in any of the remaining checked columns.
#    A boolean mask is used instead of dropping by index label, so the result is
#    correct even if index labels were ever repeated.
remaining_negative = is_negative.drop(columns=cols_to_drop)
rows_with_neg = remaining_negative.any(axis=1)
df = df_raw.drop(columns=cols_to_drop).loc[~rows_with_neg]

# Post-condition: none of the checked columns may hold a negative value any more.
assert not (df[remaining_negative.columns] < 0).any().any(), 'negative values remain after cleaning'

# save to csv (the original row labels are intentionally not written out)
df.to_csv(OUTPUT_CSV, index=False)

df.head()

Unnamed: 0,date,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,...,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,DTCOLNVHFNM,DTCTHFNM,INVEST,VIXCLSx,Real Oil Prices
1,1992-03,8.986886,8.839001,4.024816,13.54428,12.005919,4.13435,4.262926,4.237888,4.394678,...,4.065979,2.478218,2.621766,2.426571,4.330733,11.050763,12.159442,6.619469,2.786412,13.601725
2,1992-04,8.990617,8.841752,4.025655,13.550922,12.011936,4.142042,4.270416,4.245491,4.404083,...,4.069727,2.481568,2.622492,2.430099,4.346399,11.071004,12.159567,6.628021,2.784301,14.512195
4,1992-06,9.000118,8.851692,4.033497,13.55267,12.020207,4.145887,4.273481,4.248251,4.405713,...,4.072355,2.48574,2.630449,2.43449,4.387014,11.051223,12.169043,6.652466,2.691453,15.981442
5,1992-07,8.999322,8.850346,4.036839,13.565973,12.027287,4.154825,4.283439,4.259519,4.418656,...,4.076435,2.484907,2.623218,2.436242,4.338597,11.06062,12.169643,6.666419,2.588035,15.501779
9,1992-11,8.996615,8.847863,4.05088,13.569488,12.049436,4.163143,4.292269,4.269875,4.429636,...,4.087739,2.491551,2.630449,2.441477,4.446174,11.042298,12.193706,6.705386,2.680611,14.313863


### Row and column information

In [4]:
# Summarise the cleaned frame: index range, number of columns, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97 entries, 1 to 259
Columns: 124 entries, date to Real Oil Prices
dtypes: float64(123), object(1)
memory usage: 94.7+ KB


### Descriptive statistics of `Real Oil Prices` in the cleaned dataset (this column is excluded from the log transformation)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the real oil price column
oil_price_summary = df['Real Oil Prices'].describe()
print("Descriptive Statistics for 'Real Oil Prices':")
print(oil_price_summary)

Descriptive Statistics for 'Real Oil Prices':
count    97.000000
mean     21.242248
std       8.625632
min       9.924812
25%      13.601725
50%      19.610459
75%      29.439677
max      38.771314
Name: Real Oil Prices, dtype: float64


### Count Missing Values (blanks and NaNs) in the dataset

In [6]:
# isna() marks each missing cell (NaN/None) as True; summing the flags counts them per column.
# Empty strings are not treated as missing unless they were already parsed as NaN on load.
missing_values_count = df.isna().sum()
total_missing_values = missing_values_count.sum()
print(f'Total missing values: {total_missing_values}')
print(missing_values_count)

Total missing values: 0
date               0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 124, dtype: int64


### Count Zeros

In [7]:
# Per-column count of cells that are exactly zero, plus the grand total
is_zero = df.eq(0)
zero_values_count = is_zero.astype(int).sum(axis=0)
total_zero_values = zero_values_count.sum()
print(f'Total zero values: {total_zero_values}')
print(zero_values_count)

Total zero values: 6
date               0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 124, dtype: int64


### Count Negative Values

In [8]:
# Select only the numeric columns from DataFrame
numeric_df = df.select_dtypes(include=['number'])

negative_values_count = (numeric_df < 0).sum()
total_negative_values = negative_values_count.sum()
print(f'Total negative values: {total_negative_values}')
print(negative_values_count)

Total negative values: 0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
RETAILx            0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 123, dtype: int64
