## Combined_Log_Clean_NoNeg.csv
### Dataset derived from `Combined_Log_Clean.csv`
#### No blanks remain and all negative values are removed (columns with 50% or more negatives are dropped, then every row that still contains a negative)

In [1]:
import pandas as pd

# A column is dropped outright once at least this fraction of its values is negative.
NEG_COLUMN_THRESHOLD = 0.5

# read the combined dataset (kept untouched as raw_df so the cell can be re-run safely)
raw_df = pd.read_csv('./CSV/Combined_Log_Clean.csv')

# Candidate columns: every numeric column except the first column of the file.
# Using the generic 'number' selector (rather than a float64/int64 whitelist) keeps this
# step consistent with the negative-value count performed later in the notebook.
numeric_cols = raw_df.iloc[:, 1:].select_dtypes(include='number').columns

# 1. Drop columns that contain 50% or more negatives, excluding the first column
neg_share = (raw_df[numeric_cols] < 0).mean()  # fraction of negative entries per column
cols_to_drop = neg_share.index[neg_share >= NEG_COLUMN_THRESHOLD]
df = raw_df.drop(columns=cols_to_drop)

# 2. Drop rows that contain any negatives in the remaining numeric columns,
#    again excluding the first column. The original row labels are kept (no reset_index).
row_has_negative = (df.iloc[:, 1:].select_dtypes(include='number') < 0).any(axis=1)
df = df.loc[~row_has_negative]

# Sanity check: nothing negative may survive the two steps above
assert not (df.iloc[:, 1:].select_dtypes(include='number') < 0).any().any()

# save to csv (row labels are not written)
df.to_csv('./CSV/Combined_Log_Clean_NoNeg.csv', index=False)


### Row and column information

In [2]:
# Summarise the cleaned frame: index range, number of columns, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97 entries, 1 to 259
Columns: 124 entries, date to Real Oil Prices
dtypes: float64(123), object(1)
memory usage: 94.7+ KB


### Descriptive statistics of the real oil price series after log transformation

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the 'Real Oil Prices' column
oil_price_summary = df['Real Oil Prices'].describe()
print("Descriptive Statistics for 'Real Oil Prices':")
print(oil_price_summary)

Descriptive Statistics for 'Real Oil Prices':
count    97.000000
mean      2.973263
std       0.410823
min       2.295038
25%       2.610197
50%       2.976063
75%       3.382343
max       3.657681
Name: Real Oil Prices, dtype: float64


### Count Missing Values (blanks and NaNs) in the dataset

In [4]:
# isna() marks each missing (NaN/None) cell as True; summing those flags
# gives the number of missing entries per column.
missing_values_count = df.isna().sum(axis=0)

# Grand total across all columns
total_missing_values = missing_values_count.sum()

print(f'Total missing values: {total_missing_values}')
print(missing_values_count)

Total missing values: 0
date               0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 124, dtype: int64


### Count Zeros

In [5]:
# Mark every cell equal to zero, convert the flags to integers and tally them per column
zero_values_count = df.eq(0).astype(int).sum(axis='index')

# Grand total across all columns
total_zero_values = zero_values_count.sum()

print(f'Total zero values: {total_zero_values}')
print(zero_values_count)

Total zero values: 6
date               0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 124, dtype: int64


### Count Negative Values

In [6]:
# Select only the numeric columns from DataFrame
numeric_df = df.select_dtypes(include=['number'])

negative_values_count = (numeric_df < 0).sum()
total_negative_values = negative_values_count.sum()
print(f'Total negative values: {total_negative_values}')
print(negative_values_count)

Total negative values: 0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
RETAILx            0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 123, dtype: int64
