## Dataset After Log Transformation - as per client's requirement
### Real Oil Prices not transformed

In [17]:
import pandas as pd
import numpy as np

# read the combined dataset
df = pd.read_csv('Combined_Dataset_After_CPI.csv')

# list of percentage columns provided by client
percentages = ['UNRATE', 'FEDFUNDS', 'CP3Mx', 'TB3MS', 'TB6MS', 'GS1', 'GS5', 'GS10', 'AAA', 'BAA', 'COMPAPFF', 'TB3SMFFM', 'TB6SMFFM', 'T1YFFM', 'T5YFFM', 'T10YFFM', 'AAAFFM', 'BAAFFM']

# exclude 'date' column, Real Oil Prices and percentage columns
excluded_columns = percentages + ['date'] + ['Real Oil Prices']

# apply log transformation to all other numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns.difference(excluded_columns)

# Check for zero values
zero_values_count = (df[numeric_columns] == 0).sum()
print(f'Total zero values: {zero_values_count.sum()}')

# Check for negative values
negative_values_count = (df[numeric_columns] < 0).sum()
print(f'Total negative values: {negative_values_count.sum()}')

# apply log transformation using numpy's log function
# adding a small number to avoid log(0) which is undefined
small_constant = 1e-6
# applymap() applies the function to each element of the dataframe
df[numeric_columns] = df[numeric_columns].applymap(lambda x: np.log(x + small_constant) if x > 0 else np.nan)

# Check the transformation
print(df[numeric_columns].head())

Total zero values: 10
Total negative values: 99
   ACOGNO    AMDMNOx    AMDMUOx    ANDENOx    AWHMAN   AWOTMAN   BOGMBASE  \
0     NaN  11.520235  12.788297  10.247722  3.706228  1.223776  12.309080   
1     NaN  11.500043  12.794584  10.288905  3.706228  1.193923  12.291332   
2     NaN  11.489681  12.803461  10.247763  3.706228  1.223776  12.307726   
3     NaN  11.471084  12.797040  10.199538  3.701302  1.193923  12.324300   
4     NaN  11.460753  12.793347  10.205587  3.706228  1.223776  12.330059   

     BUSINVx  BUSLOANS     CE16OV  ...    USGOVT     USTPU   USTRADE  \
0  13.380836  6.215384  11.598066  ...  9.724062  9.931395  9.384235   
1  13.381858  6.215983  11.594321  ...  9.726691  9.931540  9.384840   
2  13.385085  6.225226  11.597607  ...  9.726452  9.933483  9.389657   
3  13.385864  6.230038  11.598663  ...  9.727049  9.935519  9.392187   
4  13.382745  6.230387  11.599919  ...  9.728539  9.938420  9.395965   

   USWTRADE   VIXCLSx   W875RX1  WPSFD49207  WPSFD49502 

### Seeing what is the smallest non-zero value in the dataset

In [18]:
# Assuming df is your DataFrame and numeric_columns contains the columns you want to transform
smallest_nonzero = df[numeric_columns][df[numeric_columns] > 0].min()

print("Smallest non-zero values in each column:")
print(smallest_nonzero)
# if any values are close to 1e-6, then we'd consider using a smaller constant, in this case we're good

# Save the transformed dataset
df.to_csv('Combined_Dataset_After_Log_Transform_With_Real_Oil_NOT_Transformed.csv', index=False)

Smallest non-zero values in each column:
ACOGNO        11.367264
AMDMNOx       11.453453
AMDMUOx       12.783054
ANDENOx       10.188601
AWHMAN         3.648057
                ...    
W875RX1        8.688117
WPSFD49207     4.627910
WPSFD49502     4.607168
WPSID61        4.584967
WPSID62        4.449685
Length: 110, dtype: float64


### Statistical data of the real oil price dataset after log transformation

In [19]:
# Descriptive statistics of real oil price
print("Descriptive Statistics for 'Real Oil Prices':")
print(df['Real Oil Prices'].describe())

Descriptive Statistics for 'Real Oil Prices':
count    450.000000
mean       3.004940
std        0.459787
min        1.866020
25%        2.652537
50%        2.922401
75%        3.375113
max        4.120085
Name: Real Oil Prices, dtype: float64


### Count Missing Values (blanks and NaNs) in the dataset

In [20]:
missing_values_count = df.isnull().sum() # if missing or NaN values exist, they will be counted and return True
total_missing_values = missing_values_count.sum()
print(f'Total missing values: {total_missing_values}')
print(missing_values_count) 

Total missing values: 199
date               0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          1
                  ..
DTCOLNVHFNM        1
DTCTHFNM           1
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 128, dtype: int64


### Count Zeros

In [22]:
zero_values_count = (df == 0).astype(int).sum(axis=0)
total_zero_values = zero_values_count.sum()
print(f'Total zero values: {total_zero_values}')
print(zero_values_count)

Total zero values: 36
date               0
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 128, dtype: int64


### Count Negative Values

In [24]:
# Select only the numeric columns from DataFrame
numeric_df = df.select_dtypes(include=['number'])

negative_values_count = (numeric_df < 0).sum()
total_negative_values = negative_values_count.sum()
print(f'Total negative values: {total_negative_values}')
print(negative_values_count)

Total negative values: 2281
RPI                0
W875RX1            0
DPCERA3M086SBEA    0
CMRMTSPLx          0
RETAILx            0
                  ..
DTCOLNVHFNM        0
DTCTHFNM           0
INVEST             0
VIXCLSx            0
Real Oil Prices    0
Length: 127, dtype: int64
