In [153]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# uncomment once you paste your myclassifiers.py into mysklearn package
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Bitcoin Price Direction Prediction - Data Preprocessing Pipeline

This notebook preprocesses the Bitcoin sentiment dataset for classification with MyRandomForestClassifier.

In [154]:
import pandas as pd
import numpy as np
from mysklearn.myclassifiers import MyDecisionTreeClassifier
from mysklearn.myevaluation import (
    stratified_kfold_split, 
    confusion_matrix, 
    accuracy_score,
    binary_precision_score, 
    binary_recall_score, 
    binary_f1_score
)


## Step 1: Load and Examine the Dataset

In [155]:
# Load the bitcoin sentiment dataset
df = pd.read_csv('input_data/bitcoin_sentiment.csv')

# Print dataset shape
print("Dataset Shape:")
print(f"  Rows: {df.shape[0]}")
print(f"  Columns: {df.shape[1]}")
print()


# Print headers (column names)
print("Column Headers:")
print(df.columns.tolist())
print()

# Print first few rows
print("First 5 Rows:")
print(df.head())
print()

Dataset Shape:
  Rows: 1074
  Columns: 28

Column Headers:
['Unnamed: 0', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'datetime_utc', 'merge_date', 'domestic_series', 'federal_financing_bank', 'foreign_series', 'government_account_series', 'government_account_series_inflation_securities', 'special_purpose_vehicle', 'state_and_local_government_series', 'total_interest-bearing_debt', 'total_marketable', 'total_non-marketable', 'treasury_bills', 'treasury_bonds', 'treasury_floating_rate_notes_(frn)', 'treasury_inflation-protected_securities_(tips)', 'treasury_notes', 'united_states_savings_inflation_securities', 'united_states_savings_securities', 'weighted_sentiment', 'sentiment_missing']

First 5 Rows:
   Unnamed: 0      timestamp      open      high       low     close  \
0           0  1669852800000  17165.44  17317.80  16855.00  16980.08   
1           1  1669939200000  16980.07  17108.25  16791.02  17094.71   
2           2  1670025600000  17094.25  17158.42  16863.58  16

In [156]:
# Print label distribution 
label_column = df.columns[5]
print(f"Label Distribution ('{label_column}'):")
print(df[label_column])
print()
print("Label Proportions:")
print(df[label_column])

Label Distribution ('close'):
0        16980.08
1        17094.71
2        16888.53
3        17108.90
4        16966.05
          ...    
1069    101468.15
1070    103869.00
1071    101290.50
1072    103284.27
1073    102249.20
Name: close, Length: 1074, dtype: float64

Label Proportions:
0        16980.08
1        17094.71
2        16888.53
3        17108.90
4        16966.05
          ...    
1069    101468.15
1070    103869.00
1071    101290.50
1072    103284.27
1073    102249.20
Name: close, Length: 1074, dtype: float64


In [157]:
# Examine the weighted_sentiment column more closely
print("Weighted Sentiment Statistics:")
print(df['weighted_sentiment'].describe())
print()

# Check for missing values
if df.isnull().values.any():
    print("Missing Values per Column:")
    print(df.isnull().sum())
else:
    print("No missing values found in the dataset.")
print()

print("-" * 70)
print()

# Check sentiment_missing column values for anything other than zero
print("Checking for rows where 'sentiment_missing' != 0:")
if (df['sentiment_missing'] != 0).any():
    print(df[df['sentiment_missing'] != 0])
else:
    print("No rows with sentiment_missing != 0 found.")
print()

Weighted Sentiment Statistics:
count    1074.000000
mean        0.347973
std         0.274657
min        -0.749771
25%         0.171151
50%         0.376796
75%         0.540075
max         0.952912
Name: weighted_sentiment, dtype: float64

No missing values found in the dataset.

----------------------------------------------------------------------

Checking for rows where 'sentiment_missing' != 0:
No rows with sentiment_missing != 0 found.



In [158]:
# Create binary classification label from weighted_sentiment
# Positive sentiment (>0) vs Negative/Neutral sentiment (<=0)
df['sentiment_label'] = df['weighted_sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative/Neutral')

print("Classification Label Distribution:")
print(df['sentiment_label'].value_counts())
print()
print("Label Proportions:")
print(df['sentiment_label'].value_counts(normalize=True))
print()
print(f"Total instances: {len(df)}")

Classification Label Distribution:
sentiment_label
Positive            949
Negative/Neutral    125
Name: count, dtype: int64

Label Proportions:
sentiment_label
Positive            0.883613
Negative/Neutral    0.116387
Name: proportion, dtype: float64

Total instances: 1074


---

## Step 2: Create Classification Label (Price Direction)

Convert the continuous `close` price into a binary classification target by comparing each day's closing price with the previous day.

In [159]:
# Reload the original dataset
df_original = pd.read_csv('input_data/bitcoin_sentiment.csv')

# Create the discretized label by comparing close with previous day's close
# First row will be dropped since there's no previous day
df_original['price_direction'] = 'Down'  # Default value

# Compare current close with previous close
for i in range(1, len(df_original)):
    if df_original.loc[i, 'close'] > df_original.loc[i-1, 'close']:
        df_original.loc[i, 'price_direction'] = 'Up'
    else:
        df_original.loc[i, 'price_direction'] = 'Down'

# Remove the first row (no previous day to compare)
df_discretized = df_original.iloc[1:].copy()
df_discretized = df_discretized.reset_index(drop=True)

print("Discretized Label Distribution (price_direction):")
print(df_discretized['price_direction'].value_counts())
print()
print("Label Proportions:")
print(df_discretized['price_direction'].value_counts(normalize=True))
print()
print(f"Total instances after discretization: {len(df_discretized)}")
print(f"(Original: {len(df_original)}, Removed first row: 1)")

Discretized Label Distribution (price_direction):
price_direction
Up      543
Down    530
Name: count, dtype: int64

Label Proportions:
price_direction
Up      0.506058
Down    0.493942
Name: proportion, dtype: float64

Total instances after discretization: 1073
(Original: 1074, Removed first row: 1)


## Step 3: Drop Unnecessary Features

Remove temporal columns, identifiers, and constant features that provide no predictive value.

In [160]:
# Drop unnecessary columns (temporal, IDs, and constant features)
columns_to_drop = ['Unnamed: 0', 'timestamp', 'datetime_utc', 'merge_date', 'sentiment_missing', 'domestic_series']
df_clean = df_discretized.drop(columns=columns_to_drop)

print("Dataset after dropping unnecessary columns:")
print(f"  Rows: {df_clean.shape[0]}")
print(f"  Columns: {df_clean.shape[1]}")
print(f"  Dropped: {columns_to_drop}")
print()

Dataset after dropping unnecessary columns:
  Rows: 1073
  Columns: 23
  Dropped: ['Unnamed: 0', 'timestamp', 'datetime_utc', 'merge_date', 'sentiment_missing', 'domestic_series']



## Step 4: Normalize Numeric Features

Apply z-score normalization to scale all numeric features to mean=0 and std=1.

In [161]:
# Identify numeric columns (exclude the label column 'price_direction')
numeric_columns = df_clean.select_dtypes(include=[np.number]).columns.tolist()

print(f"Found {len(numeric_columns)} numeric features to normalize")
print()

# Check the scale of numeric features before normalization
print("Feature ranges before normalization (first 5):")
for col in numeric_columns[:5]:
    print(f"  {col}: [{df_clean[col].min():.2f}, {df_clean[col].max():.2f}]")
print(f"  ... and {len(numeric_columns) - 5} more")
print()

Found 22 numeric features to normalize

Feature ranges before normalization (first 5):
  open: [16439.98, 124723.57]
  high: [16621.00, 126296.00]
  low: [16273.40, 123115.77]
  close: [16439.74, 124720.09]
  volume: [1227.77, 65575.10]
  ... and 17 more



### 4.1 Identify Numeric Features

In [162]:
# Apply z-score normalization: (x - mean) / std
# This transforms each feature to have mean=0 and std=1

df_normalized = df_clean.copy()

for col in numeric_columns:
    mean = df_normalized[col].mean()
    std = df_normalized[col].std()
    
    # Avoid division by zero for constant columns
    if std > 0:
        df_normalized[col] = (df_normalized[col] - mean) / std
    else:
        print(f"Warning: {col} has std=0, skipping normalization")

print("✓ Numeric features standardized (z-score normalization)")
print()

# Verify normalization
print("Feature statistics after normalization:")
print(df_normalized[numeric_columns].describe().loc[['mean', 'std']].round(6))
print()

✓ Numeric features standardized (z-score normalization)

Feature statistics after normalization:
      open  high  low  close  volume  federal_financing_bank  foreign_series  \
mean   0.0   0.0  0.0    0.0    -0.0                     0.0             0.0   
std    1.0   1.0  1.0    1.0     1.0                     1.0             1.0   

      government_account_series  \
mean                        0.0   
std                         1.0   

      government_account_series_inflation_securities  special_purpose_vehicle  \
mean                                             0.0                     -0.0   
std                                              1.0                      1.0   

      ...  total_marketable  total_non-marketable  treasury_bills  \
mean  ...              -0.0                  -0.0             0.0   
std   ...               1.0                   1.0             1.0   

      treasury_bonds  treasury_floating_rate_notes_(frn)  \
mean             0.0                        

## Step 5: Discretize Features into Categorical Bins

Convert normalized numeric features into categorical bins for entropy-based decision tree classification.

In [163]:
# Discretize normalized numeric features into categorical bins
# Using quantile-based binning (equal frequency bins)

df_discretized_final = df_normalized.copy()

# Define binning strategy: convert normalized values to 5 categories
# Since normalized data has mean=0, std=1, we can use standard deviations as boundaries
def discretize_normalized_feature(series, n_bins=5):
    """
    Discretize a normalized feature into categorical bins.
    Uses quantile-based binning for equal frequency distribution.
    """
    # Use pandas qcut for quantile-based binning
    bins = pd.qcut(series, q=n_bins, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'], duplicates='drop')
    return bins

print("Discretizing numeric features into 5 categorical bins...")
print("Bins: VeryLow, Low, Medium, High, VeryHigh")
print()

# Track which columns get discretized
discretized_cols = []
skipped_cols = []

for col in numeric_columns:
    try:
        df_discretized_final[col] = discretize_normalized_feature(df_discretized_final[col])
        discretized_cols.append(col)
    except Exception as e:
        # Some columns might have too few unique values to discretize
        print(f"Warning: Could not discretize '{col}': {e}")
        skipped_cols.append(col)

print(f"✓ Successfully discretized {len(discretized_cols)} columns")
if skipped_cols:
    print(f"⚠ Skipped {len(skipped_cols)} columns: {skipped_cols}")
print()

Discretizing numeric features into 5 categorical bins...
Bins: VeryLow, Low, Medium, High, VeryHigh



✓ Successfully discretized 20 columns
⚠ Skipped 2 columns: ['federal_financing_bank', 'foreign_series']



### 5.1 Apply Quantile-Based Binning (5 Equal-Frequency Bins)

In [None]:
# Handle columns that couldn't be discretized (too few unique values)
# Convert them to categorical based on their actual unique values

if skipped_cols:
    print("Handling skipped columns with manual discretization:")
    for col in skipped_cols:
        unique_vals = df_discretized_final[col].nunique()
        print(f"  {col}: {unique_vals} unique values")
        
        # Apply custom discretization rules based on column name
        if col == 'federal_financing_bank':
            # Map 5 unique values directly to Bank1-Bank5 (preserves ordinality)
            sorted_unique = sorted(df_normalized[col].unique())
            bank_mapping = {val: f'Bank{i+1}' for i, val in enumerate(sorted_unique)}
            df_discretized_final[col] = df_normalized[col].map(bank_mapping)
            print(f"    Direct mapping: {len(sorted_unique)} unique values → Bank1-Bank{len(sorted_unique)}")
            print(f"    Distribution: {df_discretized_final[col].value_counts().sort_index().to_dict()}")
            
        elif col == 'foreign_series':
            # Bin into 0 or 1 (binary) using the original normalized data
            df_discretized_final[col] = pd.cut(df_normalized[col], bins=2, labels=['0', '1'])
            
        elif unique_vals == 1:
            # Constant feature
            df_discretized_final[col] = 'Constant'
            
        elif unique_vals == 2:
            df_discretized_final[col] = pd.cut(df_normalized[col], bins=2, labels=['Low', 'High'])
            
        elif unique_vals <= 5:
            # For 3-5 unique values, map directly to preserve ordinal structure
            sorted_unique = sorted(df_normalized[col].unique())
            level_mapping = {val: f'Level_{i}' for i, val in enumerate(sorted_unique)}
            df_discretized_final[col] = df_normalized[col].map(level_mapping)
            
        else:
            # Default to 5 bins for columns with more unique values
            df_discretized_final[col] = pd.qcut(df_normalized[col], q=5, 
                                                 labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'], 
                                                 duplicates='drop')
    
    print("✓ Skipped columns handled with custom discretization")
    print()

Handling skipped columns with manual discretization:
  federal_financing_bank: 4 unique values
  foreign_series: 2 unique values



In [165]:
# Convert all object columns to category dtype for better memory efficiency
for col in df_discretized_final.columns:
    if df_discretized_final[col].dtype == 'object':
        df_discretized_final[col] = df_discretized_final[col].astype('category')

print("✓ All categorical features converted to 'category' dtype")
print()

# Verify discretization - check value distributions
print("Sample of discretized features:")
print(df_discretized_final.head(10))
print()

# Show distribution for a few key features
sample_features = ['close', 'volume', 'weighted_sentiment', 'federal_financing_bank', 'foreign_series']
for feature in sample_features:
    if feature in df_discretized_final.columns:
        print(f"\nDistribution of discretized '{feature}':")
        print(df_discretized_final[feature].value_counts().sort_index())

print()
print("Data types after discretization:")
print(df_discretized_final.dtypes)

✓ All categorical features converted to 'category' dtype

Sample of discretized features:
      open     high      low    close    volume federal_financing_bank  \
0  VeryLow  VeryLow  VeryLow  VeryLow  VeryHigh                  Bank4   
1  VeryLow  VeryLow  VeryLow  VeryLow      High                  Bank4   
2  VeryLow  VeryLow  VeryLow  VeryLow      High                  Bank4   
3  VeryLow  VeryLow  VeryLow  VeryLow  VeryHigh                  Bank4   
4  VeryLow  VeryLow  VeryLow  VeryLow  VeryHigh                  Bank4   
5  VeryLow  VeryLow  VeryLow  VeryLow  VeryHigh                  Bank4   
6  VeryLow  VeryLow  VeryLow  VeryLow  VeryHigh                  Bank4   
7  VeryLow  VeryLow  VeryLow  VeryLow  VeryHigh                  Bank4   
8  VeryLow  VeryLow  VeryLow  VeryLow       Low                  Bank4   
9  VeryLow  VeryLow  VeryLow  VeryLow    Medium                  Bank4   

  foreign_series government_account_series  \
0              1                   VeryLow   
1  

## Step 6: Save Preprocessed Dataset

Export the final preprocessed dataset ready for classification.

In [166]:
# Save the final preprocessed dataset (ready for classification)
output_file_final = 'input_data/bitcoin_sentiment_discretized.csv'
df_discretized_final.to_csv(output_file_final, index=False)

print(f"✓ Final preprocessed dataset saved to: {output_file_final}")
print(f"  Total instances: {len(df_discretized_final)}")
print(f"  Total features: {len(df_discretized_final.columns) - 1}")
print(f"  Label column: 'price_direction'")
print()
print("Dataset is now ready for MyRandomForestClassifier!")
print()
print("Preprocessing Pipeline Summary:")
print(f"  1. Created binary label from daily price changes (Up/Down)")
print(f"  2. Dropped {len(columns_to_drop)} unnecessary columns")
print(f"  3. Normalized {len(numeric_columns)} numeric features (z-score)")
print(f"  4. Discretized into categorical bins (VeryLow to VeryHigh)")
print(f"  5. Label distribution: {df_discretized_final['price_direction'].value_counts().to_dict()}")
print()

✓ Final preprocessed dataset saved to: input_data/bitcoin_sentiment_discretized.csv
  Total instances: 1073
  Total features: 22
  Label column: 'price_direction'

Dataset is now ready for MyRandomForestClassifier!

Preprocessing Pipeline Summary:
  1. Created binary label from daily price changes (Up/Down)
  2. Dropped 6 unnecessary columns
  3. Normalized 22 numeric features (z-score)
  4. Discretized into categorical bins (VeryLow to VeryHigh)
  5. Label distribution: {'Up': 543, 'Down': 530}

