In [1]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# uncomment once you paste your myclassifiers.py into mysklearn package
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

In [None]:
import pandas as pd
import numpy as np
from mysklearn.myclassifiers import MyDecisionTreeClassifier
from mysklearn.myevaluation import (
    stratified_kfold_split, 
    confusion_matrix, 
    accuracy_score,
    binary_precision_score, 
    binary_recall_score, 
    binary_f1_score
)


# Load Bitcoin Sentiment Dataset

In [31]:
# Load the bitcoin sentiment dataset
df = pd.read_csv('input_data/bitcoin_sentiment.csv')

# Print dataset shape
print("Dataset Shape:")
print(f"  Rows: {df.shape[0]}")
print(f"  Columns: {df.shape[1]}")
print()


# Print headers (column names)
print("Column Headers:")
print(df.columns.tolist())
print()

# Print first few rows
print("First 5 Rows:")
print(df.head())
print()

Dataset Shape:
  Rows: 1074
  Columns: 28

Column Headers:
['Unnamed: 0', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'datetime_utc', 'merge_date', 'domestic_series', 'federal_financing_bank', 'foreign_series', 'government_account_series', 'government_account_series_inflation_securities', 'special_purpose_vehicle', 'state_and_local_government_series', 'total_interest-bearing_debt', 'total_marketable', 'total_non-marketable', 'treasury_bills', 'treasury_bonds', 'treasury_floating_rate_notes_(frn)', 'treasury_inflation-protected_securities_(tips)', 'treasury_notes', 'united_states_savings_inflation_securities', 'united_states_savings_securities', 'weighted_sentiment', 'sentiment_missing']

First 5 Rows:
   Unnamed: 0      timestamp      open      high       low     close  \
0           0  1669852800000  17165.44  17317.80  16855.00  16980.08   
1           1  1669939200000  16980.07  17108.25  16791.02  17094.71   
2           2  1670025600000  17094.25  17158.42  16863.58  16

In [25]:
# Print label distribution 
label_column = df.columns[5]
print(f"Label Distribution ('{label_column}'):")
print(df[label_column])
print()
print("Label Proportions:")
print(df[label_column])

Label Distribution ('close'):
0        16980.08
1        17094.71
2        16888.53
3        17108.90
4        16966.05
          ...    
1069    101468.15
1070    103869.00
1071    101290.50
1072    103284.27
1073    102249.20
Name: close, Length: 1074, dtype: float64

Label Proportions:
0        16980.08
1        17094.71
2        16888.53
3        17108.90
4        16966.05
          ...    
1069    101468.15
1070    103869.00
1071    101290.50
1072    103284.27
1073    102249.20
Name: close, Length: 1074, dtype: float64


In [44]:
# Examine the weighted_sentiment column more closely
print("Weighted Sentiment Statistics:")
print(df['weighted_sentiment'].describe())
print()

# Check for missing values
if df.isnull().values.any():
    print("Missing Values per Column:")
    print(df.isnull().sum())
else:
    print("No missing values found in the dataset.")
print()

print("-" * 70)
print()

# Check sentiment_missing column values for anything other than zero
print("Checking for rows where 'sentiment_missing' != 0:")
if (df['sentiment_missing'] != 0).any():
    print(df[df['sentiment_missing'] != 0])
else:
    print("No rows with sentiment_missing != 0 found.")
print()

Weighted Sentiment Statistics:
count    1074.000000
mean        0.347973
std         0.274657
min        -0.749771
25%         0.171151
50%         0.376796
75%         0.540075
max         0.952912
Name: weighted_sentiment, dtype: float64

No missing values found in the dataset.

----------------------------------------------------------------------

Checking for rows where 'sentiment_missing' != 0:
No rows with sentiment_missing != 0 found.



In [7]:
# Create binary classification label from weighted_sentiment
# Positive sentiment (>0) vs Negative/Neutral sentiment (<=0)
df['sentiment_label'] = df['weighted_sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative/Neutral')

print("Classification Label Distribution:")
print(df['sentiment_label'].value_counts())
print()
print("Label Proportions:")
print(df['sentiment_label'].value_counts(normalize=True))
print()
print(f"Total instances: {len(df)}")

Classification Label Distribution:
sentiment_label
Positive            949
Negative/Neutral    125
Name: count, dtype: int64

Label Proportions:
sentiment_label
Positive            0.883613
Negative/Neutral    0.116387
Name: proportion, dtype: float64

Total instances: 1074


# Discretize Close Price by Daily Change

In [18]:
# Reload the original dataset
df_original = pd.read_csv('input_data/bitcoin_sentiment.csv')

# Create the discretized label by comparing close with previous day's close
# First row will be dropped since there's no previous day
df_original['price_direction'] = 'Down'  # Default value

# Compare current close with previous close
for i in range(1, len(df_original)):
    if df_original.loc[i, 'close'] > df_original.loc[i-1, 'close']:
        df_original.loc[i, 'price_direction'] = 'Up'
    else:
        df_original.loc[i, 'price_direction'] = 'Down'

# Remove the first row (no previous day to compare)
df_discretized = df_original.iloc[1:].copy()
df_discretized = df_discretized.reset_index(drop=True)

print("Discretized Label Distribution (price_direction):")
print(df_discretized['price_direction'].value_counts())
print()
print("Label Proportions:")
print(df_discretized['price_direction'].value_counts(normalize=True))
print()
print(f"Total instances after discretization: {len(df_discretized)}")
print(f"(Original: {len(df_original)}, Removed first row: 1)")

Discretized Label Distribution (price_direction):
price_direction
Up      543
Down    530
Name: count, dtype: int64

Label Proportions:
price_direction
Up      0.506058
Down    0.493942
Name: proportion, dtype: float64

Total instances after discretization: 1073
(Original: 1074, Removed first row: 1)


In [46]:
# Eliminate unnecessary, sequential columns from the discretized dataset 
df_final = df_discretized.drop(columns=['Unnamed: 0','timestamp','datetime_utc', 'merge_date', 'sentiment_missing'])

print("Final Dataset Shape:"),
print(f"  Rows: {df_final.shape[0]}")
print(f"  Columns: {df_final.shape[1]}")
print()



Final Dataset Shape:
  Rows: 1073
  Columns: 24



In [48]:
# Save the discretized dataset to a new CSV file
output_file = 'input_data/bitcoin_sentiment_discretized.csv'
df_final.to_csv(output_file, index=False)

print(f"✓ Discretized dataset saved to: {output_file}")
print(f"  Total instances: {len(df_final)}")
print(f"  Total columns: {len(df_final.columns)}")
print()
print("Sample of discretized data:")
print(df_final[['open', 'close', 'price_direction']].head(10))

✓ Discretized dataset saved to: input_data/bitcoin_sentiment_discretized.csv
  Total instances: 1073
  Total columns: 24

Sample of discretized data:
       open     close price_direction
0  16980.07  17094.71              Up
1  17094.25  16888.53            Down
2  16889.17  17108.90              Up
3  17108.90  16966.05            Down
4  16966.05  17089.05              Up
5  17089.18  16840.00            Down
6  16839.76  17226.01              Up
7  17226.03  17130.59            Down
8  17130.49  17128.10            Down
9  17128.10  17085.21            Down


# Normalize/Standardize Numeric Features

In [50]:
# Identify numeric columns (exclude the label column)
numeric_columns = df_final.select_dtypes(include=[np.number]).columns.tolist()

# Check the scale of numeric features before normalization
print("Feature ranges before normalization:")
for col in numeric_columns:
    print(f"  {col}: [{df_final[col].min():.2f}, {df_final[col].max():.2f}]")
print()

Feature ranges before normalization:
  open: [16439.98, 124723.57]
  high: [16621.00, 126296.00]
  low: [16273.40, 123115.77]
  close: [16439.74, 124720.09]
  volume: [1227.77, 65575.10]
  domestic_series: [7.58, 7.58]
  federal_financing_bank: [2.39, 2.58]
  foreign_series: [0.00, 7.31]
  government_account_series: [2.13, 3.17]
  government_account_series_inflation_securities: [0.99, 1.31]
  special_purpose_vehicle: [2.89, 4.17]
  state_and_local_government_series: [1.81, 3.85]
  total_interest-bearing_debt: [2.22, 3.37]
  total_marketable: [2.24, 3.42]
  total_non-marketable: [2.13, 3.19]
  treasury_bills: [3.46, 5.45]
  treasury_bonds: [3.01, 3.34]
  treasury_floating_rate_notes_(frn): [3.90, 5.54]
  treasury_inflation-protected_securities_(tips): [0.49, 0.96]
  treasury_notes: [1.68, 3.12]
  united_states_savings_inflation_securities: [3.08, 10.15]
  united_states_savings_securities: [2.69, 3.49]
  weighted_sentiment: [-0.75, 0.95]



In [52]:
# Standardize numeric features using z-score normalization
# Formula: (x - mean) / std_dev
# This transforms each feature to have mean=0 and std=1

df_normalized = df_final.copy()

for col in numeric_columns:
    mean = df_normalized[col].mean()
    std = df_normalized[col].std()
    
    # Avoid division by zero for constant columns
    if std > 0:
        df_normalized[col] = (df_normalized[col] - mean) / std
    else:
        print(f"Warning: {col} has std=0, skipping normalization")

print("Numeric features standardized (z-score normalization)")
print()

# Verify normalization
print("Feature statistics after normalization:")
print(df_normalized[numeric_columns].describe().loc[['mean', 'std']])

Numeric features standardized (z-score normalization)

Feature statistics after normalization:
              open          high           low         close        volume  \
mean  5.297616e-17  2.119046e-16  5.297616e-17  3.178570e-16 -2.119046e-16   
std   1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00   

      domestic_series  federal_financing_bank  foreign_series  \
mean     5.288539e-03            7.946424e-16    2.648808e-17   
std      1.206195e-16            1.000000e+00    1.000000e+00   

      government_account_series  \
mean               3.708331e-16   
std                1.000000e+00   

      government_account_series_inflation_securities  ...  total_marketable  \
mean                                    2.648808e-16  ...     -1.059523e-16   
std                                     1.000000e+00  ...      1.000000e+00   

      total_non-marketable  treasury_bills  treasury_bonds  \
mean         -1.059523e-16    5.032735e-16    5.827378e-16   
std   

In [54]:
# Save the normalized dataset
output_file_normalized = 'input_data/bc_dataset_discretized_normalized.csv'
df_normalized.to_csv(output_file_normalized, index=False)

print(f"  Normalized dataset saved to: {output_file_normalized}")
print(f"  Total instances: {len(df_normalized)}")
print(f"  Total columns: {len(df_normalized.columns)}")
print()

# Display sample
print("Sample of normalized data:")
print(df_normalized.head())
print()

  Normalized dataset saved to: input_data/bc_dataset_discretized_normalized.csv
  Total instances: 1073
  Total columns: 24

Sample of normalized data:
       open      high       low     close    volume  domestic_series  \
0 -1.406666 -1.416647 -1.401290 -1.405934  1.199281         0.005289   
1 -1.403161 -1.415129 -1.399025 -1.412264  0.200503         0.005289   
2 -1.409456 -1.413872 -1.398423 -1.405499  0.519543         0.005289   
3 -1.402712 -1.407077 -1.398974 -1.409884  2.365023         0.005289   
4 -1.407096 -1.416553 -1.397762 -1.406108  1.597814         0.005289   

   federal_financing_bank  foreign_series  government_account_series  \
0                1.017022        2.816783                  -3.129045   
1                1.017022        2.816783                  -3.129045   
2                1.017022        2.816783                  -3.129045   
3                1.017022        2.816783                  -3.129045   
4                1.017022        2.816783              