### Import of libraries and defining paths to files

In [3]:
import numpy as np
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [4]:
BASE_PATH = r"C:\Users\micha\OneDrive\Pulpit\Project\Bitcoin_pred\data"

PATH_TO_ORIGINAL_DATA = f"{BASE_PATH}\\Bitcoin.csv"

# Path to cleaned data with missing values removed
PATH_TO_CLEANED_DATA = f"{BASE_PATH}/cleaned_data.csv"

# Path to the file with the dependent variable
PATH_TO_TARGET_DATA = f"{BASE_PATH}/target_data.csv"

# Path to the file with universal features created for time series (e.g., quarter, month, day of the week)
PATH_TO_TIME_FEATURES = f"{BASE_PATH}/time_features.csv"

# Path to the file with features specific to the data
PATH_TO_SPECIFIC_FEATURES = f"{BASE_PATH}/specific_features.csv"

# Paths to files with training, validation, and testing data
PATH_TO_TRAINING_DATA = f"{BASE_PATH}/training_data.csv"
PATH_TO_VALIDATION_DATA = f"{BASE_PATH}/validation_data.csv"
PATH_TO_TESTING_DATA = f"{BASE_PATH}/testing_data.csv"

### Analysis of raw data and creation of the target variable

In [5]:
bitcoin_df = pd.read_csv(PATH_TO_ORIGINAL_DATA)

In [6]:
bitcoin_df

Unnamed: 0,date,price,total_volume,market_cap,coin_name
0,2015-01-01 00:00:00.000,313.992000,4.699936e+07,4.293958e+09,bitcoin
1,2015-01-02 00:00:00.000,314.446000,3.885591e+07,4.301448e+09,bitcoin
2,2015-01-03 00:00:00.000,286.572000,1.187789e+08,3.921358e+09,bitcoin
3,2015-01-04 00:00:00.000,260.936000,2.055001e+08,3.571640e+09,bitcoin
4,2015-01-05 00:00:00.000,273.220000,1.550381e+08,3.740880e+09,bitcoin
...,...,...,...,...,...
3295,2024-01-10 00:00:00.000,46105.946078,3.988792e+10,9.021669e+11,bitcoin
3296,2024-01-11 00:00:00.000,46632.313148,5.203006e+10,9.152593e+11,bitcoin
3297,2024-01-12 00:00:00.000,46314.355542,4.919813e+10,9.098464e+11,bitcoin
3298,2024-01-13 00:00:00.000,42893.929606,4.591330e+10,8.383835e+11,bitcoin


In [7]:
bitcoin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3300 entries, 0 to 3299
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          3300 non-null   object 
 1   price         3300 non-null   float64
 2   total_volume  3300 non-null   float64
 3   market_cap    3299 non-null   float64
 4   coin_name     3300 non-null   object 
dtypes: float64(3), object(2)
memory usage: 129.0+ KB


In [8]:
unique_values = bitcoin_df['coin_name'].unique()
print(unique_values)

['bitcoin']


The value in the "coin_name" column does not provide any information for the model, therefore it will be removed. There is one value missing in the "market_cap" column, which will require further analysis. It is also necessary to change the data type in the "date" column to datetime.

In [9]:
bitcoin_df.drop(['coin_name'],axis=1,inplace=True)
bitcoin_df['date'] = pd.to_datetime(bitcoin_df['date'], format='%Y-%m-%d')
rows_with_null = bitcoin_df[bitcoin_df['market_cap'].isnull()]
print(rows_with_null)

          date     price  total_volume  market_cap
821 2017-04-02  1098.068  8.023151e+07         NaN


In [10]:
filtered_df = bitcoin_df[(bitcoin_df['date'] >= '2017-03-28') & (bitcoin_df['date'] <= '2017-04-10')]
print(filtered_df)

          date        price  total_volume    market_cap
816 2017-03-28  1044.722000  1.276436e+08  1.696879e+10
817 2017-03-29  1038.617198  1.211691e+08  1.687138e+10
818 2017-03-30  1031.492906  1.024388e+08  1.675784e+10
819 2017-03-31  1078.274711  1.332871e+08  1.751958e+10
820 2017-04-01  1085.253000  8.775043e+07  1.763483e+10
821 2017-04-02  1098.068000  8.023151e+07           NaN
822 2017-04-03  1139.308275  1.317652e+08  1.851790e+10
823 2017-04-04  1140.497590  1.082230e+08  1.853906e+10
824 2017-04-05  1132.339905  8.743962e+07  1.840873e+10
825 2017-04-06  1194.211000  1.283093e+08  1.941680e+10
826 2017-04-07  1189.775124  9.084018e+07  1.934705e+10
827 2017-04-08  1182.271000  4.389967e+07  1.922711e+10
828 2017-04-09  1187.699000  4.115143e+07  1.931619e+10
829 2017-04-10  1210.515000  7.605022e+07  1.969102e+10


In [11]:
bitcoin_df['market_cap'] = bitcoin_df['market_cap'].interpolate(method='linear')

In [12]:
bitcoin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3300 entries, 0 to 3299
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          3300 non-null   datetime64[ns]
 1   price         3300 non-null   float64       
 2   total_volume  3300 non-null   float64       
 3   market_cap    3300 non-null   float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 103.2 KB


The missing value in the "market_cap" column was filled using a linear value between two available values. The next step will be to prepare the target variable, i.e., the value of bitcoin price 3 days ahead.

In [13]:
# Create the target variable by shifting the values from the "price" column forward by 3 days.
bitcoin_df['price_target'] = bitcoin_df['price'].shift(3)

# Remove the first 3 rows, which do not have the target variable.
bitcoin_df = bitcoin_df.iloc[3:]

# Set the "date" variable as the index to facilitate further analysis.
bitcoin_df.set_index('date', inplace=True)

In [14]:
bitcoin_df

Unnamed: 0_level_0,price,total_volume,market_cap,price_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-04,260.936000,2.055001e+08,3.571640e+09,313.992000
2015-01-05,273.220000,1.550381e+08,3.740880e+09,314.446000
2015-01-06,285.373800,9.700290e+07,3.908408e+09,286.572000
2015-01-07,295.872500,1.106742e+08,4.053239e+09,260.936000
2015-01-08,284.452500,8.657054e+07,3.897824e+09,273.220000
...,...,...,...,...
2024-01-10,46105.946078,3.988792e+10,9.021669e+11,43956.120717
2024-01-11,46632.313148,5.203006e+10,9.152593e+11,43883.743879
2024-01-12,46314.355542,4.919813e+10,9.098464e+11,46936.185561
2024-01-13,42893.929606,4.591330e+10,8.383835e+11,46105.946078


In [15]:
# Save the remaining columns (all except the last one) to PATH_TO_CLEANED_DATA
bitcoin_df.iloc[:, :-1].to_csv(PATH_TO_CLEANED_DATA)

# Save the last column to PATH_TO_TARGET_DATA
bitcoin_df.iloc[:, -1].to_csv(PATH_TO_TARGET_DATA)