In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the crypto glossary; the first two CSV columns become a two-level row index
df = pd.read_csv('crypto-terminology.csv', index_col=[0, 1])
# Render the glossary without rows that contain missing values
# (the result is only displayed; `df` itself is not modified)
df.dropna(how='any')

Unnamed: 0,Unnamed: 1
Date,Indicates when the data was recorded.
Currency,Specifies the cryptocurrency(name of cryptocurrency).
Total Volume,"Indicates the trading volume, which represents the total quantity of the cryptocurrency traded during the day."
Market Cap,Market Capitalization column represents the total market value of the cryptocurrency. It is calculated by multiplying the current price (Close) by the total circulating supply of the cryptocurrency.
Price Lag,Represent the price of Bitcoin from the previous day.
Price Rolling,The average price over the past days (in our data 3 and 7 days period).
Price Rolling Standard Deviation(std),The rolling standard deviation measures the variability or volatility of the price over a set window
Bitcoin Halving,"Every 210,000 blocks, or roughly every four years, miners' rewards are cut in half."
ETF,Exchange-traded fund (ETF) is a basket of securities that trades on an exchange just like a stock does.
Bull run,"A bull market, or bull run, is defined as a period of time where the majority of investors are buying, demand outweighs supply, market confidence is at a high, and prices are rising."


In [3]:
# Load the Bitcoin table (date, price, total_volume, market_cap, coin_name)
# and preview its first five rows
df = pd.read_csv('bitcoin.csv')
df.head(5)

Unnamed: 0,date,price,total_volume,market_cap,coin_name
0,2015-01-01 00:00:00.000,313.992,46999360.0,4293958000.0,bitcoin
1,2015-01-02 00:00:00.000,314.446,38855910.0,4301448000.0,bitcoin
2,2015-01-03 00:00:00.000,286.572,118778900.0,3921358000.0,bitcoin
3,2015-01-04 00:00:00.000,260.936,205500100.0,3571640000.0,bitcoin
4,2015-01-05 00:00:00.000,273.22,155038100.0,3740880000.0,bitcoin


In [4]:
# Parse the textual 'date' column into pandas datetime values so the `.dt`
# accessor can be used on it afterwards
df = df.assign(date=pd.to_datetime(df['date']))

In [5]:
# Derive calendar features from 'date': one integer column per component.
# 'weekday' follows pandas' convention (Monday=0 ... Sunday=6).
date_parts = df['date'].dt
for part in ('year', 'month', 'day', 'weekday'):
    df[part] = getattr(date_parts, part)

In [6]:
# Count the missing entries in every column.
# The result is only stored here; it is displayed in a later cell.
missing_values = df.isna().sum()

In [10]:
# Fill gaps in 'market_cap' by linear interpolation between neighbouring rows
# rather than with the column mean. In the displayed rows market cap ranges from
# about 4e9 (2015) to about 8e11 (2024), so one global mean would sit far from
# the values surrounding most dates and create an artificial spike.
# `limit_direction='both'` also fills a gap in the very first or last row.
# NOTE(review): assumes rows are in chronological order, as in the displayed output.
df['market_cap'] = df['market_cap'].interpolate(method='linear', limit_direction='both')

# Confirm that no missing values remain (prints the NaN count, expected 0)
print("---------")
print(df['market_cap'].isnull().sum())

---------
0


In [11]:
# Lagged copies of 'price': the value from 1, 2 and 3 rows earlier.
# The first `days_back` rows of each lag column are NaN.
for days_back in (1, 2, 3):
    df[f'price_lag_{days_back}'] = df['price'].shift(days_back)

# Rolling mean and standard deviation of 'price' and 'total_volume' over
# 3-row and 7-row windows (the window includes the current row, so the first
# window-1 rows are NaN). Column order per window: price mean, price std,
# volume mean, volume std.
window_sizes = [3, 7]
for span in window_sizes:
    for label, source in (('price', 'price'), ('volume', 'total_volume')):
        rolled = df[source].rolling(window=span)
        df[f'{label}_rolling_mean_{span}'] = rolled.mean()
        df[f'{label}_rolling_std_{span}'] = rolled.std()
df

Unnamed: 0,date,price,total_volume,market_cap,coin_name,year,month,day,weekday,price_lag_1,price_lag_2,price_lag_3,price_rolling_mean_3,price_rolling_std_3,volume_rolling_mean_3,volume_rolling_std_3,price_rolling_mean_7,price_rolling_std_7,volume_rolling_mean_7,volume_rolling_std_7
0,2015-01-01,313.992000,4.699936e+07,4.293958e+09,bitcoin,2015,1,1,3,,,,,,,,,,,
1,2015-01-02,314.446000,3.885591e+07,4.301448e+09,bitcoin,2015,1,2,4,313.992000,,,,,,,,,,
2,2015-01-03,286.572000,1.187789e+08,3.921358e+09,bitcoin,2015,1,3,5,314.446000,313.992000,,305.003333,15.963617,6.821140e+07,4.398165e+07,,,,
3,2015-01-04,260.936000,2.055001e+08,3.571640e+09,bitcoin,2015,1,4,6,286.572000,314.446000,313.992000,287.318000,26.762799,1.210450e+08,8.334523e+07,,,,
4,2015-01-05,273.220000,1.550381e+08,3.740880e+09,bitcoin,2015,1,5,0,260.936000,286.572000,314.446000,273.576000,12.821707,1.597724e+08,4.355401e+07,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3306,2024-01-21,41626.107110,9.533488e+09,8.172176e+11,bitcoin,2024,1,21,6,41600.940933,41261.394798,42713.859187,41496.147614,203.690936,1.970060e+10,8.813011e+09,42105.510362,704.597840,2.028964e+10,5.430294e+09
3307,2024-01-22,41541.899457,8.032697e+09,8.147094e+11,bitcoin,2024,1,22,0,41626.107110,41600.940933,41261.394798,41589.649167,43.224532,1.399135e+10,9.052128e+09,42068.505595,729.617823,1.901303e+10,7.126548e+09
3308,2024-01-23,39504.730058,3.120285e+10,7.750209e+11,bitcoin,2024,1,23,1,41541.899457,41626.107110,41600.940933,40890.912208,1201.207080,1.625634e+10,1.296579e+10,41628.133312,1164.783152,2.023707e+10,8.462806e+09
3309,2024-01-24,39833.454105,2.968333e+10,7.791334e+11,bitcoin,2024,1,24,2,39504.730058,41541.899457,41626.107110,40293.361207,1093.686745,2.297296e+10,1.296093e+10,41154.626521,1116.671519,2.133139e+10,9.195775e+09


In [12]:
# Drop 'coin_name': this table covers a single currency, so the column holds one
# constant value and carries no information.
# errors='ignore' keeps this cell re-runnable: without it a second run would
# raise KeyError because the column is already gone.
df = df.drop(columns=['coin_name'], errors='ignore')

In [13]:
# Pairwise (Pearson) correlation between the numeric columns.
# Numeric dtypes are selected explicitly so the datetime 'date' column stays out
# of the matrix and the result does not depend on DataFrame.corr's
# `numeric_only` default, which differs between pandas versions.
# corr() skips NaN values pair by pair, so the leading NaNs of the lag/rolling
# columns do not have to be filled beforehand.
correlation_matrix = df.select_dtypes(include='number').corr()

In [14]:
# Show three things together: a preview of the engineered frame, the per-column
# missing-value counts captured before imputation, and each column's correlation
# with 'price' ordered from highest to lowest.
# A high correlation flags a column that moves together with the Bitcoin price;
# on its own it does not show that the column influences the price.
price_correlations = correlation_matrix['price'].sort_values(ascending=False)
(df.head(), missing_values, price_correlations)

(        date    price  total_volume    market_cap  year  month  day  weekday  \
 0 2015-01-01  313.992  4.699936e+07  4.293958e+09  2015      1    1        3   
 1 2015-01-02  314.446  3.885591e+07  4.301448e+09  2015      1    2        4   
 2 2015-01-03  286.572  1.187789e+08  3.921358e+09  2015      1    3        5   
 3 2015-01-04  260.936  2.055001e+08  3.571640e+09  2015      1    4        6   
 4 2015-01-05  273.220  1.550381e+08  3.740880e+09  2015      1    5        0   
 
    price_lag_1  price_lag_2  price_lag_3  price_rolling_mean_3  \
 0          NaN          NaN          NaN                   NaN   
 1      313.992          NaN          NaN                   NaN   
 2      314.446      313.992          NaN            305.003333   
 3      286.572      314.446      313.992            287.318000   
 4      260.936      286.572      314.446            273.576000   
 
    price_rolling_std_3  volume_rolling_mean_3  volume_rolling_std_3  \
 0                  NaN             

### ZAKLJUCAK

In [None]:
"""
1.  Kao neko ko je nov u kripto svetu, bilo mi je potrebno da detaljnije istrazim termine kako bih razumeo
    kako trziste funkcionise i time razumeo koji su faktori koji uticu na cenu. 
    Zato sam kreirao tabelu sa terminologijom.
2.  Podaci iz tabele "bitcoin.csv" su dobri za preprocess, i za razumevanje.
3.  Jedan od faktora za pracenje cene jeste vreme. Cena varira iz dana u dan, iz meseca u mesec i iz godine u godinu.
    Zato sam kreirao date objekat, kako bih iz njega izvlacio gore navedene vrednosti.
4.  Resio sam da uradim data cleaning. Zato sam proverio da li postoje vrednosti koje treba dopuniti
    ili obrisati. Ova baza je dobra, falio je samo 1 podatak u market cap koloni.
5.  Sada je bilo potrebno dodati odredjene kolone koje takodje mogu uticati na predvidjanje cene.
    Zato su dodate price lag kolone, kao i rolling prosek (mean) i rolling standardna devijacija (std) za price i volume.
6.  S obzirom na to da istrazujemo samo bitcoin, kolona sa imenom valute nam je bila suvisna.
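7.  Verovatno je potrebno dopuniti NaN vrednosti u novim kolonama; da li bi rezultat korelacije tada
    bio drugaciji? NaN vrednosti se javljaju samo u prvih nekoliko redova (najvise 6, zbog prozora od 7 dana),
    a pandas corr() ih preskace za svaki par kolona, pa bi razlika verovatno bila zanemarljiva.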
"""