# Bitcoin Price Prediction Project
## Muhammad Faisal Kamran 2023-BS-AI-025

# Data Preprocessing Steps

### We perform the following preprocessing steps on our data
1. Reading Data and Exploring Data
2. Cleansing Data
3. Outlier Detection and Removal
4. Data Transformation (Normalize Data / Rescale Data) 
5. Dimensionality Reduction (PCA)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so pandas / scikit-learn warnings
# raised by later cells are hidden as well — consider narrowing it to the
# specific category that is known to be noisy.
warnings.filterwarnings('ignore')

# 1: Reading and Exploring Data

In [2]:
# Load the raw daily Bitcoin price history (Date, Open, High, Low, Close,
# Adj Close, Volume) from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Bitcoin.csv')

In [3]:
# Peek at the five earliest rows to check that the columns parsed as expected.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2014-09-17,465.864014,468.174011,452.421997,457.334015,457.334015,21056800
1,2014-09-18,456.859985,456.859985,413.104004,424.440002,424.440002,34483200
2,2014-09-19,424.102997,427.834991,384.532013,394.79599,394.79599,37919700
3,2014-09-20,394.673004,423.29599,389.882996,408.903992,408.903992,36863600
4,2014-09-21,408.084991,412.425995,393.181,398.821014,398.821014,26580100


In [4]:
# Peek at the five most recent rows to see where the series ends.
data.tail(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
2708,2022-02-15,42586.464844,44667.21875,42491.035156,44575.203125,44575.203125,22721659051
2709,2022-02-16,44578.277344,44578.277344,43456.691406,43961.859375,43961.859375,19792547657
2710,2022-02-17,43937.070313,44132.972656,40249.371094,40538.011719,40538.011719,26246662813
2711,2022-02-18,40552.132813,40929.152344,39637.617188,40030.976563,40030.976563,23310007704
2712,2022-02-19,40022.132813,40246.027344,40010.867188,40126.429688,40126.429688,22263900160


In [5]:
# (rows, columns) of the frame as loaded, before any cleaning is applied.
data.shape

(2713, 7)

In [6]:
# Show ten randomly chosen rows from across the whole date range.
# The seed is fixed so that Restart & Run All displays the same rows every
# time; re-run the cell once to refresh the saved output.
data.sample(10, random_state=42)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1404,2018-07-22,7417.799805,7537.950195,7383.819824,7418.490234,7418.490234,3695460096
1362,2018-06-10,7499.549805,7499.549805,6709.069824,6786.02002,6786.02002,5804839936
2329,2021-02-01,33114.578125,34638.214844,32384.228516,33537.175781,33537.175781,61400400660
2154,2020-08-10,11662.256836,12045.140625,11662.256836,11878.111328,11878.111328,26114112569
787,2016-11-12,716.752014,717.14801,704.034973,705.054016,705.054016,64622500
2702,2022-02-09,44096.703125,44727.800781,43232.96875,44338.796875,44338.796875,23245887300
1223,2018-01-22,11633.099609,11966.400391,10240.200195,10931.400391,10931.400391,10537400320
1167,2017-11-27,9352.719727,9818.349609,9352.719727,9818.349609,9818.349609,5653320192
84,2014-12-10,352.204987,352.384003,346.36499,346.36499,346.36499,16427700
2478,2021-06-30,35908.386719,36074.757813,34086.152344,35040.835938,35040.835938,34059036099


In [7]:
# Column dtypes and non-null counts. Date is still a plain object (string)
# column at this point; it is parsed to datetime in the cleansing step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2713 entries, 0 to 2712
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2713 non-null   object 
 1   Open       2713 non-null   float64
 2   High       2713 non-null   float64
 3   Low        2713 non-null   float64
 4   Close      2713 non-null   float64
 5   Adj Close  2713 non-null   float64
 6   Volume     2713 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 148.5+ KB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; used to judge the scale and spread of each feature.
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,2713.0,2713.0,2713.0,2713.0,2713.0,2713.0
mean,11311.041069,11614.292482,10975.555057,11323.914637,11323.914637,14704620000.0
std,16106.428891,16537.390649,15608.57256,16110.36501,16110.36501,20016270000.0
min,176.897003,211.731003,171.509995,178.102997,178.102997,5914570.0
25%,606.396973,609.260986,604.109985,606.718994,606.718994,79910800.0
50%,6301.569824,6434.617676,6214.220215,6317.609863,6317.609863,5098183000.0
75%,10452.399414,10762.644531,10202.387695,10462.259766,10462.259766,24569920000.0
max,67549.734375,68789.625,66382.0625,67566.828125,67566.828125,350967900000.0


# 2: Cleansing Data

In [9]:
# Parse the date strings into real timestamps so that sorting is
# chronological rather than lexicographic.
data['Date'] = pd.to_datetime(data['Date'])

# Remove bad rows first, then order and re-index. Dropping rows *after*
# reset_index would leave holes in the index whenever a row is removed.
data = (
    data
    .drop_duplicates()        # exact duplicate rows only
    .dropna()                 # rows with any missing value (none in the file as loaded)
    .sort_values('Date')
    .reset_index(drop=True)   # contiguous 0..n-1 index on the cleaned frame
)

# 3: Outlier Detection and Removal

In [10]:
# A row is kept only if every numeric column lies strictly within
# Z_THRESHOLD standard deviations of that column's mean.
Z_THRESHOLD = 3

numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
z_scores = np.abs(stats.zscore(data[numeric_cols]))
within_threshold = (z_scores < Z_THRESHOLD).all(axis=1)

# NOTE(review): describe() reports mean ~11.3k and std ~16.1k for the price
# columns, so this cut-off sits near 60k while the maximum is ~67.5k. The
# rows removed are therefore presumably genuine market highs rather than
# data errors, and removing them leaves gaps in the daily series — confirm
# this is intended for a price-prediction task.

# .copy() makes the filtered result an independent frame; the scaling cell
# assigns columns back into `data`, which would otherwise be a write into a
# slice of the original frame.
data = data[within_threshold].copy()

# 4: Data Transformation (Normalize Data / Rescale Data) 

In [11]:
# Standardise every numeric column (zero mean, unit variance) and write the
# result back into the same columns of `data`.
# NOTE(review): the scaler is fitted on the whole frame; if a train/test
# split follows later in the notebook, fit on the training part only.
scaler = StandardScaler()
scaled_values = scaler.fit(data[numeric_cols]).transform(data[numeric_cols])
data[numeric_cols] = scaled_values


# 5: Dimensionality Reduction

In [12]:
# A fractional n_components asks PCA to keep just enough principal
# components to explain that share (here 95%) of the total variance.
pca = PCA(n_components=0.95)
numeric_block = data[numeric_cols]
data_pca = pca.fit_transform(numeric_block)