# 1. Project Setup # Objective: Predict future Bitcoin prices using historical data and machine learning models.

# 2. Data Collection

In [None]:
import pandas as pd
import numpy as np
# Load the dataset from the CSV export.
df = pd.read_csv("D:\\csss\\coinmarketcap_06122017.csv")

# A bare `df.head()` in the middle of a cell is evaluated and thrown away
# (only a cell's last expression is displayed), so print the preview.
print(df.head())

# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Display basic statistics for numerical columns
print("\nStatistical Summary:")
print(df.describe())

In [None]:
# Count the missing entries in every column, then report them.
null_counts = df.isna().sum()
print("sum of null", null_counts)

# Handle the attributes with missing values (imputed during preprocessing)

# 3. Data Preprocessing

In [None]:
import numpy as np
import pandas as pd

# Re-read the CSV so this cell starts from the raw, unfilled data
df = pd.read_csv("D:\\csss\\coinmarketcap_06122017.csv")


def _first_mode(values):
    """Return the first entry of the column's mode()."""
    return values.mode()[0]


def _impute_and_report(frame, column, label, pick_fill_value, lead="\n"):
    """Fill the NaNs of ``frame[column]`` in place, printing every step.

    ``pick_fill_value`` maps the column to the replacement value and
    ``label`` names that statistic in the printed messages. The first seven
    rows are shown before and after filling. ``lead`` is prepended to the
    opening message. Returns the replacement value.
    """
    print(f"{lead}Replacing missing values with {label.lower()} in '{column}'")
    print(f"Before Replacing {column}:\n", frame[column].head(7))
    fill_value = pick_fill_value(frame[column])
    print(f"\n{label} of '{column}' column:", fill_value)
    frame[column] = frame[column].fillna(fill_value)
    print(f"\nAfter Replacing with {label}:\n", frame[column].head(7))
    return fill_value


def _impute_mode_briefly(frame, column):
    """Fill the NaNs of ``frame[column]`` with its mode, printing only the mode."""
    most_common = _first_mode(frame[column])
    print(f"\nMode of '{column}':", most_common)
    frame[column] = frame[column].fillna(most_common)
    return most_common


# Null counts per column before any filling
print("Sum of null values before filling:\n", df.isna().sum())

# One statistic per column: mean, median, then mode for the remaining ones
mean_value = _impute_and_report(
    df, 'max_supply', "Mean", lambda values: values.mean(), lead=""
)
median_value = _impute_and_report(
    df, 'available_supply', "Median", lambda values: values.median()
)
mode_value = _impute_and_report(df, 'market_cap_usd', "Mode", _first_mode)
mode_value_total_supply = _impute_and_report(df, 'total_supply', "Mode", _first_mode)

# The percent-change columns are mode-filled without the before/after preview
percent_change_7d_mode = _impute_mode_briefly(df, 'percent_change_7d')
percent_change_24h_mode = _impute_mode_briefly(df, 'percent_change_24h')

# Whatever is still NaN in any column becomes the sentinel -1
df = df.replace({np.nan: -1})

# Null counts per column after filling
print("\nSum of null values after filling:\n", df.isna().sum())

# Rolling means of 'price_usd' over 7 and 30 consecutive rows
# (the column names assume one row per day -- TODO confirm against the data)
for column_name, window_size in (('7_day_moving_avg', 7), ('30_day_moving_avg', 30)):
    df[column_name] = df['price_usd'].rolling(window=window_size).mean()

# Price momentum: the current price minus the price 7 rows earlier
df['price_momentum_7d'] = df['price_usd'].sub(df['price_usd'].shift(7))

# Calculate the Relative Strength Index (RSI)
def calculate_rsi(series, period=14):
    """Return the Relative Strength Index (RSI) of *series*.

    Gains and losses are averaged with simple (unweighted) rolling means
    over ``period`` consecutive changes, and combined as
    ``RSI = 100 - 100 / (1 + mean_gain / mean_loss)``.

    Args:
        series: pandas Series of numeric values, in row order.
        period: Number of consecutive changes averaged per window.

    Returns:
        A Series aligned with *series*. The first ``period`` entries are NaN
        because a full window needs ``period`` real changes, i.e.
        ``period + 1`` values. A window with gains and no losses yields 100,
        one with losses and no gains yields 0, and one with no change at all
        yields NaN (0 / 0). A NaN change makes every window containing it NaN.
    """
    delta = series.diff(1)
    # clip() keeps the NaN that diff() produces for the first row, so that
    # undefined change is not counted as a real "zero change" in the first
    # window (where(..., 0) would turn it into 0 and report an RSI one row
    # early, built from only period - 1 real changes).
    gain = delta.clip(lower=0).rolling(window=period).mean()
    loss = (-delta).clip(lower=0).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

# RSI of 'price_usd' with a 14-row window
df['RSI_14'] = calculate_rsi(df['price_usd'], period=14)

# Lagged copies of 'price_usd', shifted down by 1 and by 7 rows
for lag_column, lag_rows in (('price_usd_lag1', 1), ('price_usd_lag7', 7)):
    df[lag_column] = df['price_usd'].shift(lag_rows)

# Spread between the short (7) and long (30) moving averages
df['moving_avg_diff'] = df['7_day_moving_avg'].sub(df['30_day_moving_avg'])


# 4. Exploratory Data Analysis (EDA)

In [None]:
# Comparing the original price with the 7-day and 30-day moving averages to smooth out noise and identify underlying trends.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt

# Interpret 'last_updated' as Unix seconds and store it as a datetime column
df['date'] = pd.to_datetime(df['last_updated'], unit='s')

# Recompute both rolling means so this cell does not depend on earlier ones
for column_name, window_size in (('7_day_moving_avg', 7), ('30_day_moving_avg', 30)):
    df[column_name] = df['price_usd'].rolling(window=window_size).mean()

# One figure, three lines: the raw price and its two rolling means
fig, ax = plt.subplots(figsize=(14, 7))
for column_name, legend_label, line_color in (
    ('price_usd', 'Price USD', 'blue'),
    ('7_day_moving_avg', '7-Day Moving Average', 'red'),
    ('30_day_moving_avg', '30-Day Moving Average', 'green'),
):
    ax.plot(df['date'], df[column_name], label=legend_label, color=line_color)
ax.set_title('Historical Price and Moving Averages')
ax.set_xlabel('Date')
ax.set_ylabel('Price USD')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select the numeric columns once; every step below works on this selection.
numeric_df = df.select_dtypes(include=np.number)

# Get the number of numeric columns
num_columns = len(numeric_df.columns)

# Four subplots per row: ceiling division, with at least one row so the
# figure height is never zero.
num_rows = max(1, -(-num_columns // 4))

# Step 1: Show Outliers using Boxplots
plt.figure(figsize=(16, num_rows * 4))
for i, column in enumerate(numeric_df.columns, start=1):
    plt.subplot(num_rows, 4, i)  # subplot positions are 1-based
    sns.boxplot(data=numeric_df, x=column)
    plt.title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()

# Step 2: Show Distribution and Frequency Distribution using Histograms and KDE
plt.figure(figsize=(16, num_rows * 4))
for i, column in enumerate(numeric_df.columns, start=1):
    plt.subplot(num_rows, 4, i)
    sns.histplot(numeric_df[column], kde=True)
    plt.title(f'Distribution of {column}')
plt.tight_layout()
plt.show()

# Step 3: Show Correlation Matrix
# Correlate the numeric selection, not the whole frame: since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when it meets one (e.g. a datetime or text column).
correlation_matrix = numeric_df.corr()
print("Correlation Matrix:\n", correlation_matrix)

# Step 4: Create Scatter Plots for Pairwise Relationships
sns.pairplot(numeric_df)
plt.show()

# Step 5: Create a Heatmap for Correlation Matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Heatmap of Correlation Matrix')
plt.show()


# 1. Preprocessing the Data
The first step is to prepare the data for training, ensuring the input features are properly scaled for SVR.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns fed to the regressors as inputs
feature_columns = [
    '24h_volume_usd',
    'available_supply',
    'market_cap_usd',
    'max_supply',
    'percent_change_1h',
    'percent_change_24h',
    'percent_change_7d',
    'price_btc',
    'total_supply',
]
X = df[feature_columns]

# The value to predict
y = df['price_usd']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only
# and then applied unchanged to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# 2. Support Vector Regression (SVR)
Support Vector Regression (SVR) is sensitive to feature scaling, so we scaled the data in the previous step. Now, we train the SVR model.

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Fit an RBF-kernel SVR on the standardized training features
svr_model = SVR(kernel='rbf').fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred_svr = svr_model.predict(X_test_scaled)

# Score the predictions against the true test targets
mse_svr, r2_svr = (
    metric(y_test, y_pred_svr) for metric in (mean_squared_error, r2_score)
)

print("SVR Model Performance:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse_svr),
    ("R-squared (R2):", r2_svr),
):
    print(metric_label, metric_value)


# 3. Random Decision Trees (ExtraTreesRegressor)
For Random Decision Trees, you can use ExtraTreesRegressor, which is a type of ensemble learning model that creates multiple random decision trees.

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit 100 extremely-randomized trees on the unscaled training features
# (the scaled copies are not used for this model)
etr_model = ExtraTreesRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred_etr = etr_model.predict(X_test)

# Score the predictions against the true test targets
mse_etr, r2_etr = (
    metric(y_test, y_pred_etr) for metric in (mean_squared_error, r2_score)
)

print("Random Decision Trees (ExtraTreesRegressor) Performance:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse_etr),
    ("R-squared (R2):", r2_etr),
):
    print(metric_label, metric_value)
