In [1]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('Merged_Dataset.csv')
df.Type.unique()

array(['BTC-USD', 'DOGE-USD', 'ETH-USD', 'ADA-USD'], dtype=object)

In [3]:
# Remove leading and trailing spaces
df['Type'] = df['Type'].str.strip()

# Create a dictionary for mapping
type_mapping = {'BTC-USD': 1, 'DOGE-USD': 2, 'ETH-USD': 3, 'ADA-USD': 4}

# Apply the mapping to the 'Type' column
df['Type'] = df['Type'].map(type_mapping)

df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Type
0,13-09-2015,235.242004,235.934998,229.332001,230.511993,230.511993,1.847880e+07,1
1,14-09-2015,230.608994,232.440002,227.960999,230.643997,230.643997,2.099780e+07,1
2,15-09-2015,230.492004,259.182007,229.822006,230.304001,230.304001,1.917780e+07,1
3,16-09-2015,230.250000,231.214996,227.401993,229.091003,229.091003,2.014420e+07,1
4,17-09-2015,229.076004,230.285004,228.925995,229.809998,229.809998,1.893540e+07,1
...,...,...,...,...,...,...,...,...
8014,9/9/2021,2.406712,2.621054,2.387085,2.517666,2.517666,6.031548e+09,4
8015,9/10/2021,2.517666,2.585671,2.302802,2.385911,2.385911,6.130461e+09,4
8016,9/11/2021,2.385186,2.796062,2.375493,2.633950,2.633950,8.670620e+09,4
8017,9/12/2021,2.641341,2.783103,2.511185,2.581727,2.581727,8.111331e+09,4


In [4]:
df.drop(['Date'], axis = 1)
df.isna().sum()

Date          0
Open         12
High         12
Low          12
Close        12
Adj Close    12
Volume       12
Type          0
dtype: int64

In [5]:
# 'BTC-USD', 'DOGE-USD', 'ETH-USD', 'ADA-USD' are the type 

df_means = df.groupby("Type")[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].mean()

df_means

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,10310.91013,10603.783501,9990.894359,10330.331341,10330.331341,15650600000.0
2,0.024878,0.026871,0.022958,0.024989,0.024989,623265900.0
3,466.105352,484.240219,446.117169,467.53924,467.53924,7595310000.0
4,0.334114,0.352172,0.315527,0.335812,0.335812,1043983000.0


In [6]:
cols_to_fill = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]

for col in cols_to_fill:
    df[col] = df.groupby("Type")[col].transform(lambda x: x.fillna(x.mean()))

In [7]:
df.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
Type         0
dtype: int64

In [8]:
y = df.Volume
X = df[["Open", "High", "Low", "Close", "Adj Close", "Type"]]
# BTC-USD = 1, DOGE-USD = 2, ETH-USD = 3, ADA-USD = 4 

In [9]:
y.head()

0    18478800.0
1    20997800.0
2    19177800.0
3    20144200.0
4    18935400.0
Name: Volume, dtype: float64

In [10]:
X.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Type
0,235.242004,235.934998,229.332001,230.511993,230.511993,1
1,230.608994,232.440002,227.960999,230.643997,230.643997,1
2,230.492004,259.182007,229.822006,230.304001,230.304001,1
3,230.25,231.214996,227.401993,229.091003,229.091003,1
4,229.076004,230.285004,228.925995,229.809998,229.809998,1


In [11]:
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

model = LinearRegression()
rf = RandomForestRegressor(n_estimators=30)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 18)

X_train 

Unnamed: 0,Open,High,Low,Close,Adj Close,Type
909,9350.589844,9531.320313,8828.469727,8866.000000,8866.000000,1
1550,7277.197754,7324.156250,7195.527344,7217.427246,7217.427246,1
4393,0.896796,0.908571,0.793460,0.813610,0.813610,3
6407,1819.466309,1860.974731,1793.922363,1846.033691,1846.033691,3
2385,0.000213,0.000218,0.000212,0.000216,0.000216,2
...,...,...,...,...,...,...
1726,9800.215820,9869.237305,9663.216797,9665.533203,9665.533203,1
2885,0.001961,0.001974,0.001895,0.001937,0.001937,2
1144,6336.990234,6349.160156,6316.879883,6317.609863,6317.609863,1
4371,0.294754,0.304494,0.288541,0.296273,0.296273,2


In [12]:
X_test

Unnamed: 0,Open,High,Low,Close,Adj Close,Type
7583,0.099965,0.099965,0.094408,0.098211,0.098211,4
7850,1.190976,1.228019,1.181512,1.202541,1.202541,4
3505,0.002900,0.002906,0.002724,0.002760,0.002760,2
1544,7253.241699,7743.431641,7232.676758,7448.307617,7448.307617,1
300,666.383972,666.383972,633.398987,650.960022,650.960022,1
...,...,...,...,...,...,...
4839,8.520400,8.530890,8.345720,8.422850,8.422850,3
3462,0.001982,0.002015,0.001972,0.001998,0.001998,2
4510,1.422610,1.464690,1.305010,1.371390,1.371390,3
7508,0.034160,0.034995,0.034032,0.034700,0.034700,4


In [13]:
y_test

7583    2.621321e+08
7850    2.554597e+09
3505    3.707799e+07
1544    1.881609e+10
300     1.805360e+08
            ...     
4839    4.486190e+06
3462    1.734883e+07
4510    2.446740e+06
7508    8.634852e+07
5515    1.238780e+09
Name: Volume, Length: 1604, dtype: float64

In [14]:
model.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [15]:
model.score(X_test, y_test)
rf.score(X_test, y_test)

0.7633941494657089

In [16]:
import joblib

joblib.dump(model, 'LinearMode.joblib')
joblib.dump(rf, 'RandomForestRegressor.joblib')

['RandomForestRegressor.joblib']