# Importing Necessary Libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Importing Dataset

In [None]:
# Load the Tether price-history CSV from the Kaggle input directory and preview the first rows.
df=pd.read_csv('/kaggle/input/cryptocurrencypricehistory/coin_Tether.csv')
df.head()

# Analysing The Dataset

In [None]:
# Summary statistics for the numeric columns (one column per feature).
df.describe()

In [None]:
# Same summary, transposed so each feature is a row and each statistic a column.
df.describe().T

* The dataset shows significant variation across different metrics, particularly in volume and market capitalization. The high and low values, as well as the opening and closing values, are relatively stable, with small variations. 
* This suggests that while the price values do not fluctuate wildly, the volume and market capitalization can vary greatly, indicating periods of higher and lower trading activity.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Replace df's index (in place) with the 'Date' column parsed as datetimes.
df.index = pd.to_datetime(df['Date'])
df.index

In [None]:
# Build the modelling frame: drop 'SNo', 'Symbol' and 'Name' (presumably identifier columns — verify)
# and 'Date', which is redundant now that the parsed dates are the index.
new_df=df.drop(['SNo', 'Symbol', 'Name','Date'], axis=1)


In [None]:
# Column dtypes and non-null counts of the modelling frame.
new_df.info()

In [None]:
# Earliest timestamp in the index.
new_df.index.min()

In [None]:
# Latest timestamp in the index.
new_df.index.max()

In [None]:
# First rows of the modelling frame.
new_df.head()

In [None]:
# Last rows of the modelling frame.
new_df.tail()

# Feature selection

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Predictor columns; every remaining column of new_df except the target 'Close'.
all_features = ['High', 'Low', 'Open', 'Volume', 'Marketcap']


# X: feature matrix, y: target (closing price), both indexed by date.
X = new_df[all_features]
y = new_df['Close']  

In [None]:
# Fit a random forest on the full dataset (no train/test split at this point)
# solely to obtain feature importances.
model = RandomForestRegressor(random_state=42)

model.fit(X, y)
feature_importances = model.feature_importances_
# Positions of the features ordered from most to least important.
indices = np.argsort(feature_importances)[::-1]

In [None]:
# List the features from most to least important, numbered from 1.
print("Feature ranking:")
for rank, feature_pos in enumerate(indices[:X.shape[1]], start=1):
    print("%d. feature %s (%f)" % (rank, all_features[feature_pos], feature_importances[feature_pos]))

In [None]:
# Bar chart of the feature importances, sorted from most to least important.
plt.figure(figsize=(8,5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), feature_importances[indices], color="b", align="center")
# Label each bar with the feature name that corresponds to its sorted position.
plt.xticks(range(X.shape[1]), [all_features[i] for i in indices])
# Leave half a bar of padding on both sides.
plt.xlim([-1, X.shape[1]])
plt.ylabel("Importance")
plt.xlabel("Feature")
plt.show()

* The feature-importance ranking confirms that all of these features are important; they are the ones we already kept for training after dropping the rest.

# Visualization

In [None]:
# Line plot of the closing price against the datetime index.
new_df['Close'].plot(figsize=(8, 5))
plt.title('Closing Price Over Time')
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

> * *The plot above suggests that Tether's price stays essentially constant throughout the dataset, apart from a spike at the beginning, when the coin was first listed for trading.*

In [None]:

# Pairwise correlations between all columns of new_df (features and target)
corr_matrix = new_df.corr()

# Plot the correlation matrix as an annotated heatmap
plt.figure(figsize=(8,5))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Histogram of closing prices using 50 bins.
plt.figure(figsize=(8,5))
plt.hist(new_df['Close'], bins=50, edgecolor='black')
plt.title('Price Distribution')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

> * *The plot above suggests that the distribution of Tether's price stays essentially constant throughout the dataset.*

In [None]:
# Scatter of closing price (x) against traded volume (y); alpha makes overlapping points visible.
plt.figure(figsize=(8,5))
plt.scatter(new_df['Close'], new_df['Volume'], alpha=0.5)
plt.title('Price vs. Volume')
plt.xlabel('Price')
plt.ylabel('Volume')
plt.show()

In [None]:
# Price and volume on one figure with two y-axes, because their scales differ.
fig, ax1 = plt.subplots(figsize=(8,5))


# Left axis: closing price.
ax1.plot(new_df['Close'], color='y', label='Price')
ax1.set_xlabel('Date')
ax1.set_ylabel('Price', color='y')
ax1.tick_params('y', colors='y')


# Right axis: volume, sharing the x-axis with the price plot.
ax2 = ax1.twinx()
ax2.plot(new_df['Volume'], color='r', label='Volume')
ax2.set_ylabel('Volume', color='r')
ax2.tick_params('y', colors='r')


# Merge the handles from both axes so a single legend covers both lines.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

plt.title('Price and Volume Over Time')
plt.show()

> * *The plot above suggests that Tether's price stays constant even as the coin's trading volume keeps increasing.*

# Model Selection

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Hold out 20% of the rows as the test set, with a fixed seed for reproducibility.
# NOTE(review): train_test_split presumably shuffles rows by default, so this is a random
# rather than chronological split — confirm that is intended for date-indexed data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Size and dtypes of the training features.
X_train.info()

In [None]:
# Size and dtypes of the test features.
X_test.info()

In [None]:
# First rows of the test features.
X_test.head()

In [None]:
# Fit the scaler on the training split only, then apply the same transform to the test split,
# so test-set statistics do not influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Inspect the scaled training features.
X_train_scaled

In [None]:
# Candidate models to compare, keyed by the display name used in the results report.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Model Evaluation

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets.

    Returns
    -------
    tuple
        ``(mse, r2, mae, evs)``: mean squared error, R^2, mean absolute error
        and explained variance score of the test-set predictions.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    # Same metric order as the returned tuple documented above.
    scorers = (mean_squared_error, r2_score, mean_absolute_error, explained_variance_score)
    return tuple(score(y_test, predicted) for score in scorers)

In [None]:
# Evaluate every candidate on the scaled splits and collect its metrics by model name.
results = {}
for model_name, model in models.items():
    mse, r2, mae, evs = evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)
    results[model_name] = dict(
        zip(('MSE', 'R^2', 'MAE', 'Explained Variance Score'), (mse, r2, mae, evs))
    )

In [None]:
# Print one metrics report per model, followed by a separator line.
for model_name, metrics in results.items():
    print(
        f"Model: {model_name}",
        f"Mean Squared Error: {metrics['MSE']}",
        f"R^2 Score: {metrics['R^2']}",
        f"Mean Absolute Error: {metrics['MAE']}",
        f"Explained Variance Score: {metrics['Explained Variance Score']}",
        "-" * 30,
        sep="\n",
    )

**Random Forest performs well here, so we now tune its hyperparameters.**

# Hyperparameter Tuning On Best Model

In [None]:
# Chain scaling and the forest into one estimator; the step names ('scaler', 'rf')
# are the prefixes used for the hyperparameter keys in the grid.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

In [None]:
# Hyperparameter grid for the 'rf' step of the pipeline (keys use the '<step>__<param>' form).
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer accepted by recent scikit-learn releases and makes the
    # grid search fail; for a regressor it meant "use all features", which is 1.0.
    'rf__max_features': [1.0, 'sqrt']
}

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by negative MSE
# (higher is better, so the best score is the one closest to zero).
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

In [None]:
# Fit on the unscaled training data: scaling is handled by the pipeline's own 'scaler' step.
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters and their cross-validated score (negative MSE).
print("Best Parameters:", grid_search.best_params_)
print("Best Negative MSE:", grid_search.best_score_)

In [None]:
# Take the best pipeline found by the search and predict on the unscaled test features
# (the pipeline applies its own scaler).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Score the tuned model's test predictions with the four regression metrics.
mse, r2, mae, evs = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error, explained_variance_score)
)

In [None]:
# Report the tuned model's test-set metrics, one per line.
print(
    "\nEvaluation Metrics on Test Set:",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    f"Mean Absolute Error: {mae}",
    f"Explained Variance Score: {evs}",
    sep="\n",
)

# Saving Model

In [None]:
import joblib

# Persist the tuned pipeline (scaler + forest) to 'rn.pkl' in the current working directory.
joblib.dump(best_model,'rn.pkl')



In [None]:
# Reload the saved pipeline and predict again to check the save/load round trip.
loaded_model = joblib.load('rn.pkl')

y_pred = loaded_model.predict(X_test)

In [None]:
# Score the reloaded model's test predictions with the four regression metrics.
mse, r2, mae, evs = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error, explained_variance_score)
)

In [None]:
# Report the reloaded model's test-set metrics, one per line.
print(
    "\nEvaluation Metrics on Test Set:",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    f"Mean Absolute Error: {mae}",
    f"Explained Variance Score: {evs}",
    sep="\n",
)

# Model Training Without Python package

# Random Forest 

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from joblib import Parallel, delayed

class RandomForestRegressor:
    """Minimal bagging ensemble of ``DecisionTreeRegressor`` trees.

    Each tree is trained (optionally on a bootstrap resample of the rows) and the
    forest prediction is the mean of the per-tree predictions.

    NOTE: this class deliberately reuses the name of
    ``sklearn.ensemble.RandomForestRegressor`` imported earlier in the notebook and
    shadows it from this cell onwards; re-running earlier cells afterwards will
    pick up this implementation instead of scikit-learn's.

    Parameters
    ----------
    n_estimators : int
        Number of trees to train.
    max_depth : int or None
        Forwarded to every ``DecisionTreeRegressor``.
    min_samples_split : int
        Forwarded to every ``DecisionTreeRegressor``.
    bootstrap : bool
        If True each tree sees a with-replacement resample of the rows,
        otherwise every tree is trained on the full data.
    n_jobs : int
        Forwarded to ``joblib.Parallel`` when training the trees.
    random_state : int or None
        Base seed. Tree ``i`` draws its bootstrap sample from a private generator
        seeded with ``random_state + i``; the same ``random_state`` is forwarded
        to every tree.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, bootstrap=True, n_jobs=1, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees on ``(X, y)`` and return ``self``.

        ``X`` and ``y`` may be NumPy arrays or pandas objects; they are converted
        to arrays so that bootstrap sampling always indexes rows positionally.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must have the same number of rows, got %d and %d"
                % (X.shape[0], y.shape[0])
            )
        self.trees = Parallel(n_jobs=self.n_jobs)(
            delayed(self._train_tree)(X, y, i) for i in range(self.n_estimators)
        )
        return self

    def _train_tree(self, X, y, tree_idx):
        """Train and return the tree with index ``tree_idx``."""
        # A private generator gives the same draws as seeding the global NumPy RNG
        # with the same value, without clobbering global random state as a side effect.
        rng = None
        if self.random_state is not None:
            rng = np.random.RandomState(self.random_state + tree_idx)
        tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split, random_state=self.random_state)
        if self.bootstrap:
            X_sample, y_sample = self._bootstrap_sample(X, y, rng)
        else:
            X_sample, y_sample = X, y
        tree.fit(X_sample, y_sample)
        return tree

    def _bootstrap_sample(self, X, y, rng=None):
        """Return a with-replacement resample of the rows of ``(X, y)``.

        ``rng`` is an optional ``np.random.RandomState``; when omitted the global
        NumPy random functions are used. The result is always a pair of arrays.
        """
        # Convert first: indexing a DataFrame with an integer array selects columns, not rows.
        X = np.asarray(X)
        y = np.asarray(y)
        if rng is None:
            rng = np.random
        num_samples = X.shape[0]
        sample_indices = rng.choice(num_samples, size=num_samples, replace=True)
        return X[sample_indices], y[sample_indices]

    def predict(self, X):
        """Return the mean of the per-tree predictions for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the forest has no trained trees (``fit`` was not called or
            ``n_estimators`` was 0); averaging zero trees would yield NaN.
        """
        if not self.trees:
            raise RuntimeError("This RandomForestRegressor has no trained trees; call fit() first.")
        X = np.asarray(X)
        predictions = np.zeros((X.shape[0], len(self.trees)))
        for idx, tree in enumerate(self.trees):
            predictions[:, idx] = tree.predict(X)
        return np.mean(predictions, axis=1)

In [None]:
# Train the hand-written forest defined above on the scaled training data.
# y_train.values passes a plain array, so rows are indexed by position.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=2, bootstrap=True, n_jobs=-1, random_state=42)
rf.fit(X_train_scaled, y_train.values)


# Predict on the test features scaled with the same scaler.
y_pred = rf.predict(X_test_scaled)


# Test-set mean squared error, computed by hand.
mse = np.mean((y_test.values - y_pred) ** 2)
print("Test MSE:", mse)

In [None]:
# Score the hand-written forest's test predictions with the four regression metrics.
mse, r2, mae, evs = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error, explained_variance_score)
)

In [None]:
# Report the hand-written forest's test-set metrics, one per line.
print(
    "\nEvaluation Metrics on Test Set:",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    f"Mean Absolute Error: {mae}",
    f"Explained Variance Score: {evs}",
    sep="\n",
)

# Linear Regression

In [None]:
class LinearRegression:
    """Ordinary least squares with an intercept, solved in closed form.

    NOTE: this class reuses the name of ``sklearn.linear_model.LinearRegression``
    imported earlier in the notebook and shadows it from this cell onwards.

    Attributes
    ----------
    weights : ndarray or None
        Fitted parameters ``[intercept, coef_1, ..., coef_n]``; ``None`` before ``fit``.
    bias : float or None
        The fitted intercept (same value as ``weights[0]``); ``None`` before ``fit``.
    """

    def __init__(self):
        self.weights = None
        self.bias = None

    @staticmethod
    def _add_intercept(X):
        """Return ``X`` as a 2-D float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # Treat a 1-D input as a single feature column.
            X = X.reshape(-1, 1)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Fit the model to ``(X, y)`` and return ``self``.

        The least-squares problem is solved with ``np.linalg.lstsq`` instead of
        explicitly inverting ``X^T X``: the explicit inverse fails or loses
        precision when features are (nearly) collinear, whereas ``lstsq`` returns
        the minimum-norm solution in that case.
        """
        X_b = self._add_intercept(X)
        y = np.asarray(y, dtype=float)
        self.weights, *_ = np.linalg.lstsq(X_b, y, rcond=None)
        self.bias = self.weights[0]
        return self

    def predict(self, X):
        """Return the predictions ``intercept + X @ coefficients``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("This LinearRegression is not fitted yet; call fit() first.")
        return self._add_intercept(X) @ self.weights

In [None]:
  
    # NOTE(review): this cell is indented at top level; it presumably runs only because
    # IPython strips a cell's common leading indentation — consider dedenting it.
    # LinearRegression here is the hand-written class defined in the preceding cell.
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train.values)

   
    y_pred = lr.predict(X_test_scaled)

   
    # Test-set mean squared error, computed by hand.
    mse = np.mean((y_test.values - y_pred) ** 2)
    print("Test MSE:", mse)
    # R^2 expressed as 1 - MSE / Var(y_test).
    print("Test R^2 Score:", 1 - (mse / np.var(y_test.values)))

    
    # weights[0] is the coefficient of the prepended column of ones (the intercept);
    # the remaining entries are the per-feature coefficients.
    print("Coefficients:", lr.weights[1:])
    print("Intercept:", lr.weights[0])


# Create Test Dataframe for csv

In [None]:
#prediction_df=pd.DataFrame(y_pred,columns=['Prediction'])

In [None]:
#X_test = X_test.reset_index(drop=True)

In [None]:
#test_df = pd.concat([X_test,prediction_df], axis=1)

In [None]:
#test_df.head()

In [None]:
#test_df.to_csv('/kaggle/working/test_pred_data_M.csv',index=False)

In [None]:
#df=pd.read_csv('/kaggle/working/test_pred_data_M.csv')

In [None]:
#df.head()