In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Bitcoin CSV into `df` and print its column/dtype/non-null summary.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere until it is replaced by a configurable/relative path.
df=pd.read_csv("D:\\crypto\\coin_Bitcoin.csv")
df.info()

In [None]:
# Count missing values per column, then preview the first rows of the frame.
missing_counts = df.isna().sum()
print(missing_counts)
df.head()

In [None]:
# Descriptive statistics of the loaded frame, shown as this cell's output.
df.describe()

In [None]:

# Numeric columns that each get their own boxplot panel.
columns_to_plot = ['Open', 'High', 'Low', 'Close', 'Volume', 'Marketcap']

# One 2x3 grid of axes; each column is drawn on its own axis.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
for ax, col in zip(axes.flat, columns_to_plot):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')

fig.tight_layout()
plt.show()

In [None]:

# Quartiles of Volume and Marketcap, computed in a single quantile() call.
quartiles = df[['Volume', 'Marketcap']].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles (one pair of bounds per column).
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when either column falls outside its own fences.
# Comparisons against NaN are False, so rows with missing values are kept.
below_fence = df[['Volume', 'Marketcap']] < lower_bound
above_fence = df[['Volume', 'Marketcap']] > upper_bound
outlier_rows = (below_fence | above_fence).any(axis=1)
df_no_outliers = df[~outlier_rows]

# Report how many rows the filter removed.
print("Original DataFrame shape:", df.shape)
print("DataFrame shape after removing outliers:", df_no_outliers.shape)

In [None]:
features = ['SNo','Name','Date','High','Low','Close','Volume']
target = 'Marketcap'

# Keep only rows where every feature column and the target are present.
required_columns = features + [target]
df_filtered = df.dropna(subset=required_columns)

# Quartiles and 1.5 * IQR fences of the target column (Marketcap).
Q1, Q3 = df_filtered[target].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() includes both endpoints, i.e. lower_bound <= value <= upper_bound.
inside_fences = df_filtered[target].between(lower_bound, upper_bound)
df_filtered_no_price_outliers = df_filtered[inside_fences]

# Row counts before and after the target-based filter.
print(f"Rows before outlier removal: {df_filtered.shape[0]}")
print(f"Rows after removing bitcoin outliers: {df_filtered_no_price_outliers.shape[0]}")


In [None]:
import pandas as pd
from scipy import stats
import numpy as np

# Numeric columns used for the z-score outlier filter.
features = ['High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']

# `df_filtered` was produced by dropna() in an earlier cell. Take an explicit
# copy before assigning columns so the write targets an independent frame and
# pandas does not raise SettingWithCopyWarning.
df_filtered = df_filtered.copy()

# Coerce the columns to numeric; values that cannot be parsed become NaN.
df_filtered[features] = df_filtered[features].apply(pd.to_numeric, errors='coerce')

# Drop rows left with NaN in any of the selected columns after the conversion.
df_filtered = df_filtered.dropna(subset=features)

# Absolute z-score of every value, computed per column.
z_scores = np.abs(stats.zscore(df_filtered[features]))

# Keep only rows whose z-score is below 3 in every selected column.
# NOTE(review): a constant column has zero variance, so its z-scores are NaN
# and this filter would then discard every row.
df_filtered_no_outliers = df_filtered[(z_scores < 3).all(axis=1)]

# Report how many rows survived the filter.
print(f"Remaining rows after removing outliers: {len(df_filtered_no_outliers)}")

# Preview the cleaned data (last expression, so the notebook renders it richly).
df_filtered_no_outliers.head()


In [None]:
# Columns passed to remove_outliers_iqr() and plotted before/after filtering below.
columns = ['High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']

# Function to remove outliers using IQR for multiple columns
def remove_outliers_iqr(data, columns, factor=1.5):
    """Drop rows that fall outside the IQR fences of each listed column.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns. The result can
    therefore depend on the order of ``columns``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Names of the numeric columns to filter on.
    factor : float, default 1.5
        Multiple of the IQR added beyond Q1/Q3 to form the fences.

    Returns
    -------
    pandas.DataFrame
        Rows whose value lies in ``[Q1 - factor*IQR, Q3 + factor*IQR]`` for
        every listed column, with the original index labels preserved. Rows
        holding NaN in a listed column are dropped, because comparisons with
        NaN are false.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``data``.
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    # Materialise once so a generator argument is not consumed by the check below.
    columns = list(columns)
    missing = [name for name in columns if name not in data.columns]
    if missing:
        raise KeyError(f"columns not found in data: {missing}")

    for column in columns:
        q1, q3 = data[column].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # between() is inclusive on both ends, matching `>= lower & <= upper`.
        data = data[data[column].between(lower_bound, upper_bound)]
    return data

# Apply the IQR filter to the raw frame `df`.
# NOTE(review): this rebinds `df_filtered`, which earlier cells assigned to a
# different (dropna / numeric-coerced) frame; those earlier results are replaced.
df_filtered = remove_outliers_iqr(df, columns)

# Plot the data before and after outlier removal
def plot_data_before_after(df_original, df_filtered, column):
    """Plot ``column`` against ``Date`` for two frames in side-by-side panels.

    The left panel shows ``df_original`` and the right panel ``df_filtered``.
    The ``Date`` column of each frame is parsed with ``pandas.to_datetime``
    before plotting, so the x-axis is a real time axis. Plotting date strings
    directly makes matplotlib treat every value as a separate category.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Frame before outlier removal; must contain ``'Date'`` and ``column``.
    df_filtered : pandas.DataFrame
        Frame after outlier removal; must contain ``'Date'`` and ``column``.
    column : str
        Name of the column drawn on the y-axis.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If ``'Date'`` or ``column`` is missing from either frame.
    """
    # (frame, line style, legend label, panel title) for each of the two panels.
    panels = (
        (df_original, 'b-', 'Original Data', f'{column} Before Outlier Removal'),
        (df_filtered, 'g-', 'Filtered Data', f'{column} After Outlier Removal'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for ax, (frame, line_style, label, title) in zip(axes, panels):
        # Already-parsed datetime values pass through to_datetime unchanged.
        dates = pd.to_datetime(frame['Date'])
        ax.plot(dates, frame[column], line_style, label=label)
        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel(column)

    # Slant the date tick labels so neighbouring labels do not overlap.
    fig.autofmt_xdate()
    fig.tight_layout()
    plt.show()

# Draw one before/after figure per column in `columns`, comparing the raw
# frame `df` with the IQR-filtered `df_filtered` produced above.
for col in columns:
    plot_data_before_after(df, df_filtered, col)


In [None]:
# heat map
import seaborn as sns
import matplotlib.pyplot as plt

# Columns for the correlation matrix. `target` can already be present in
# `features` (it is for the assignments of these names in this notebook), and
# selecting it twice would duplicate a row and a column of the heatmap.
# dict.fromkeys() removes repeats while keeping the original order.
heatmap_columns = list(dict.fromkeys(features + [target]))

# Pairwise correlation of the selected columns on the z-score-filtered frame.
corr_matrix = df_filtered_no_outliers[heatmap_columns].corr()

# Plot the heatmap with each cell annotated by its correlation value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Features and Price')
plt.show()

In [None]:
# Linear regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Column names used by the model.
features = ['High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']
target = 'Close'  # Adjust the target column to your actual target variable

# The target must not also be a predictor, otherwise the model is handed the
# value it is supposed to predict (target leakage).
predictors = [col for col in features if col != target]

# Build X and y from the z-score-filtered frame. The column-name list/string
# themselves are not data and cannot be split or fitted.
X = df_filtered_no_outliers[predictors]
y = df_filtered_no_outliers[target]

# Split the data into training and testing sets (80/20, fixed seed).
# NOTE(review): this is a shuffled split; if the rows form a time series, a
# chronological split may be more appropriate - confirm the intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

# Actual vs predicted values; the dashed diagonal marks perfect predictions.
plt.figure(figsize=(18,10))
plt.scatter(y_test, y_pred)
plt.plot([y_test.min(),y_test.max()],[y_test.min(),y_test.max()],'k--',lw=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Linear Regression: Actual vs Predicted Prices')
plt.show()