In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, f1_score

In [17]:
data_train = pd.read_csv(r"C:\Users\pc\Desktop\train.csv")
data_test = pd.read_csv(r"C:\Users\pc\Desktop\test.csv")

In [18]:
# Create sequences
def create_sequences(data, target, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i + window_size])
        y.append(target[i + window_size])  # Target should correspond to the last element in the current window
    return np.array(X), np.array(y)

In [19]:
def create_test_sequences(data, window_size):
    X_seq = []
    for i in range(len(data) - window_size):
        X_seq.append(data[i:i + window_size])
    return np.array(X_seq)

In [20]:
data_train['datetime'] = pd.to_datetime(data_train['timestamp'], unit='s')
data_train['hour'] = data_train['datetime'].dt.hour
data_train['minute'] = data_train['datetime'].dt.minute
data_train['second'] = data_train['datetime'].dt.second
data_train['day'] = data_train['datetime'].dt.day
data_train['month'] = data_train['datetime'].dt.month
data_train['year'] = data_train['datetime'].dt.year

In [21]:
data_test['datetime'] = pd.to_datetime(data_test['timestamp'], unit='s')
data_test['hour'] = data_test['datetime'].dt.hour
data_test['minute'] = data_test['datetime'].dt.minute
data_test['second'] = data_test['datetime'].dt.second
data_test['day'] = data_test['datetime'].dt.day
data_test['month'] = data_test['datetime'].dt.month
data_test['year'] = data_test['datetime'].dt.year

In [22]:
data_train.drop(columns=['timestamp' ,'datetime'] , axis=1 , inplace=True)
data_test.drop(columns=['timestamp' ,'datetime' , 'row_id'] , axis=1 , inplace=True)

In [23]:
selected_features = ['volume', 'quote_asset_volume', 'number_of_trades',
       'taker_buy_base_volume', 'taker_buy_quote_volume', 'hour', 'minute']

# Prepare data for modeling
X = data_train[selected_features]
y = data_train['target']

In [24]:
# Create sequences
window_size = 40
X, y = create_sequences(X.values, y.values, window_size)  



# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# check
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

# Normalization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)  
 


X_train shape: (1697918, 40, 7)
y_train shape: (1697918,)
X_test shape: (424480, 40, 7)
y_test shape: (424480,)


In [25]:
X_test_features = data_test[selected_features]

# Generate sequences for the test set
X_test_seq = create_test_sequences(X_test_features.values, window_size)

# Normalize test sequences
X_test_seq = scaler.transform(X_test_seq.reshape(-1, X_test_seq.shape[-1])).reshape(X_test_seq.shape)


In [26]:
model = Sequential([
    Conv1D(filters=128, kernel_size=5, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    MaxPooling1D(pool_size=2),
    Dropout(0.1),  # Lower dropout

    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.1),
    LSTM(32),

    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  
])

model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
model.summary()

In [28]:
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,  
    batch_size=32,  
    callbacks=[early_stopping, model_checkpoint],
    verbose=1
)

Epoch 1/100
[1m53060/53060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m618s[0m 11ms/step - accuracy: 0.5248 - loss: 0.6918 - val_accuracy: 0.5248 - val_loss: 0.6916
Epoch 2/100
[1m35612/53060[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m3:09[0m 11ms/step - accuracy: 0.5244 - loss: 0.6917

KeyboardInterrupt: 

In [33]:
# Load the best model weights
model.load_weights('best_model.keras')

In [35]:
# Evaluate the model on test data
y_pred = model.predict(X_test)
y_pred = np.round(y_pred).flatten()  

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

[1m13265/13265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 4ms/step
Accuracy: 0.5248162457595176
F1 Score: 0.0


In [36]:
# Make predictions on the test data
y_test_pred = model.predict(X_test_seq)

# Convert probabilities to binary predictions
y_test_pred_classes = np.round(y_test_pred).flatten()


# Step 1: Create a DataFrame with predictions
predictions_df = pd.DataFrame(y_test_pred_classes, columns=['Predicted_Class'])

# Step 2: Save to CSV file
predictions_df.to_csv('submission.csv', index=False)
print("Predicted classes saved to submission.csv")

[1m28425/28425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 4ms/step
Predicted classes saved to submission.csv
