In [38]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.callbacks import EarlyStopping



In [39]:
# Create a directory to store images and CSV file
output_folder = 'stock_visualizations'
os.makedirs(output_folder, exist_ok=True)



In [40]:
# Load and preprocess the CSV file
df = pd.read_csv('MA-Equities-CM-volume-27-Jul-2024.csv')
df.columns = df.columns.str.strip().str.replace('\n', '').str.replace(' ', '_')

def clean_column(df, column_name, numeric_format=False):
    if numeric_format:
        df[column_name] = df[column_name].replace({'-': np.nan}, regex=True)
        df[column_name] = df[column_name].replace({',': ''}, regex=True)  # Remove commas
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
df.head()


Unnamed: 0,SYMBOL,OPEN,HIGH,LOW,PREV._CLOSE,LTP,%CHNG,VOLUME_(Shares),VALUE,CA
0,GTLINFRA,3.2,3.27,2.96,3.12,2.96,-5.13,1042272779,3293581981.64,21-Sep-2015
1,IDEA,15.42,16.14,15.4,15.18,15.98,5.27,746610397,11841240896.42,07-Jul-2023
2,HCC,53.65,57.5,53.01,53.34,55.34,3.75,122602666,6813030149.62,15-Mar-2024
3,YESBANK,24.67,25.11,24.65,24.62,24.94,1.3,120894253,3011475842.23,03-Jun-2019
4,SJVN,156.53,159.65,147.5,141.04,148.0,4.93,105299846,16139307396.42,21-Feb-2024


In [41]:
# Clean numeric columns
clean_column(df, '%CHNG')
clean_column(df, 'VALUE', numeric_format=True)
clean_column(df, 'VOLUME_(Shares)', numeric_format=True)



In [42]:
# Drop rows with NaN values
df.dropna(subset=['%CHNG', 'VALUE', 'VOLUME_(Shares)'], inplace=True)

# Drop or encode non-numeric columns
stock_names = df['SYMBOL'].unique() if 'SYMBOL' in df.columns else ['Unknown Stock']
df.drop(columns=['SYMBOL'], inplace=True, errors='ignore')
non_numeric_columns = df.select_dtypes(include=[object]).columns

for col in non_numeric_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))


In [43]:
# Scaling numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('%CHNG', axis=1))
X = pd.DataFrame(scaled_features, columns=df.columns.difference(['%CHNG']))
y = df['%CHNG']
print(X,y)

          CA      HIGH       LOW       LTP      OPEN  PREV._CLOSE     VALUE  \
0  -1.347150 -1.347804 -1.334231 -1.347017 -1.335891     3.428593 -0.892235   
1  -1.146958 -1.145882 -1.125370 -1.143491 -1.123600     2.275655  1.211878   
2  -0.520664 -0.496972 -0.493919 -0.499498 -0.481837    -0.157669 -0.025879   
3  -0.995422 -1.005149 -0.970068 -0.984180 -0.977508    -0.164331 -0.961679   
4   1.164746  1.105693  1.092516  0.980538  1.028980    -0.225141  2.269901   
5   0.705551  0.668745  0.380643  0.640822  0.358031    -0.226581  1.221218   
6   0.386096  0.621520  0.414222  0.203562  0.494014    -0.286043  0.889659   
7   0.257495  0.273374  0.269161  0.312581  0.316453    -0.295509  0.500307   
8   0.004389  0.003518 -0.013069  0.031256 -0.024972    -0.318199 -0.002048   
9   0.295993  0.285926  0.353780  0.321019  0.320530    -0.346609  0.221788   
10 -1.110426 -1.122191 -1.103376 -1.115982 -1.109252    -0.355151 -1.397428   
11 -0.929402 -0.892657 -0.902071 -0.915663 -0.871037

In [44]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest Regressor with Grid Search
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
print(f"\nStock: {stock_names[0]}")
print("Best Random Forest Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Best Random Forest R^2 Score:", r2_score(y_test, y_pred))



Fitting 3 folds for each of 12 candidates, totalling 36 fits

Stock: GTLINFRA
Best Random Forest Mean Squared Error: 63.84072052666667
Best Random Forest R^2 Score: -0.3450745528466519


In [45]:
# Prepare data for TimeseriesGenerator
sequence_length = 10  # Define an initial sequence length
data = df[['%CHNG']].values
data


array([[ -5.13],
       [  5.27],
       [  3.75],
       [  1.3 ],
       [  4.93],
       [-11.63],
       [  5.68],
       [  2.8 ],
       [ -1.69],
       [  2.54],
       [  0.3 ],
       [  9.73],
       [  1.02],
       [  4.46],
       [  1.74],
       [  0.05],
       [ -0.11],
       [ -3.08],
       [  6.05]])

In [46]:
# Visualizations
sns.pairplot(df.select_dtypes(include=[float, int]))
plt.savefig(os.path.join(output_folder, 'pairplot.png'))
plt.close()

plt.figure(figsize=(10, 8))
sns.heatmap(df.select_dtypes(include=[float, int]).corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.savefig(os.path.join(output_folder, 'heatmap.png'))
plt.close()



In [47]:
print(f"Length of data: {len(data)}")
sequence_length = min(len(data) // 3, 10)  # Example: use one-third of the data length or 10, whichever is smaller
print(f"Sequence length: {sequence_length}")


Length of data: 19
Sequence length: 6


In [48]:
# Time series cross-validator
tscv = TimeSeriesSplit(n_splits=3)  # Adjust the number of splits based on data size

# Initialize lists to store history and validation losses
history_list = []
val_losses = []

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

for fold, (train_index, val_index) in enumerate(tscv.split(data)):
    print(f"\nFold {fold + 1}:")

    train_data, val_data = data[train_index], data[val_index]

    # Update sequence_length if data is too small
    if len(train_data) <= sequence_length or len(val_data) <= sequence_length:
        sequence_length = min(len(train_data), len(val_data)) - 1
        if sequence_length <= 0:
            print(f"Skipping fold {fold + 1} due to insufficient data.")
            continue  # Skip this fold if sequence_length is not valid

    # Create TimeseriesGenerators
    train_generator = TimeseriesGenerator(train_data, train_data, length=sequence_length, batch_size=1)
    val_generator = TimeseriesGenerator(val_data, val_data, length=sequence_length, batch_size=1)

    print(f"Train generator size: {len(train_generator)}")
    print(f"Validation generator size: {len(val_generator)}")

    # Define and compile the LSTM model
    lstm_model = Sequential()
    lstm_model.add(LSTM(50, activation='relu', input_shape=(sequence_length, 1),dropout=0.2))
    lstm_model.add(Dense(1))
    lstm_model.compile(optimizer='adam', loss='mse')

    # Train the LSTM model
    print("Starting model training...")
    history = lstm_model.fit(train_generator, epochs=20, validation_data=val_generator, callbacks=[early_stopping], verbose=1)

    # Store training history and validation loss
    history_list.append(history.history)
    val_losses.append(history.history['val_loss'][-1])

    # Print final loss values
    print(f"Final training loss for fold {fold + 1}: {history.history['loss'][-1]}")
    print(f"Final validation loss for fold {fold + 1}: {history.history['val_loss'][-1]}")





Fold 1:
Train generator size: 4
Validation generator size: 1
Starting model training...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Final training loss for fold 1: 46.53571319580078
Final validation loss for fold 1: 0.15260127186775208

Fold 2:
Train generator size: 8
Validation generator size: 1
Starting model training...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Final training loss for fold 2: 22.600248336791992
Final validation loss for fold 2: 3.7885756492614746

Fold 3:
Train generator size: 12
Validation generator size: 1
Starting model training...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Final training loss for

In [49]:
# Print average validation loss across folds, if any folds were processed
if val_losses:
    average_val_loss = np.mean(val_losses)
    print(f"\nAverage validation loss across folds: {average_val_loss}")
else:
    print("\nNo valid folds processed.")

# Plot training and validation losses, if any history was collected
if history_list:
    plt.figure(figsize=(12, 6))
    for fold, hist in enumerate(history_list):
        plt.plot(hist['loss'], label=f'Train Loss Fold {fold + 1}')
        plt.plot(hist['val_loss'], label=f'Val Loss Fold {fold + 1}')

    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(os.path.join(output_folder, 'training_validation_loss.png'))
    plt.close()
else:
    print("No training history to plot.")




Average validation loss across folds: 10.80844991405805


In [50]:
# Predict future values using the best model
future_steps = 10  # Predict 10 future steps

# Prepare a DataFrame to store predictions
all_predictions = []

for stock_name in stock_names:
    # Prepare the data for prediction
    last_sequence = data[-sequence_length:]  # Last sequence of data to base future predictions on

    # Create TimeseriesGenerators for future predictions
    future_data = np.concatenate([data, np.zeros((future_steps, 1))])  # Extend data to hold future steps
    future_generator = TimeseriesGenerator(future_data, future_data, length=sequence_length, batch_size=1)

    # Predict future values
    predictions = []
    for i in range(future_steps):
        pred = lstm_model.predict(future_generator)[-1, 0]
        predictions.append(pred)
        # Append the prediction to the future_data for the next prediction
        future_data = np.concatenate([future_data, np.array([[pred]])])
        future_generator = TimeseriesGenerator(future_data, future_data, length=sequence_length, batch_size=1)

    # Store predictions in DataFrame
    for day, pred in enumerate(predictions, start=1):
        all_predictions.append({'Stock': stock_name, 'Day': day, 'Predicted_%CHNG': pred})

    # Plot current and future predicted values
    plt.figure(figsize=(12, 6))
    plt.plot(np.arange(len(data)), data, label='Historical Data')
    plt.plot(np.arange(len(data), len(data) + future_steps), predictions, label='Future Predictions', linestyle='--')
    plt.xlabel('Time Steps')
    plt.ylabel('% Change')
    plt.title(f'{stock_name} - Historical and Future Predictions')
    plt.legend()
    plt.savefig(os.path.join(output_folder, f'{stock_name}_wave_graph.png'))
    plt.close()

    # Print current and predicted future values
    print(f"\nCurrent stock price change for {stock_name}: {data[-1, 0]:.2f}%")
    print("Predicted future stock price changes:")
    for i, pred in enumerate(predictions, start=1):
        print(f"Day {i}: {pred:.2f}%")




Current stock price change for GTLINFRA: 6.05%
Predicted future stock price changes:
Day 1: 0.33%
Day 2: 0.33%
Day 3: 0.29%
Day 4: 0.28%
Day 5: 0.29%
Day 6: 0.29%
Day 7: 0.29%
Day 8: 0.29%
Day 9: 0.29%
Day 10: 0.29%

Current stock price change for IDEA: 6.05%
Predicted future stock price changes:
Day 1: 0.33%
Day 2: 0.33%
Day 3: 0.29%
Day 4: 0.28%
Day 5: 0.29%
Day 6: 0.29%
Day 7: 0.29%
Day 8: 0.29%
Day 9: 0.29%
Day 10: 0.29%

Current stock price change for HCC: 6.05%
Predicted future stock price changes:
Day 1: 0.33%
Day 2: 0.33%
Day 3: 0.29%
Day 4: 0.28%
Day 5: 0.29%
Day 6: 0.29%
Day 7: 0.29%
Day 8: 0.29%
Day 9: 0.29%
Day 10: 0.29%

Current stock price change for YESBANK: 6.05%
Predicted future stock price changes:
Day 1: 0.33%
Day 2: 0.33%
Day 3: 0.29%
Day 4: 0.28%
Day 5: 0.29%
Day 6: 0.29%
Day 7: 0.29%
Day 8: 0.29%
Day 9: 0.29%
Day 10: 0.29%

Current stock price change for SJVN: 6.05%
Predicted future stock price changes:
Day 1: 0.33%
Day 2: 0.33%
Day 3: 0.29%
Day 4: 0.28%
Day 5: 0

In [51]:
# Convert predictions to DataFrame and save to CSV
predictions_df = pd.DataFrame(all_predictions)
predictions_df.to_csv(os.path.join(output_folder, 'predicted_future_values.csv'), index=False)

print("Predictions have been saved to 'predicted_future_values.csv'.")

Predictions have been saved to 'predicted_future_values.csv'.
