# **A Hybrid LSTM-CNN-Attention Deep Learning Framework with Multi-Method Feature Selection and Sentiment Integration for Robust Stock Price Prediction**

In [None]:
# ==============================================================
# 0. Setup: Kaggle API, Install/Import Libraries, Set Device
# ==============================================================

!pip install -q kaggle
from google.colab import files
files.upload()  # Upload your kaggle.json

import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content"
!kaggle datasets download -d franciscofeng/augmented-china-stock-data-with-fundamentals
!unzip -q augmented-china-stock-data-with-fundamentals.zip



print("TensorFlow version:", tf.__version__)
# Check GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [1]:
# ========================================================
# 1. Imports and Setup
# ========================================================
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.feature_selection import RFE, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Conv1D, Dense, Dropout, concatenate
from tensorflow.keras.callbacks import EarlyStopping

# Seed NumPy and TensorFlow so repeated runs draw the same random numbers.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# ========================================================
# 2. Data Loading
# ========================================================
# Change filenames as needed
df_main = pd.read_csv('/content/stock_data.csv')
df_company = pd.read_csv('/content/ticker_info.csv')

# Merge company names
# Left join on 'ticker': every row of df_main is kept; only the
# 'company_name' column is brought over from df_company.
df_main = df_main.merge(df_company[['ticker', 'company_name']], on='ticker', how='left')

# ========================================================
# 3. Sentiment Integration (replace with real if available)
# ========================================================
# NOTE(review): when the data has no 'sentiment' column, this fabricates one
# from standard-normal noise (mean 0, std 1). It is a placeholder only and is
# unrelated to the prices; swap in a real sentiment series before drawing
# conclusions from this feature.
if 'sentiment' not in df_main.columns:
    # Re-seed right before drawing so the placeholder column is reproducible.
    np.random.seed(SEED)
    df_main['sentiment'] = np.random.normal(0, 1, len(df_main))

# ========================================================
# 4. Feature Engineering & Multi-Method Feature Selection
# ========================================================
# Candidate predictors; each is scored by several methods below.
feature_cols = [
    'open', 'high', 'low', 'volume', 'outstanding_share', 'turnover', 'pe',
    'pe_ttm', 'pb', 'ps', 'ps_ttm', 'dv_ratio', 'dv_ttm', 'total_mv', 'qfq_factor'
]
target_col = 'close'

df_main['date'] = pd.to_datetime(df_main['date'])
# Order rows by ticker then date so the shift below looks at the next row
# of the same ticker.
df_main = df_main.sort_values(['ticker', 'date'])
# Prediction target: the 'close' value of the following row within each ticker.
# The last row of every ticker has no successor and gets NaN.
df_main['next_close'] = df_main.groupby('ticker')[target_col].shift(-1)
# Drop rows with a missing value in any candidate feature or in the target.
df_main = df_main.dropna(subset=feature_cols + ['next_close'])
# NOTE(review): the dropna above already removed every row with a NaN in
# feature_cols, so this median fill has nothing left to fill. Confirm which of
# the two behaviours (drop vs. impute) is actually intended.
df_main[feature_cols] = df_main[feature_cols].fillna(df_main[feature_cols].median())

# Sampling for feature selection (memory safe)
scaler = StandardScaler()
# Score features on at most 100000 randomly sampled rows.
sample_size = min(100000, len(df_main))
X_sample = df_main[feature_cols].sample(sample_size, random_state=SEED)
# Pick the targets for exactly the sampled rows, via their index labels.
y_sample = df_main.loc[X_sample.index, 'next_close']
X_scaled = scaler.fit_transform(X_sample)
# X_s gets a fresh default index while y_sample keeps df_main's labels;
# both are in the same row order.
X_s = pd.DataFrame(X_scaled, columns=feature_cols)

# (a) Pearson Correlation
# Absolute correlation of each feature with the target.
pearson_scores = np.abs([np.corrcoef(X_s[c], y_sample)[0, 1] for c in feature_cols])
# (b) Mutual Info
mic_scores = mutual_info_regression(X_s, y_sample, random_state=SEED)
# (c) Lasso
# Absolute coefficient of a cross-validated Lasso fit.
lasso = LassoCV(cv=3, random_state=SEED, n_jobs=-1, max_iter=10000)
lasso.fit(X_s, y_sample)
lasso_scores = np.abs(lasso.coef_)
# (d) RFE
# Eliminating down to a single feature makes ranking_ a full ordering
# (1 = kept longest); negate it so that a larger score means a better feature,
# like the other methods.
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(X_s, y_sample)
rfe_scores = -rfe.ranking_
# (e) Null Importance
def get_null_importance(X, y, base_model, n_runs=5):
    """Score features by how far their importance exceeds a shuffled-target baseline.

    The model is fitted once on the real target and ``n_runs`` times on
    shuffled copies of it. A feature's score is its real importance minus the
    mean importance it receives when the target carries no signal.

    Args:
        X: 2-D feature matrix (array or DataFrame), one column per feature.
        y: Target values, one per row of ``X``.
        base_model: Estimator with ``fit`` that exposes either ``coef_`` or
            ``feature_importances_`` after fitting. It is refitted in place
            and is left fitted on the last shuffled target.
        n_runs: Number of shuffled-target fits used for the baseline.

    Returns:
        Array with one value per column of ``X``: real importance minus mean
        null importance.
    """
    def _importance(model):
        # Linear models expose coef_; other estimators expose feature_importances_.
        if hasattr(model, "coef_"):
            return np.abs(model.coef_)
        return np.abs(model.feature_importances_)

    base_model.fit(X, y)
    real_importance = _importance(base_model)

    # Size the baseline from X itself rather than a module-level feature list,
    # so the function works for any feature matrix it is given.
    n_features = np.shape(X)[1]
    null_imp = np.zeros((n_runs, n_features))
    for i in range(n_runs):
        # A different but fixed seed per run keeps the baseline reproducible.
        y_shuffled = shuffle(y, random_state=i)
        base_model.fit(X, y_shuffled)
        null_imp[i] = _importance(base_model)
    return real_importance - null_imp.mean(axis=0)
# Null-importance scores from a plain linear regression with 5 shuffled runs.
null_scores = get_null_importance(X_s, y_sample, LinearRegression(), n_runs=5)

# Score table: one row per candidate feature, one column per scoring method.
score_df = pd.DataFrame({'feature': feature_cols}).assign(
    pearson=pearson_scores,
    mic=mic_scores,
    lasso=lasso_scores,
    rfe=rfe_scores,
    null_importance=null_scores,
)
def quartile_rank(series):
    """Label each value 'Strong', 'Medium' or 'Weak' by where it falls in the series.

    The cut points are the 1/3 and 2/3 quantiles of ``series``: values above
    the upper cut are 'Strong', values above the lower cut are 'Medium', and
    everything else is 'Weak'.
    """
    lower_cut = series.quantile(1/3)
    upper_cut = series.quantile(2/3)

    def label(value):
        if value > upper_cut:
            return 'Strong'
        if value > lower_cut:
            return 'Medium'
        return 'Weak'

    return series.apply(label)
# Turn every raw score column into a Strong/Medium/Weak label.
score_methods = ['pearson', 'mic', 'lasso', 'rfe', 'null_importance']
rank_columns = [f'{method}_rank' for method in score_methods]
for col, rank_column in zip(score_methods, rank_columns):
    score_df[rank_column] = quartile_rank(score_df[col])

# Vote: count the methods that rate a feature at least 'Medium'.
score_df['num_strong_or_medium'] = score_df[rank_columns].isin(['Strong', 'Medium']).sum(axis=1)

# Keep the features backed by at least 4 of the 5 methods.
passes_vote = score_df['num_strong_or_medium'] >= 4
selected_features = score_df.loc[passes_vote, 'feature'].tolist()

# Sentiment is always included when the column exists, whatever its scores.
sentiment_available = 'sentiment' in df_main.columns
if sentiment_available and 'sentiment' not in selected_features:
    selected_features.append('sentiment')
print("Selected features for modeling:", selected_features)
#display(score_df) # Uncomment if in Jupyter

# ========================================================
# 5. Final Data Prep for Modeling (fill missing, etc)
# ========================================================
# Impute any gaps in the chosen columns with that column's median.
selected_medians = df_main[selected_features].median()
df_main[selected_features] = df_main[selected_features].fillna(selected_medians)
# (Target already present as 'next_close')

# ========================================================
# 6. Windowed Time-Series Construction
# ========================================================
def create_windowed_dataset(df, selected_features, target_col='next_close', window_size=5):
    """Build fixed-length sliding windows per ticker.

    Rows are windowed separately for each ticker, in the row order they have
    in ``df``, so a window never mixes two tickers.

    Args:
        df: DataFrame with a 'ticker' column, a 'company_name' column, the
            columns in ``selected_features`` and ``target_col``.
        selected_features: Column names that make up each window.
        target_col: Column the label is read from.
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(X, y, meta)``: ``X`` has shape
        ``(n_windows, window_size, len(selected_features))``, ``y`` holds one
        label per window and ``meta`` the matching company name. When no
        ticker has more than ``window_size`` rows, ``X`` is an empty array
        that still has that 3-D shape.
    """
    X, y, meta = [], [], []
    # A single groupby pass replaces one full-frame boolean mask per ticker.
    # sort=False keeps tickers in order of first appearance, matching unique().
    for _, sub in df.groupby('ticker', sort=False):
        arr = sub[selected_features].values
        targets = sub[target_col].values
        names = sub['company_name'].values
        for i in range(len(arr) - window_size):
            X.append(arr[i:i + window_size])
            # NOTE(review): the label comes from the row *after* the window
            # (index i + window_size), not from the window's last row. If
            # target_col already holds a one-step-ahead value, the label is
            # two steps past the window end -- confirm this is intended.
            y.append(targets[i + window_size])
            meta.append(names[i + window_size])
    if not X:
        # Keep X three-dimensional so callers can still unpack X.shape.
        empty_windows = np.empty((0, window_size, len(selected_features)))
        return empty_windows, np.array(y), np.array(meta)
    return np.array(X), np.array(y), np.array(meta)
window_size = 5
X_windowed, y_windowed, meta_windowed = create_windowed_dataset(df_main, selected_features, window_size=window_size)
print("Windowed data shape:", X_windowed.shape, y_windowed.shape)

# ========================================================
# 7. Train/Test Split (row indices) and Scaling
# ========================================================
n_samples, n_window, n_feat = X_windowed.shape

# Choose the train/test rows first. Splitting an index array with the same
# test_size and random_state selects the same rows as splitting the data
# arrays directly.
# NOTE(review): this is still a random split over overlapping windows of a
# time series, so neighbouring train and test windows share rows. Consider a
# chronological split for an honest out-of-sample estimate.
train_idx, test_idx = train_test_split(
    np.arange(n_samples), test_size=0.05, random_state=SEED
)

# Fit the scaler on training windows only, so the test rows do not influence
# the mean/std applied to them (avoids train/test leakage).
scaler_final = StandardScaler()
scaler_final.fit(X_windowed[train_idx].reshape(-1, n_feat))

# Apply the train-fitted scaling to every window: flatten to 2-D for the
# scaler, then restore the (samples, window, features) shape.
X_reshaped = X_windowed.reshape(-1, n_feat)
X_scaled = scaler_final.transform(X_reshaped)
X_windowed_scaled = X_scaled.reshape(n_samples, n_window, n_feat)

# ========================================================
# 8. Materialise Train/Test Arrays
# ========================================================
X_train, X_test = X_windowed_scaled[train_idx], X_windowed_scaled[test_idx]
y_train, y_test = y_windowed[train_idx], y_windowed[test_idx]
meta_train, meta_test = meta_windowed[train_idx], meta_windowed[test_idx]
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# ========================================================
# 9. Hybrid LSTM-CNN-Attention Model
# ========================================================
class Attention(tf.keras.layers.Layer):
    """Attention pooling that collapses axis 1 of its input into a weighted sum.

    A score is computed for every position along axis 1, the scores are
    softmax-normalised over that axis, and the input is summed along it using
    those weights.
    Assumes the input is (batch, steps, channels) -- TODO confirm for other uses.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the score projection and the per-position bias."""
        n_steps, n_channels = input_shape[1], input_shape[-1]
        # Projects the last axis down to a single score per position.
        self.W = self.add_weight(name='att_weight', shape=(n_channels, 1), initializer='normal')
        # One bias per position along axis 1, so that length is fixed at build time.
        self.b = self.add_weight(name='att_bias', shape=(n_steps, 1), initializer='zeros')
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        backend = tf.keras.backend
        scores = backend.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        weights = backend.softmax(scores, axis=1)
        weighted = x * weights
        return backend.sum(weighted, axis=1)
# Per-sample input shape: everything after the batch axis of X_train.
input_shape = X_train.shape[1:]
inp = Input(shape=input_shape)
# Branch 1: recurrent encoding; return_sequences=True keeps one output per
# time step so the attention layer has a sequence to pool over.
x1 = LSTM(64, return_sequences=True)(inp)
x1 = Dropout(0.2)(x1)
# Branch 2: convolution over the same input.
# NOTE(review): with kernel_size=1 each time step is transformed on its own,
# so this branch does not mix information across neighbouring steps.
x2 = Conv1D(64, kernel_size=1, activation='relu', padding='same')(inp)
x2 = Dropout(0.2)(x2)
# Join both branches, then pool over time with the custom Attention layer.
x = concatenate([x1, x2])
att_out = Attention()(x)
# Small dense head ending in a single linear unit (the regression output).
dense = Dense(32, activation='relu')(att_out)
dense = Dropout(0.1)(dense)
out = Dense(1)(dense)
model = Model(inputs=inp, outputs=out)
# Train on mean squared error; mean absolute error is tracked as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

# ========================================================
# 10. Training with Early Stopping
# ========================================================
# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
history = model.fit(
    X_train, y_train,
    # 10% of the training data is held out as the validation set that
    # early stopping monitors.
    validation_split=0.1,
    epochs=50,  # upper bound; early stopping may end training sooner
    batch_size=2048,
    callbacks=[early_stop],
    verbose=2
)

# ========================================================
# 11. Evaluation and Insights
# ========================================================
# Flatten the model output to 1-D so it lines up with y_test.
y_pred = model.predict(X_test).flatten()
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Test MAE: {mae:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R^2: {r2:.4f}")

print("Sample predictions (Actual vs Predicted, Company):")
# Show up to five examples; slicing (rather than a fixed range(5)) avoids an
# IndexError when the test set holds fewer than five samples.
for company, actual, predicted in zip(meta_test[:5], y_test[:5], y_pred[:5]):
    print(f"Company: {company}, Actual: {actual:.2f}, Predicted: {predicted:.2f}, Error: {actual-predicted:.2f}")


ModuleNotFoundError: No module named 'tensorflow'