# Apple Stock Prediction - Neural Network Classifier
## Predicting tomorrow's stock direction using sentiment + technical indicators

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Merge Data

In [9]:
# Load your data files
aapl_finance_df = pd.read_csv('../AAPL Data/AAPL_finance_data.csv')
aapl_sentiment_df = pd.read_csv('../AAPL Data/AAPL_avg_sentiment_data.csv')

# Merge on date
merged_aapl_df = pd.merge(aapl_finance_df, aapl_sentiment_df, on='Date', how='inner')

print(f"Dataset shape: {merged_aapl_df.shape}")
print(f"\nColumns: {list(merged_aapl_df.columns)}")
merged_aapl_df.head()

Dataset shape: (252, 16)

Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Stock Name_x', 'price_change', 'target', 'sentiment_negative', 'sentiment_neutral', 'sentiment_positive', 'sentiment_compound', 'Stock Name_y', 'sentiment_label']


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name_x,price_change,target,sentiment_negative,sentiment_neutral,sentiment_positive,sentiment_compound,Stock Name_y,sentiment_label
0,2021-09-30,143.660004,144.380005,141.279999,141.5,140.478485,89056700,AAPL,,,0.051286,0.851143,0.097571,0.0989,AAPL,Positive
1,2021-10-01,141.899994,142.919998,139.110001,142.649994,141.620163,94639600,AAPL,1.149994,1.0,0.024455,0.872455,0.103182,0.248255,AAPL,Positive
2,2021-10-04,141.759995,142.210007,138.270004,139.139999,138.135513,98322000,AAPL,-3.509995,-1.0,0.0309,0.91,0.0593,0.12283,AAPL,Positive
3,2021-10-05,139.490005,142.240005,139.360001,141.110001,140.091278,80861100,AAPL,1.970001,1.0,0.0422,0.89,0.0678,0.331,AAPL,Positive
4,2021-10-06,139.470001,142.149994,138.369995,142.0,140.974869,83221100,AAPL,0.889999,1.0,0.0122,0.9108,0.077,0.24352,AAPL,Positive


## 2. Feature Engineering

In [10]:
# Create a copy for processing
df = merged_aapl_df.copy()

# Convert Date to datetime and sort chronologically
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Technical indicators (if not already in your finance data)
# Price-based features
if 'Price_Range' not in df.columns:
    df['Price_Range'] = df['High'] - df['Low']
if 'Price_Change' not in df.columns:
    df['Price_Change'] = df['Close'] - df['Open']

# Volume indicators
if 'Volume_MA_5' not in df.columns:
    df['Volume_MA_5'] = df['Volume'].rolling(window=5).mean()
if 'Volume_MA_10' not in df.columns:
    df['Volume_MA_10'] = df['Volume'].rolling(window=10).mean()

# Moving averages
if 'MA_5' not in df.columns:
    df['MA_5'] = df['Close'].rolling(window=5).mean()
if 'MA_10' not in df.columns:
    df['MA_10'] = df['Close'].rolling(window=10).mean()
if 'MA_20' not in df.columns:
    df['MA_20'] = df['Close'].rolling(window=20).mean()

# Momentum
if 'Momentum_5' not in df.columns:
    df['Momentum_5'] = df['Close'].pct_change(periods=5)
if 'Momentum_10' not in df.columns:
    df['Momentum_10'] = df['Close'].pct_change(periods=10)

# Volatility
if 'Volatility_5' not in df.columns:
    df['Volatility_5'] = df['Close'].rolling(window=5).std()
if 'Volatility_10' not in df.columns:
    df['Volatility_10'] = df['Close'].rolling(window=10).std()

# RSI
if 'RSI' not in df.columns:
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

# MACD
if 'MACD' not in df.columns:
    exp1 = df['Close'].ewm(span=12, adjust=False).mean()
    exp2 = df['Close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = exp1 - exp2
    df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

# Target: Will price go UP tomorrow? (CRITICAL: predict TOMORROW using TODAY's data)
df['Tomorrow_Close'] = df['Close'].shift(-1)  # Tomorrow's closing price
df['Target'] = (df['Tomorrow_Close'] > df['Close']).astype(int)  # 1 if up, 0 if down

# Drop rows with NaN
df = df.dropna()

print(f"\nAfter feature engineering: {df.shape}")
print(f"\nTarget distribution:")
print(df['Target'].value_counts())
print(f"\nPercentage of UP days: {df['Target'].mean()*100:.2f}%")


After feature engineering: (232, 32)

Target distribution:
Target
0    117
1    115
Name: count, dtype: int64

Percentage of UP days: 49.57%


## 3. Prepare Features and Target

In [11]:
# Define feature columns
# Adjust these based on what columns are actually in your sentiment data
# Common sentiment columns: 'sentiment_mean', 'sentiment_std', 'sentiment_max', 'sentiment_min'

price_features = ['Open', 'High', 'Low', 'Volume', 'Price_Range', 'Price_Change',
                  'Volume_MA_5', 'Volume_MA_10', 'MA_5', 'MA_10', 'MA_20',
                  'Momentum_5', 'Momentum_10', 'Volatility_5', 'Volatility_10',
                  'RSI', 'MACD', 'Signal_Line']

# Find sentiment columns (adjust column names if needed)
sentiment_cols = [col for col in df.columns if 'sentiment' in col.lower()]
print(f"Found sentiment columns: {sentiment_cols}")

# Combine all features
feature_columns = price_features + sentiment_cols

# Check which features actually exist in your data
feature_columns = [col for col in feature_columns if col in df.columns]
print(f"\nUsing {len(feature_columns)} features:")
print(feature_columns)

X = df[feature_columns].values
y = df['Target'].values

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

Found sentiment columns: ['sentiment_negative', 'sentiment_neutral', 'sentiment_positive', 'sentiment_compound', 'sentiment_label']

Using 23 features:
['Open', 'High', 'Low', 'Volume', 'Price_Range', 'Price_Change', 'Volume_MA_5', 'Volume_MA_10', 'MA_5', 'MA_10', 'MA_20', 'Momentum_5', 'Momentum_10', 'Volatility_5', 'Volatility_10', 'RSI', 'MACD', 'Signal_Line', 'sentiment_negative', 'sentiment_neutral', 'sentiment_positive', 'sentiment_compound', 'sentiment_label']

X shape: (232, 23)
y shape: (232,)


## 4. Train/Test Split (Chronological!)

In [12]:
# CRITICAL: Chronological split for time series (80/20)
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTrain period: {df['Date'].iloc[0]} to {df['Date'].iloc[split_idx-1]}")
print(f"Test period: {df['Date'].iloc[split_idx]} to {df['Date'].iloc[-1]}")

Train set: 185 samples
Test set: 47 samples

Train period: 2021-10-27 00:00:00 to 2022-07-22 00:00:00
Test period: 2022-07-25 00:00:00 to 2022-09-28 00:00:00


## 5. Feature Scaling (CRITICAL for Neural Networks!)

In [13]:
# Neural networks REQUIRE scaled features!
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Scaled training data shape: {X_train_scaled.shape}")
print(f"Mean of scaled features (should be ~0): {X_train_scaled.mean():.6f}")
print(f"Std of scaled features (should be ~1): {X_train_scaled.std():.6f}")

ValueError: could not convert string to float: 'Neutral'

## 6. Build Neural Network

In [None]:
# Build the neural network
model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    
    # First hidden layer
    layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.3),  # Prevent overfitting
    
    # Second hidden layer
    layers.Dense(32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.3),
    
    # Third hidden layer
    layers.Dense(16, activation='relu'),
    
    # Output layer (binary classification)
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("Model Architecture:")
model.summary()

## 7. Train the Neural Network

In [None]:
# Early stopping to prevent overfitting
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=15,  # Stop if no improvement for 15 epochs
    restore_best_weights=True,
    verbose=1
)

# Train the model
print("Training Neural Network...\n")
history = model.fit(
    X_train_scaled, y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.2,  # Use 20% of training data for validation
    callbacks=[early_stop],
    verbose=1
)

## 8. Evaluate on Test Set

In [None]:
# Evaluate on test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)

print("="*70)
print("NEURAL NETWORK RESULTS")
print("="*70)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Make predictions
y_pred_proba = model.predict(X_test_scaled, verbose=0)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Down (0)', 'Up (1)']))

# Confusion Matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"\nTrue Negatives (correctly predicted DOWN): {cm[0,0]}")
print(f"False Positives (predicted UP, actually DOWN): {cm[0,1]}")
print(f"False Negatives (predicted DOWN, actually UP): {cm[1,0]}")
print(f"True Positives (correctly predicted UP): {cm[1,1]}")

## 9. Visualizations

In [None]:
# Create comprehensive visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Training history - Loss
axes[0, 0].plot(history.history['loss'], label='Training Loss', linewidth=2, color='blue')
axes[0, 0].plot(history.history['val_loss'], label='Validation Loss', linewidth=2, color='orange')
axes[0, 0].set_xlabel('Epoch', fontsize=12)
axes[0, 0].set_ylabel('Loss', fontsize=12)
axes[0, 0].set_title('Model Loss During Training', fontsize=14, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Training history - Accuracy
axes[0, 1].plot(history.history['accuracy'], label='Training Accuracy', linewidth=2, color='green')
axes[0, 1].plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2, color='red')
axes[0, 1].set_xlabel('Epoch', fontsize=12)
axes[0, 1].set_ylabel('Accuracy', fontsize=12)
axes[0, 1].set_title('Model Accuracy During Training', fontsize=14, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Confusion Matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 0],
            xticklabels=['Down', 'Up'], yticklabels=['Down', 'Up'])
axes[1, 0].set_xlabel('Predicted', fontsize=12)
axes[1, 0].set_ylabel('Actual', fontsize=12)
axes[1, 0].set_title('Confusion Matrix', fontsize=14, fontweight='bold')

# 4. Prediction probabilities
axes[1, 1].hist(y_pred_proba, bins=30, edgecolor='black', alpha=0.7, color='purple')
axes[1, 1].axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Threshold (0.5)')
axes[1, 1].set_xlabel('Predicted Probability (P(Up))', fontsize=12)
axes[1, 1].set_ylabel('Frequency', fontsize=12)
axes[1, 1].set_title('Distribution of Predictions', fontsize=14, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('AAPL_NN_Results.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Visualization saved as 'AAPL_NN_Results.png'")

## 10. Compare with Random Forest

In [None]:
# If you ran Random Forest before, compare the results
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(f"Neural Network Test Accuracy: {test_accuracy*100:.2f}%")
print(f"Your Random Forest Accuracy: ~42%")  # Update with your actual RF accuracy
print("\nKey Differences:")
print("✓ Neural Network uses feature scaling")
print("✓ Neural Network uses dropout and L2 regularization")
print("✓ Neural Network can capture non-linear patterns")
print("✓ Both models properly avoid data leakage!")

## 11. Save the Model

In [None]:
# Save the trained model
model.save('AAPL_NN_Model.keras')
print("✓ Model saved as 'AAPL_NN_Model.keras'")

# Save training history
history_df = pd.DataFrame(history.history)
history_df.to_csv('AAPL_NN_Training_History.csv', index=False)
print("✓ Training history saved as 'AAPL_NN_Training_History.csv'")

## Next Steps to Improve

1. **Try different architectures**: More/fewer layers, different neuron counts
2. **Hyperparameter tuning**: Learning rate, batch size, dropout rate
3. **LSTM networks**: Better for time series data
4. **Ensemble**: Combine Random Forest + Neural Network predictions
5. **More features**: Add market-wide indicators, more technical indicators
6. **Class balancing**: Handle UP/DOWN imbalance if present