# In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from statsmodels.tsa.arima.model import ARIMA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import streamlit as st
import plotly.graph_objects as go

# Load sample data (Replace with your dataset).
# The CSV must provide a 'Date' column plus the OHLC and Volume columns.
data = pd.read_csv("sample_stock_data.csv")

# Parse 'Date' into datetimes and promote it to the index, so every later
# step works on a date-indexed frame.
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

# Step 1: Handle Missing or Incorrect Values
# Forward-fill gaps from the previous row, then back-fill whatever is still
# missing at the top of the frame (where there is no earlier row to copy).
# DataFrame.ffill()/bfill() are used instead of fillna(method=...), whose
# `method` argument is deprecated in recent pandas releases.
data.ffill(inplace=True)
data.bfill(inplace=True)

# Step 2: Feature Engineering
# Hand-rolled technical indicators (no dependency on the 'ta' library)
close = data['Close']

# 10-period simple and exponential moving averages of the close
data['SMA_10'] = close.rolling(window=10).mean()
data['EMA_10'] = close.ewm(span=10, adjust=False).mean()

# RSI over 14 periods: ratio of the rolling mean gain to the rolling mean
# loss, mapped onto the 0-100 scale.
delta = close.diff()
avg_gain = delta.clip(lower=0).rolling(window=14).mean()
avg_loss = delta.clip(upper=0).abs().rolling(window=14).mean()
data['RSI'] = 100 - (100 / (1 + (avg_gain / avg_loss)))

# MACD: 12-span EMA minus 26-span EMA; signal line: 9-span EMA of the MACD
fast_ema = close.ewm(span=12, adjust=False).mean()
slow_ema = close.ewm(span=26, adjust=False).mean()
data['MACD'] = fast_ema - slow_ema
data['Signal_Line'] = data['MACD'].ewm(span=9, adjust=False).mean()

# Lag features for time-series analysis: the close from 1, 3, 5 and 7 rows back
lags = [1, 3, 5, 7]
lagged_closes = {f"Close_lag_{lag}": close.shift(lag) for lag in lags}
for column_name, shifted_close in lagged_closes.items():
    data[column_name] = shifted_close

# Rolling windows and shifts leave NaNs in the leading rows; drop every row
# that still contains a NaN.
data.dropna(inplace=True)

# Step 3: Normalize or Standardize the Data
# Columns to rescale: price, volume, the indicators, and every lagged close
indicator_columns = ['Close', 'Volume', 'SMA_10', 'EMA_10', 'RSI', 'MACD', 'Signal_Line']
lag_feature_columns = [f"Close_lag_{lag}" for lag in lags]
features_to_scale = indicator_columns + lag_feature_columns

# Min-max scaling; swap in StandardScaler() for standardization instead.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before any
# train/test split happens further down — confirm this is intended.
scaler = MinMaxScaler()
unscaled_features = data[features_to_scale]
scaled_features = scaler.fit_transform(unscaled_features)

# Overwrite the original columns with their scaled counterparts
data[features_to_scale] = scaled_features

# Step 3b: Exploratory Data Analysis (EDA)
# (Relabelled from a second "Step 3" so the step headings are unambiguous.)

# Visualize trends in stock prices (values were min-max scaled above)
data['Close'].plot(figsize=(12, 6), title="Stock Price Trends", xlabel="Date", ylabel="Normalized Close Price")
plt.show()

# Visualize seasonality and anomalies: overlay a 30-row rolling mean.
# The new 'Rolling_Mean' column stays on the frame; its first 29 rows are NaN.
data['Rolling_Mean'] = data['Close'].rolling(window=30).mean()
data[['Close', 'Rolling_Mean']].plot(figsize=(12, 6), title="Stock Price with Rolling Mean", xlabel="Date", ylabel="Normalized Close Price")
plt.show()

# Correlation analysis between stock price and indicators.
# Restrict to numeric/bool columns first: on pandas >= 2.0, DataFrame.corr()
# raises if the CSV carried any text column (e.g. a ticker symbol).
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.show()

# Pairplot for selected features to observe pairwise relationships
selected_features = ['Close', 'Volume', 'SMA_10', 'EMA_10', 'RSI', 'MACD']
sns.pairplot(data[selected_features], diag_kind="kde", plot_kws={"alpha": 0.5})
plt.show()

# Step 4: Modeling
# Time-Series Forecasting: ARIMA
# order is (p, d, q); these values are placeholders, replace with optimized parameters
arima_order = (5, 1, 0)
arima_model = ARIMA(endog=data['Close'], order=arima_order)
arima_result = arima_model.fit()

# Show the fitted model's summary table
arima_summary = arima_result.summary()
print(arima_summary)

# Time-Series Forecasting: LSTM
# Build supervised samples: each input is `lookback` consecutive closes and
# its label is the close that immediately follows that window.
lookback = 10
data_values = data['Close'].values
window_starts = range(len(data_values) - lookback)
X = np.array([data_values[start:start + lookback] for start in window_starts])
y = np.array([data_values[start + lookback] for start in window_starts])

# Add a trailing axis of size 1 so X is (samples, lookback, 1), matching the
# input_shape declared on the LSTM layer below.
n_samples, n_timesteps = X.shape[0], X.shape[1]
X = X.reshape((n_samples, n_timesteps, 1))

# One 50-unit LSTM layer feeding a single-output dense layer
lstm_model = Sequential()
lstm_model.add(LSTM(50, activation='relu', input_shape=(lookback, 1)))
lstm_model.add(Dense(1))

# Train with Adam on mean squared error
lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.fit(X, y, epochs=20, batch_size=32, verbose=1)

# Classification Models: Random Forest, Gradient Boosting, Logistic Regression
# Label every row by the direction of the NEXT close:
#   1 = buy (next close is higher), -1 = sell (lower), 0 = hold (unchanged).
next_close = data['Close'].shift(-1)
data['Target'] = np.select(
    [next_close > data['Close'], next_close < data['Close']],
    [1, -1],
    default=0,
)

# Prepare features and labels.
# The last row has no next close, so its 0 label is only a placeholder and
# not an observed outcome; keep it in `data` but out of training/evaluation.
labelled_rows = data.iloc[:-1]
X = labelled_rows.drop(columns=['Target', 'Rolling_Mean'])
y = labelled_rows['Target']

# Split data chronologically: the first 80% of rows train, the last 20% test.
# A shuffled split would put rows from the future into the training set,
# which inflates the scores of a time-series model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Train Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Train Logistic Regression (max_iter raised to 1000 from the library default)
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Evaluate models
# Predict on the held-out rows once per classifier ...
y_pred_rf = rf_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)
y_pred_log = log_model.predict(X_test)

# ... then print the same report for each: title, confusion matrix,
# per-class precision/recall/F1.
model_reports = (
    ("Random Forest Classifier:", y_pred_rf),
    ("Gradient Boosting Classifier:", y_pred_gb),
    ("Logistic Regression:", y_pred_log),
)
for report_title, predictions in model_reports:
    print(report_title)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

# Sentiment Analysis (Optional)
# Load textual data (e.g., news headlines) and score each headline with VADER.
# This runs BEFORE the recommendations are built so the scores can be used:
# previously the text data was only loaded afterwards, and the guard looked
# for a variable named `sentiment_score` that is never defined, so sentiment
# was always the constant placeholder.
text_data = pd.read_csv("sample_text_data.csv")  # Replace with your dataset
sid = SentimentIntensityAnalyzer()
# Missing headlines are scored as empty text instead of crashing the analyzer
text_data['sentiment_score'] = (
    text_data['headline']
    .fillna("")
    .astype(str)
    .apply(lambda headline: sid.polarity_scores(headline)['compound'])
)
print(text_data.head())

# Step 5: Recommendation System
# Define thresholds for Buy/Sell/Hold (compared against the sentiment score)
threshold_buy = 0.6
threshold_sell = -0.6

# Combine the target with sentiment analysis (if available).
# Neutral placeholder, used for every day without usable sentiment.
data['Sentiment_Score'] = 0.0
# NOTE(review): assumes text_data carries a 'Date' column identifying the day
# of each headline — TODO confirm against the real dataset. Without it the
# scores cannot be aligned to the price rows and the placeholder is kept.
if 'Date' in text_data.columns:
    headline_days = pd.to_datetime(text_data['Date']).dt.normalize()
    daily_sentiment = text_data.groupby(headline_days)['sentiment_score'].mean()
    # Align by calendar day; price rows with no headlines stay neutral (0.0)
    data['Sentiment_Score'] = (
        daily_sentiment.reindex(data.index.normalize()).fillna(0.0).to_numpy()
    )

# Generate recommendations: default to "Hold"; upgrade to "Buy"/"Sell" only
# when the target direction and the sentiment threshold agree.
# NOTE(review): 'Target' is the label derived from the realized next close,
# not a model prediction — confirm that this is the intended signal.
data['Recommendation'] = "Hold"
buy_mask = (data['Target'] == 1) & (data['Sentiment_Score'] >= threshold_buy)
sell_mask = (data['Target'] == -1) & (data['Sentiment_Score'] <= threshold_sell)
data.loc[buy_mask, 'Recommendation'] = "Buy"
data.loc[sell_mask, 'Recommendation'] = "Sell"

# Print recommendations (a bare expression in the middle of a cell/script
# displays nothing, so print explicitly)
print(data[['Close', 'Sentiment_Score', 'Target', 'Recommendation']].tail(10))

# Final Preprocessed Data
print(data.head())

# Step 6: Dashboard using Streamlit
st.title("Stock Market Recommendation Dashboard")

# Stock Price Trends: close price overlaid with its rolling mean
st.subheader("Stock Price Trends")
price_traces = [
    go.Scatter(x=data.index, y=data['Close'], mode='lines', name='Close Price'),
    go.Scatter(x=data.index, y=data['Rolling_Mean'], mode='lines', name='Rolling Mean'),
]
fig = go.Figure(data=price_traces)
st.plotly_chart(fig)

# Predicted Recommendations: one marker per row of the 'Target' column
st.subheader("Predicted Recommendations")
target_trace = go.Scatter(x=data.index, y=data['Target'], mode='markers', name='Predicted Target')
fig = go.Figure(data=[target_trace])
st.plotly_chart(fig)

# Sentiment Analysis: only drawn when a sentiment column is present
if 'Sentiment_Score' in data.columns:
    st.subheader("Sentiment Trends")
    sentiment_trace = go.Scatter(x=data.index, y=data['Sentiment_Score'], mode='lines', name='Sentiment Score')
    fig = go.Figure(data=[sentiment_trace])
    st.plotly_chart(fig)

# Table with the ten most recent rows of the recommendation columns
summary_columns = ['Close', 'Sentiment_Score', 'Target', 'Recommendation']
st.dataframe(data[summary_columns].tail(10))


# Cell output: ModuleNotFoundError: No module named 'tensorflow'