In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer # Re-import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer # Re-import VADER
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import joblib # Import joblib for saving models

# Silence UserWarning-category warnings (the category XGBoost uses for its
# parameter notices) so they do not clutter the training log.
warnings.filterwarnings('ignore', category=UserWarning)

# Build the VADER sentiment analyzer, fetching its lexicon on first use.
# The analyzer is constructed exactly once on the happy path; it is only
# retried after a download, and any error other than a missing lexicon
# propagates untouched instead of being masked by a second construction.
try:
    sid_obj = SentimentIntensityAnalyzer()
except LookupError:
    import os
    import sys

    import nltk

    # Inside a virtual environment, install next to the interpreter:
    # <sys.prefix>/nltk_data is on NLTK's data search path, and the path is
    # absolute and separator-agnostic. (A hard-coded r'.\venv\nltk_data'
    # depends on the current working directory and on Windows separators, so
    # elsewhere the lexicon would be downloaded but never found.)
    # Outside a virtual environment, None lets NLTK pick its default location.
    in_virtualenv = sys.prefix != getattr(sys, 'base_prefix', sys.prefix)
    download_dir = os.path.join(sys.prefix, 'nltk_data') if in_virtualenv else None
    nltk.download('vader_lexicon', download_dir=download_dir)
    print("Downloaded 'vader_lexicon'.")
    # Raises LookupError again if the download did not succeed.
    sid_obj = SentimentIntensityAnalyzer()


print("Starting model training and evaluation process...")
print("Model and TF-IDF vectorizer WILL BE SAVED in this run.")

# Load the raw CSV and prepare it in one pass: discard rows with any missing
# value, parse 'date' into datetimes, then order the rows chronologically with
# a fresh 0..n-1 index so later positional slicing follows time order.
data = (
    pd.read_csv('stock(updated).csv')
    .dropna()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

# Quick sanity report: column summary and the relative frequency of each label.
print("Dataset Info:")
data.info()
print("\nClass Distribution:")
class_shares = data['stock_trend'].value_counts(normalize=True)
print(class_shares)

# --- Feature Engineering ---
print("\nPerforming Feature Engineering...")


def _compound_sentiment(text):
    """Return VADER's compound polarity score for *text*."""
    return sid_obj.polarity_scores(text)['compound']


def _count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


# Every text feature below is derived from 'combined_text'; build it from the
# headline and short description when the CSV does not already provide it.
if 'combined_text' in data.columns:
    print("'combined_text' column already exists.")
else:
    headline_text = data['headline'].fillna('')
    description_text = data['short_description'].fillna('')
    data['combined_text'] = headline_text + ' ' + description_text
    print("Created 'combined_text' column.")

# Sentiment of the combined text (always recomputed from the current text).
data['sentiment_score'] = data['combined_text'].map(_compound_sentiment)
print("Calculated 'sentiment_score'.")

# Size of the combined text in characters and in words.
data['text_length'] = data['combined_text'].map(len)
data['word_count'] = data['combined_text'].map(_count_words)
print("Calculated 'text_length' and 'word_count'.")

# Calendar position of each row, taken from the parsed 'date' column.
date_parts = data['date'].dt
data['day_of_week'] = date_parts.dayofweek
data['month'] = date_parts.month
print("Calculated 'day_of_week' and 'month'.")


# TF-IDF Vectorization - IMPORTANT: this exact fitted vectorizer is the one
# that gets saved, so inference must reuse it rather than fit a new one.
print("Applying TF-IDF Vectorization...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_text'])

# Vocabulary terms become column names. A term that equals an existing column
# (e.g. the words "month" or "date") would create duplicate labels, and
# selecting that label later would return several columns at once, including
# non-numeric ones. Only colliding terms are prefixed, so the column names are
# unchanged whenever there is no collision.
# NOTE(review): inference code that rebuilds these columns from the saved
# vectorizer must apply the same renaming rule.
existing_columns = set(data.columns)
tfidf_columns = [
    f'tfidf_{term}' if term in existing_columns else term
    for term in tfidf_vectorizer.get_feature_names_out()
]

# Densify the sparse matrix and append it column-wise; both frames are
# re-indexed 0..n-1 so rows are aligned by position.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_columns)
data = pd.concat([data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)
print(f"TF-IDF Vectorization complete. Added {len(tfidf_df.columns)} features.")


# Price-derived features: the day's high-low spread and the row-over-row
# percentage change of the close.
data['price_range'] = data['High_^GSPC'] - data['Low_^GSPC']
data['daily_return'] = data['Close_^GSPC'].pct_change()

# Lagged close and sentiment: the values from 1, 2 and 3 rows earlier.
for lag in range(1, 4):
    data[f'Close_lag_{lag}'] = data['Close_^GSPC'].shift(lag)
    data[f'sentiment_score_lag_{lag}'] = data['sentiment_score'].shift(lag)

# 5-row rolling statistics (mean of close, volume, sentiment; std of close).
data['Close_MA_5'] = data['Close_^GSPC'].rolling(window=5).mean()
data['Volume_MA_5'] = data['Volume_^GSPC'].rolling(window=5).mean()
data['sentiment_MA_5'] = data['sentiment_score'].rolling(window=5).mean()
data['Close_Std_5'] = data['Close_^GSPC'].rolling(window=5).std()

# Shifting and rolling leave NaNs in the first rows, so fill them only after
# those features exist. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and produces the same result.
data = data.ffill()
# Forward fill cannot reach leading NaNs (nothing precedes them); zero them.
# NOTE(review): this makes the first rows' lag/rolling price features 0 rather
# than a real price - consider dropping those rows instead.
data = data.fillna(0)
print("Stock-based and lagged features engineered.")


# --- Select Features and Target ---
# The model input is the hand-engineered columns followed by one column per
# TF-IDF term, in that order.
text_and_calendar_features = [
    'sentiment_score', 'text_length', 'word_count', 'day_of_week', 'month',
]
price_features = [
    'Close_^GSPC', 'High_^GSPC', 'Low_^GSPC', 'Open_^GSPC', 'Volume_^GSPC',
    'price_range', 'daily_return',
]
lag_features = [
    'Close_lag_1', 'Close_lag_2', 'Close_lag_3',
    'sentiment_score_lag_1', 'sentiment_score_lag_2', 'sentiment_score_lag_3',
]
rolling_features = ['Close_MA_5', 'Volume_MA_5', 'sentiment_MA_5', 'Close_Std_5']

features = [
    *text_and_calendar_features,
    *price_features,
    *lag_features,
    *rolling_features,
    *tfidf_df.columns,
]

# Design matrix and label vector used by everything downstream.
y = data['stock_trend']
X = data[features]

print(f"\nFinal Feature set shape after engineering: {X.shape}")
print("\nSample of engineered features (first few columns):")
print(X.head())

# --- Chronological hold-out split ---
# Rows are sorted by date, so the first 80% form the training period and the
# last 20% the test period. The split happens BEFORE tuning so that no test row
# can influence which hyperparameters are selected.
train_size = int(len(data) * 0.8) # 80% for training
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# --- Time Series Split ---
# Expanding-window folds inside the training period: each fold validates on
# rows that come after the rows it trained on.
tscv = TimeSeriesSplit(n_splits=5)

# --- Hyperparameter Tuning with GridSearchCV ---
param_grid = {
    'n_estimators': [100, 200], # Reduced for faster execution. Can expand if more time.
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0.1],
    'min_child_weight': [1]
}

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=tscv, scoring='accuracy', n_jobs=-1, verbose=1)

print("\nStarting Hyperparameter Tuning (this may take some time)...")
# Tune on the training period only; fitting on all of X would let the held-out
# rows take part in model selection and inflate the reported test metrics.
grid_search.fit(X_train, y_train)

print(f"\nBest hyperparameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# --- Best model for evaluation on the dedicated test set ---
# GridSearchCV's default refit=True has already retrained the winning
# configuration on everything passed to fit(), i.e. exactly
# (X_train, y_train), so no further fit call is needed.
best_model = grid_search.best_estimator_

# Hard class predictions, plus the second column of predict_proba (used as the
# score for ROC AUC), on the held-out rows.
y_pred = best_model.predict(X_test)
test_probabilities = best_model.predict_proba(X_test)
y_prob = test_probabilities[:, 1] # Probabilities for AUC

print("\n--- Model Evaluation on Test Set ---")
# Threshold-based metrics compare labels; ROC AUC compares scores.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

# Confusion matrix drawn as an annotated heatmap (rows: actual, columns:
# predicted); tick labels are given in the order Down, Up.
cm = confusion_matrix(y_test, y_pred)
trend_labels = ['Down', 'Up']
plt.figure(figsize=(6, 5))
cm_axes = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=trend_labels,
    yticklabels=trend_labels,
)
cm_axes.set_xlabel('Predicted')
cm_axes.set_ylabel('Actual')
cm_axes.set_title('Confusion Matrix for Best Model')
plt.show()

# --- Feature Importance ---
# Pair every feature name with the importance the fitted model assigned to it
# and rank them from most to least important.
feature_importances = best_model.feature_importances_
features_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# The same 15 highest-ranked rows feed both the printout and the chart.
top_features = features_df.head(15)

print("\n--- Feature Importances (Top 15) ---")
print(top_features)

plt.figure(figsize=(10, 7))
importance_axes = sns.barplot(x='Importance', y='Feature', data=top_features)
importance_axes.set_title('Top 15 Feature Importances')
importance_axes.set_xlabel('Importance')
importance_axes.set_ylabel('Feature')
plt.show()

# --- Baseline Model for Comparison ---
# Reference point for the metrics above: always predict whichever label is
# most frequent in the training period and score that on the test period.
majority_class = y_train.mode().iloc[0]
baseline_predictions = np.full_like(y_test, fill_value=majority_class)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)

print("\n--- Baseline Model (Majority Class Prediction) ---")
print(f"Baseline Accuracy: {baseline_accuracy:.4f}")


# --- Model Saving ---
# Persist the trained classifier together with the fitted TF-IDF vectorizer;
# both are written to the current working directory.
model_filename = 'xgboost_stock_trend_model.joblib'
tfidf_vectorizer_filename = 'tfidf_vectorizer.joblib'

artifacts_to_save = (
    (best_model, model_filename),
    (tfidf_vectorizer, tfidf_vectorizer_filename),
)
for artifact, destination in artifacts_to_save:
    joblib.dump(artifact, destination)

print(f"\nModel successfully saved as '{model_filename}'.")
print(f"TF-IDF Vectorizer successfully saved as '{tfidf_vectorizer_filename}'.")
print("Training and saving process complete.")



Starting model training and saving process (News-Only Features)...
Data loaded and preprocessed.
Features engineered (news-only).
Feature set shape: (1614, 5)

Sample of engineered features (news-only):
   sentiment_score  text_length  word_count  day_of_week  month
0          -0.5267          162          26            0      1
1           0.5859          199          27            0      1
2           0.1280          155          21            0      1
3           0.1280          184          30            0      1
4          -0.6369          154          24            1      1

Starting Hyperparameter Tuning (this may take some time)...
Fitting 5 folds for each of 648 candidates, totalling 3240 fits


  data = data.fillna(method='ffill') # Still fill NaNs from original data or if any are introduced



Best hyperparameters found: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.7}
Best cross-validation accuracy: 0.5814

Best model trained on the training data.

Model successfully saved as 'xgboost_model_news_only.joblib' in the current directory.
Training and saving process complete.

--- Model Evaluation on Test Set (News-Only Model) ---


NameError: name 'accuracy_score' is not defined