In [50]:


# import yfinance as yf
# from datetime import datetime

# Set the ticker symbol for BSE Sensex and the date range
# symbol = "^BSESN"
# start_date = "2014-01-01"
# end_date = datetime.today().strftime('%Y-%m-%d')

# Download the historical data
# sensex_data = yf.download(symbol, start=start_date, end=end_date, interval="1d")

# Save to CSV
# csv_path = r"C:\Users\Nagesh Agrawal\OneDrive\Desktop\P-SENSEX_PROJECT\PROJECT DATA\sensex_2014_to_today.csv"
# sensex_data.to_csv(csv_path)

# csv_path



In [51]:
# Step 1: Load Data
df = pd.read_csv(r"C:\Users\Nagesh Agrawal\OneDrive\Desktop\P-SENSEX_PROJECT\PROJECT DATA\sensex_2014_to_today.csv")

# Step 2: Fix Date
if 'Date' not in df.columns:
    df['Date'] = pd.to_datetime(df.index)
else:
    df['Date'] = pd.to_datetime(df['Date'])

# Step 3: Get only numeric columns that exist
numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
numeric_cols = [col for col in numeric_cols if col in df.columns]

# Step 4: Convert to float
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Step 5: Drop rows with any missing values
df.dropna(inplace=True)

# Final columns
df = df[['Date'] + numeric_cols]
df = df.sort_values('Date').reset_index(drop=True)


In [52]:
df.dtypes

Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume           float64
dtype: object

In [53]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1970-01-01 00:00:00.000000002,21222.189453,21244.349609,21133.820312,21140.480469,4000.0
1,1970-01-01 00:00:00.000000003,21179.910156,21331.320312,20846.669922,20888.330078,9300.0
2,1970-01-01 00:00:00.000000004,20819.580078,20885.179688,20731.330078,20851.330078,9300.0
3,1970-01-01 00:00:00.000000005,20913.789062,20913.789062,20721.980469,20787.300781,7600.0
4,1970-01-01 00:00:00.000000006,20845.769531,20890.480469,20637.179688,20693.240234,8100.0
...,...,...,...,...,...,...
2825,1970-01-01 00:00:00.000002827,83398.078125,83516.820312,83262.226562,83442.500000,6200.0
2826,1970-01-01 00:00:00.000002828,83387.031250,83812.312500,83320.953125,83712.507812,15300.0
2827,1970-01-01 00:00:00.000002829,83625.890625,83781.359375,83382.281250,83536.078125,7800.0
2828,1970-01-01 00:00:00.000002830,83658.203125,83742.281250,83134.968750,83190.281250,7500.0


In [54]:
# 1️⃣ Predict next day closing price (regression)
df['Next_Close'] = df['Close'].shift(-1)

# 2️⃣ Predict direction (classification: 1 if up, 0 if down/stable)
df['Price_Direction'] = (df['Next_Close'] > df['Close']).astype(int)

# 3️⃣ Predict return percentage
df['Return_%'] = ((df['Next_Close'] - df['Close']) / df['Close']) * 100

# 4️⃣ Predict trading signal (multi-class)
def label_signal(x):
    if x > 0.8:
        return 'BUY'
    elif x < -0.8:
        return 'SELL'
    else:
        return 'HOLD'

df['Signal'] = df['Return_%'].apply(label_signal)


| Target Column     | Purpose                                       |
| ----------------- | --------------------------------------------- |
| `Next_Close`      | Predict next day closing price (regression)   |
| `Price_Direction` | Predict up/down (classification 0/1)          |
| `Return_%`        | Predict percent return (regression)           |
| `Signal`          | Predict action: BUY, SELL, HOLD (multi-class) |


In [55]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Next_Close',
       'Price_Direction', 'Return_%', 'Signal'],
      dtype='object')

In [56]:
# 📌 Feature Engineering
df['SMA_5'] = df['Close'].rolling(window=5).mean()
df['SMA_10'] = df['Close'].rolling(window=10).mean()
df['Price_Range'] = df['High'] - df['Low']
df['Daily_Change_%'] = ((df['Close'] - df['Open']) / df['Open']) * 100
df['Rolling_STD_5'] = df['Close'].rolling(window=5).std()
df['Close/Open'] = df['Close'] / df['Open']


In [57]:
from sklearn.model_selection import train_test_split

# ✅ Step 1: Define Feature Columns
features = [
    'Open', 'High', 'Low', 'Volume',
    'SMA_5', 'SMA_10',
    'Price_Range', 'Daily_Change_%',
    'Rolling_STD_5', 'Close/Open'
]

# ✅ Step 2: Drop rows where any of these are missing
required_cols = features + ['Next_Close', 'Price_Direction', 'Signal']
df.dropna(subset=required_cols, inplace=True)

# ✅ Step 3: Define Input Features and Target Columns
X = df[features]

y_class = df['Price_Direction']    # Binary classification
y_reg = df['Next_Close']           # Regression target
y_signal = df['Signal']            # Multi-class signal

# ✅ Step 4: Split only once (80% train, 20% test)
X_train, X_test, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.2, shuffle=False
)

# ✅ Step 5: Use same split logic for other targets (preserve order)
y_train_reg = y_reg.iloc[:len(X_train)].reset_index(drop=True)
y_test_reg = y_reg.iloc[len(X_train):].reset_index(drop=True)

y_train_signal = y_signal.iloc[:len(X_train)].reset_index(drop=True)
y_test_signal = y_signal.iloc[len(X_train):].reset_index(drop=True)


📌 1. Linear Regression (Next Day Close Price)

In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train_reg)

# Predict
y_pred_reg = lr_model.predict(X_test)

# Evaluation
print("📉 Regression Metrics:")
print("RMSE:", mean_squared_error(y_test_reg, y_pred_reg, squared=False))
print("R² Score:", r2_score(y_test_reg, y_pred_reg))


📉 Regression Metrics:
RMSE: 629.0607796329174
R² Score: 0.9921014186235214




📌 2. Logistic Regression (Price_Direction)

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Model
clf_model = LogisticRegression(max_iter=1000)
clf_model.fit(X_train, y_train_class)

# Predict
y_pred_class = clf_model.predict(X_test)

# Evaluation
print("✅ Classification Report (Price Direction):")
print(classification_report(y_test_class, y_pred_class))


✅ Classification Report (Price Direction):
              precision    recall  f1-score   support

           0       0.46      0.84      0.60       253
           1       0.61      0.21      0.31       311

    accuracy                           0.49       564
   macro avg       0.54      0.52      0.45       564
weighted avg       0.55      0.49      0.44       564



📌 3. Random Forest (BUY / SELL / HOLD)

In [60]:
from sklearn.ensemble import RandomForestClassifier

# Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train_signal)

# Predict
y_pred_signal = rf_model.predict(X_test)

# Evaluation
print("📊 Signal Classification Report:")
print(classification_report(y_test_signal, y_pred_signal))


📊 Signal Classification Report:
              precision    recall  f1-score   support

         BUY       0.00      0.00      0.00        69
        HOLD       0.76      0.99      0.86       430
        SELL       0.00      0.00      0.00        65

    accuracy                           0.76       564
   macro avg       0.25      0.33      0.29       564
weighted avg       0.58      0.76      0.66       564



In [61]:
import joblib

# Step 1: Create folder (only if not done)
import os
os.makedirs("models", exist_ok=True)

# Step 2: Save all 3 models
joblib.dump(lr_model, r"C:\Users\Nagesh Agrawal\OneDrive\Desktop\P-SENSEX_PROJECT\models\reg_model.pkl")      # Regression model
joblib.dump(clf_model, r"C:\Users\Nagesh Agrawal\OneDrive\Desktop\P-SENSEX_PROJECT\models\clf_model.pkl")     # Binary classifier
joblib.dump(rf_model, r"C:\Users\Nagesh Agrawal\OneDrive\Desktop\P-SENSEX_PROJECT\models\signal_model.pkl")   # Multi-class signal


['C:\\Users\\Nagesh Agrawal\\OneDrive\\Desktop\\P-SENSEX_PROJECT\\models\\signal_model.pkl']

In [62]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Next_Close',
       'Price_Direction', 'Return_%', 'Signal', 'SMA_5', 'SMA_10',
       'Price_Range', 'Daily_Change_%', 'Rolling_STD_5', 'Close/Open'],
      dtype='object')