In [17]:
import pandas as pd
df = pd.read_csv(r'C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\Data\merged_big.csv')
df.drop(columns=['Ticker_x', 'Ticker_y'], inplace=True)
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'SMA_14', 'EMA_14', 'RSI_14',
       'ATR_14', 'STOCH_K', 'STOCH_D', 'High_pct_change', 'Low_pct_change',
       'Close_pct_change', 'INDEX_FUTURES_Buy_Amount in Crore',
       'INDEX_FUTURES_Buy_No. of Contracts',
       'INDEX_FUTURES_OI_Amount in Crore', 'INDEX_FUTURES_OI_No. of Contracts',
       'INDEX_FUTURES_Sell_Amount in crore',
       'INDEX_FUTURES_Sell_No. of Contracts',
       'INDEX_OPTIONS_Buy_Amount in Crore',
       'INDEX_OPTIONS_Buy_No. of Contracts',
       'INDEX_OPTIONS_OI_Amount in Crore', 'INDEX_OPTIONS_OI_No. of Contracts',
       'INDEX_OPTIONS_Sell_Amount in crore',
       'INDEX_OPTIONS_Sell_No. of Contracts',
       'INTEREST_RATE_FUTURES_Buy_Amount in Crore',
       'INTEREST_RATE_FUTURES_Buy_No. of Contracts',
       'INTEREST_RATE_FUTURES_OI_Amount in Crore',
       'INTEREST_RATE_FUTURES_OI_No. of Contracts',
       'INTEREST_RATE_FUTURES_Sell_Amount in crore',
       'INTEREST_RATE_FUTURES_Sell_No. of Cont

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

df['target'] = (df['Close'] > df['Open']).astype(int)

feature_cols = df.columns.difference(['Date','Ticker_x','Ticker_y','Close','target'])
lagged_df = df[feature_cols].shift(1)
lagged_df['Open'] = df['Open']  
lagged_df['target'] = df['target']
lagged_df.dropna(inplace=True)

X = lagged_df.drop(columns=['target'])
X = X.select_dtypes(include=[np.number])
y = lagged_df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_pca, y_train)

model = SVC(kernel='rbf', C=1.0, gamma='scale')
model.fit(X_train_bal, y_train_bal)
y_pred = model.predict(X_test_pca)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Down','Up']))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

results_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred
}, index=y_test.index)
print(results_df)  

Classification Report:
              precision    recall  f1-score   support

        Down       0.59      0.62      0.61        58
          Up       0.53      0.50      0.52        50

    accuracy                           0.56       108
   macro avg       0.56      0.56      0.56       108
weighted avg       0.56      0.56      0.56       108

Confusion Matrix:
[[36 22]
 [25 25]]
     Actual  Predicted
623       0          0
135       1          0
251       1          1
111       1          0
144       0          1
..      ...        ...
632       1          0
27        1          0
186       0          0
138       1          0
381       0          0

[108 rows x 2 columns]


In [12]:
results_df

Unnamed: 0,Actual,Predicted
623,0,0
135,1,0
251,1,1
111,1,0
144,0,1
...,...,...
632,1,0
27,1,0
186,0,0
138,1,0


n-fold + PCA + SVM

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
df = pd.read_csv(r'C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\Data\merged_big.csv')
df.drop(columns=['Ticker_x', 'Ticker_y'], inplace=True)
df['target'] = (df['Close'] > (1.003*df['Open'])).astype(int) #add level diff 0.3 pct of nifty
feature_cols = df.columns.difference(['Date','Close','target'])
lagged_df = df[feature_cols].shift(1)
lagged_df['Open'] = df['Open']
lagged_df['target'] = df['target']
lagged_df['Date'] = df['Date']
lagged_df.dropna(inplace=True)

X = lagged_df.drop(columns=['target']).select_dtypes(include=[np.number])
y = lagged_df['target']

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC())
])

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 0.01, 0.1, 1],
    # 'svc__kernel': ['rbf']
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid.fit(X, y)

best_model = grid.best_estimator_
y_pred = best_model.predict(X)

print("Best Params:", grid.best_params_)
print(classification_report(y, y_pred, target_names=['0','1']))
print(confusion_matrix(y, y_pred))

results_df = pd.DataFrame({'Date': lagged_df['Date'].values,'Actual': y.values, 'Predicted': y_pred}, index=y.index)

Best Params: {'svc__C': 10, 'svc__degree': 3, 'svc__gamma': 1, 'svc__kernel': 'poly'}
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       392
           1       0.98      1.00      0.99       147

    accuracy                           0.99       539
   macro avg       0.99      1.00      0.99       539
weighted avg       0.99      0.99      0.99       539

[[389   3]
 [  0 147]]


In [23]:
results_df.to_csv(r'predictions.csv', index=False)

In [24]:
import joblib

joblib.dump(best_model, r"C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\model_svm.pkl")
print("Model saved successfully!")


Model saved successfully!


In [None]:
import joblib
import pandas as pd
import numpy as np

model = joblib.load(r"C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\model_svm.pkl")
print("Model loaded successfully!")
new_df = pd.read_csv(r'live_data.csv')
feature_cols = new_df.columns.difference(['Date','Close','target'])

lagged_live = new_df[feature_cols].shift(1)
lagged_live['Open'] = new_df['Open']
lagged_live.dropna(inplace=True)

X_live = lagged_live.select_dtypes(include=[np.number])

y_pred_live = model.predict(X_live)
print("Prediction:", y_pred_live)
results_live = pd.DataFrame({'Date': new_df['Date'].iloc[lagged_live.index].values,
                             'Predicted': y_pred_live},
                             index=lagged_live.index)
results_live.to_csv(r"C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\live_predictions.csv", index=False)
