In [47]:
import pandas as pd
df = pd.read_csv(r'C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\outsample.csv')
df['proxy_target'] = (df['Close'] > (1.003*df['Open'])).astype(int)
# df['proxy_target'] = (df['Close'] > (df['Open'])).astype(int)
df['proxy_target'].value_counts()

proxy_target
0    74
1    35
Name: count, dtype: int64

In [48]:
import pandas as pd
df = pd.read_csv(r'C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\outsample.csv')
df['target'] = (df['Close'] > (df['Open'])).astype(int)
df['target'].value_counts()

target
0    61
1    48
Name: count, dtype: int64

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
df = pd.read_csv(r'C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\insample.csv')
df.drop(columns=['Ticker_x', 'Ticker_y'], inplace=True)
df['target'] = (df['Close'] > (1.003*df['Open'])).astype(int) #add level diff 0.3 pct of nifty
feature_cols = df.columns.difference(['Date','Close','target'])
lagged_df = df[feature_cols].shift(1)
lagged_df['Open'] = df['Open']
lagged_df['target'] = df['target']
lagged_df['Date'] = df['Date']
lagged_df.dropna(inplace=True)

X = lagged_df.drop(columns=['target']).select_dtypes(include=[np.number])
y = lagged_df['target']

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC())
])

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 0.01, 0.1, 1],
    # 'svc__kernel': ['rbf']
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid.fit(X, y)

best_model = grid.best_estimator_
y_pred = best_model.predict(X)

print("Best Params:", grid.best_params_)
print(classification_report(y, y_pred, target_names=['0','1']))
print(confusion_matrix(y, y_pred))

results_df = pd.DataFrame({'Date': lagged_df['Date'].values,'Actual': y.values, 'Predicted': y_pred}, index=y.index)

Best Params: {'svc__C': 10, 'svc__degree': 2, 'svc__gamma': 1, 'svc__kernel': 'poly'}
              precision    recall  f1-score   support

           0       0.95      0.87      0.91       333
           1       0.70      0.86      0.77       114

    accuracy                           0.87       447
   macro avg       0.82      0.87      0.84       447
weighted avg       0.88      0.87      0.87       447

[[290  43]
 [ 16  98]]


In [11]:
import joblib

joblib.dump(best_model, r"C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\model_svm_insample.pkl")
print("Model saved successfully!")

Model saved successfully!


In [None]:
import joblib
import pandas as pd
import numpy as np

model = joblib.load(r"C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\model_svm_insample.pkl")
print("Model loaded successfully!")

new_df = pd.read_csv(r'C:\Users\parul.kakade\Documents\NIFTY_Prediction\6.SVM\outsample.csv')
# print("First 5 rows:\n", new_df.head())
new_df['Date'] = pd.to_datetime(new_df['Date'])


drop_cols = [c for c in ['Date','Close','target'] if c in new_df.columns]
feature_cols = new_df.columns.difference(drop_cols)

lagged_live = new_df[feature_cols].shift(1)
lagged_live['Open'] = new_df['Open']
lagged_live.dropna(inplace=True)

X_live = lagged_live.select_dtypes(include=[np.number])

y_pred_live = model.predict(X_live)

results_live = pd.DataFrame({
    'Date': new_df.loc[lagged_live.index, 'Date'].values,
    'Predicted': y_pred_live
}, index=lagged_live.index)

print(results_live.head())
print(f"Predictions generated for {results_live['Date'].min()} to {results_live['Date'].max()}")


Model loaded successfully!
        Date  Predicted
2 2025-01-03          0
3 2025-01-06          1
4 2025-01-07          0
5 2025-01-08          1
6 2025-01-09          0
✅ Predictions generated for 2025-01-03 00:00:00 to 2025-05-30 00:00:00


In [19]:
results_live.to_csv(r'outsample_pred.csv', index=False)

In [27]:
results_live['Predicted'].value_counts()

Predicted
0    61
1    29
Name: count, dtype: int64

In [None]:
results_live

In [44]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

new_df['proxy_target'] = (new_df['Close'] > (1.003*new_df['Open'])).astype(int)
# new_df['proxy_target'] = (new_df['Close'] > (new_df['Open'])).astype(int)
y_true_live = new_df.loc[lagged_live.index, 'proxy_target']

print("Accuracy:", accuracy_score(y_true_live, y_pred_live))
print("Confusion Matrix:\n", confusion_matrix(y_true_live, y_pred_live))
print("\nClassification Report:\n", classification_report(y_true_live, y_pred_live))


Accuracy: 0.5333333333333333
Confusion Matrix:
 [[39 20]
 [22  9]]

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.66      0.65        59
           1       0.31      0.29      0.30        31

    accuracy                           0.53        90
   macro avg       0.47      0.48      0.47        90
weighted avg       0.53      0.53      0.53        90



In [43]:
df = pd.concat([new_df['proxy_target'],results_live], join='outer',axis=1)

df

Unnamed: 0,proxy_target,Date,Predicted
0,1,NaT,
1,1,NaT,
2,0,2025-01-03,0.0
3,0,2025-01-06,1.0
4,1,2025-01-07,0.0
...,...,...,...
104,1,2025-05-26,0.0
105,0,NaT,
106,0,2025-05-28,0.0
107,1,2025-05-29,1.0
