In [2]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv(r"C:\My stuff\Coding\ML project\KiranveerSingh_projectfinal\Dataset\NIFTY-5.csv")

# Parse the 'Date' column
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Sort by company and date
df = df.sort_values(by=['Symbol', 'Date']).reset_index(drop=True)

# Drop exact duplicate rows (by Symbol and Date)
df = df.drop_duplicates(subset=['Symbol', 'Date'])

# Define required columns for cleaning
critical_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

# Remove rows with missing values in critical columns
df = df.dropna(subset=critical_cols)

# Convert numeric columns, handle non-numeric issues
for col in critical_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Remove rows where price or volume columns are negative or zero
for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
    df = df[df[col] > 0]

# Forward-fill, then backward-fill missing values per company (if any)
df = df.groupby('Symbol', group_keys=False).apply(lambda g: g.ffill().bfill())



  df = df.groupby('Symbol', group_keys=False).apply(lambda g: g.ffill().bfill())


In [3]:
# --- Target column creation ---
# Create the 'Will_Grow' column: 1 if next day's close is higher, else 0
df['Next_Close'] = df.groupby('Symbol')['Close'].shift(-1)
df['Will_Grow'] = (df['Next_Close'] > df['Close']).astype(int)

# Drop the helper column
df = df.drop(columns=['Next_Close'])

# Remove the last row of each symbol (no future day to compare)
df = df.groupby('Symbol', group_keys=False).apply(lambda g: g.iloc[:-1]).reset_index(drop=True)

# Save cleaned and labeled data
df.to_csv(r"C:\My stuff\Coding\ML project\KiranveerSingh_projectfinal\Dataset\cleaned_nifty5_labeled.csv", index=False)

print("Saved cleaned and labeled dataset as 'cleaned_nifty5_labeled.csv'. Top rows:")
print(df[['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume', 'Will_Grow']].head(10))

Saved cleaned and labeled dataset as 'cleaned_nifty5_labeled.csv'. Top rows:
        Date      Symbol    Open    High     Low   Close     Volume  Will_Grow
0 2015-01-01  ADANIPORTS  319.00  322.50  316.25  319.55  1456204.0          0
1 2015-01-04  ADANIPORTS  308.45  312.05  306.10  310.95  1026200.0          1
2 2015-01-06  ADANIPORTS  325.80  325.80  308.25  311.55  3889157.0          0
3 2015-01-07  ADANIPORTS  308.80  312.70  306.50  311.30  2854143.0          1
4 2015-01-09  ADANIPORTS  344.95  354.75  339.00  342.25  3654461.0          0
5 2015-01-10  ADANIPORTS  302.95  307.10  298.50  305.05  4450444.0          0
6 2015-01-12  ADANIPORTS  269.00  269.35  261.10  263.25  5104457.0          1
7 2015-02-01  ADANIPORTS  319.35  325.80  318.05  319.35  2894058.0          1
8 2015-02-02  ADANIPORTS  339.50  343.90  333.00  335.90  1735161.0          0
9 2015-02-03  ADANIPORTS  334.50  335.60  325.60  330.35  2592113.0          0


  df = df.groupby('Symbol', group_keys=False).apply(lambda g: g.iloc[:-1]).reset_index(drop=True)
