
**Step 1: Web Scraping**



In [35]:
import time
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def _parse_constituent_rows(table, scraped_on):
    """Turn the ``<tr>`` rows of a ``tbldata14`` table into a list of record dicts.

    Rows with fewer than six ``<td>`` cells, or whose numeric cells cannot be
    converted, are skipped.
    """
    records = []
    # The first <tr> is skipped (presumably the header row).
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) < 6:
            continue
        try:
            records.append({
                'Stock Name': cols[0].text.strip(),
                'Last Price': float(cols[1].text.strip().replace(',', '')),
                'Change': float(cols[2].text.strip().replace(',', '').replace('+', '')),
                'Change(%)': float(cols[3].text.strip().replace('%', '').replace('+', '')),
                'Volume': int(cols[4].text.strip().replace(',', '')),
                'Value(Rs Lakhs)': float(cols[5].text.strip().replace(',', '')),
                'Scraped On': scraped_on
            })
        except ValueError:
            continue  # Skip rows whose cells are not parseable numbers
    return records


def scrape_nifty_50_combined():
    """Scrape the Nifty 50 constituents table from Moneycontrol.

    Three strategies are tried in order:

    1. Load the page in headless Chrome and look for
       ``<table class="tbldata14">`` in the main document.
    2. If it is not there, look for the same table inside each ``<iframe>``.
    3. If it is still missing, let ``pandas.read_html`` fetch the URL and
       return the first table that has a ``'Stock Name'`` column or at least
       six columns.

    Returns:
        pandas.DataFrame: When the table is found through Selenium the columns
        are 'Stock Name', 'Last Price', 'Change', 'Change(%)', 'Volume',
        'Value(Rs Lakhs)' and 'Scraped On'. When the ``read_html`` fallback is
        used, the frame keeps whatever columns ``read_html`` produced plus
        'Scraped On'. NOTE: the two paths therefore yield different column
        names; callers must cope with both.

    Raises:
        ValueError: If none of the strategies yields a usable table.
    """
    url = 'https://www.moneycontrol.com/stocks/marketstats/indexcomp.php?optex=NSE&opttopic=indexcomp&index=9'

    # Computed once so every row of one scrape carries the same date.
    scraped_on = datetime.now().strftime('%Y-%m-%d')

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=options)

    table = None
    try:
        driver.get(url)
        time.sleep(5)

        # First, try to find the table directly in the main document.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'tbldata14'))
            )
        except Exception:
            # Best effort: a timeout here just means we go on to the
            # iframe / read_html fallbacks. (Exception, not a bare except,
            # so Ctrl-C still interrupts the wait.)
            pass

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', {'class': 'tbldata14'})

        # Try iframes if not found
        if table is None:
            iframes = driver.find_elements(By.TAG_NAME, 'iframe')
            print(f"[Info] Table not found in main page. Found {len(iframes)} iframe(s). Trying each iframe...")
            for i, iframe in enumerate(iframes):
                try:
                    driver.switch_to.frame(iframe)
                    iframe_soup = BeautifulSoup(driver.page_source, 'html.parser')
                    table = iframe_soup.find('table', {'class': 'tbldata14'})
                except Exception as e:
                    print(f"[Warning] Could not switch to iframe {i}: {e}")
                finally:
                    # Always return to the top-level document so the next
                    # iframe is resolved from the right context, even after
                    # a failure inside this one.
                    driver.switch_to.default_content()
                if table is not None:
                    print(f"[Success] Found table in iframe index {i}.")
                    break
    finally:
        # Quit the browser on every path so no Chrome process is leaked.
        driver.quit()

    # Final fallback: try pandas.read_html
    if table is None:
        print("[Fallback] Trying pandas.read_html...")
        try:
            dfs = pd.read_html(url)
            for df in dfs:
                if 'Stock Name' in df.columns or df.shape[1] >= 6:
                    df['Scraped On'] = scraped_on
                    return df
            raise ValueError("Fallback succeeded but didn't find expected data structure.")
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise ValueError(f"All methods failed. {e}") from e

    # Parse the table found via Selenium/BeautifulSoup
    return pd.DataFrame(_parse_constituent_rows(table, scraped_on))

# Smoke-run the scraper and preview the first five rows
if __name__ == "__main__":
    df_nifty = scrape_nifty_50_combined()
    print(df_nifty.head(5))


[Info] Table not found in main page. Found 0 iframe(s). Trying each iframe...
[Fallback] Trying pandas.read_html...
                                 Company Name                        Industry  \
0  Adani EnterprisAdd toWatchlist | Portfolio                         Trading   
1      Adani PortsAdd toWatchlist | Portfolio        Transport Infrastructure   
2  Apollo HospitalAdd toWatchlist | Portfolio  Hospital & Healthcare Services   
3     Asian PaintsAdd toWatchlist | Portfolio                          Paints   
4        Axis BankAdd toWatchlist | Portfolio                  Bank - Private   

   Last Price     Chg  %Chg  Mkt Cap (Rs cr)  Scraped On  
0     2511.00   49.85  2.03        289814.78  2025-05-15  
1     1404.70   33.70  2.46        303434.72  2025-05-15  
2     7025.00  108.00  1.56        101008.72  2025-05-15  
3     2326.00   42.35  1.85        223109.41  2025-05-15  
4     1216.15   20.60  1.72        376805.02  2025-05-15  


**Step 2: Feature Engineering**

In [36]:
def _pick_column(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``."""
    for name in candidates:
        if name in df.columns:
            return name
    raise KeyError(f"None of the expected columns {list(candidates)} found in DataFrame")


def _gain_loss_label(change):
    """Label an absolute price change; missing values stay missing."""
    if pd.isna(change):
        return None
    if change > 0:
        return 'Gain'
    if change < 0:
        return 'Loss'
    return 'No Change'


def _trend_label(pct_change):
    """Label a percentage change; missing values stay missing."""
    if pd.isna(pct_change):
        return None
    if pct_change >= 2:
        return 'Strong Gain'
    if pct_change > 0:
        return 'Mild Gain'
    if pct_change == 0:
        return 'Flat'
    if pct_change > -2:
        return 'Mild Loss'
    return 'Strong Loss'


def feature_engineering(df):
    """Add derived categorical columns to a scraped Nifty 50 frame.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a 'Last Price' column, a 'Scraped On' column
            parseable by ``pd.to_datetime``, an absolute-change column named
            'Chg' or 'Change', and a percent-change column named '%Chg' or
            'Change(%)'. Both naming schemes are accepted because the scraper
            produces either, depending on which scraping path succeeded.

    Returns:
        The same DataFrame with four added columns:
        'Price Category' (Low / Mid / High / Very High, bucketed at
        500 / 1000 / 2000), 'Gain/Loss', 'Trend Signal' and 'Day of Week'.

    Raises:
        KeyError: If a required column is missing.
    """
    change_col = _pick_column(df, ('Chg', 'Change'))
    pct_col = _pick_column(df, ('%Chg', 'Change(%)'))

    # An open-ended top bin: using the data's own max as the last edge makes
    # pd.cut raise whenever that max is <= 2000 (bins must strictly increase).
    df['Price Category'] = pd.cut(df['Last Price'],
                                  bins=[0, 500, 1000, 2000, float('inf')],
                                  labels=['Low', 'Mid', 'High', 'Very High'])

    df['Gain/Loss'] = df[change_col].apply(_gain_loss_label)

    df['Trend Signal'] = df[pct_col].apply(_trend_label)

    df['Day of Week'] = pd.to_datetime(df['Scraped On']).dt.day_name()

    return df

In [37]:
def saving_file(history_path='nifty_50_data_history.csv'):
    """Scrape, enrich and append a fresh snapshot to the CSV history file.

    Runs ``scrape_nifty_50_combined`` and ``feature_engineering``, appends the
    result to any rows already stored in ``history_path``, and rewrites the
    file.

    Args:
        history_path: CSV file that accumulates snapshots. Defaults to
            'nifty_50_data_history.csv' in the current working directory.
    """
    df = scrape_nifty_50_combined()
    df = feature_engineering(df)

    # Load previous history, if any
    try:
        old_df = pd.read_csv(history_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No history yet, or a zero-byte file: start a fresh history
        # instead of crashing.
        old_df = None

    if old_df is not None:
        df = pd.concat([old_df, df], ignore_index=True)

    df.to_csv(history_path, index=False)
    print("✅ Enhanced data saved with feature engineering.")

# Run the full pipeline: scrape -> feature engineering -> append to CSV history
if __name__ == "__main__":
    saving_file()


[Info] Table not found in main page. Found 0 iframe(s). Trying each iframe...
[Fallback] Trying pandas.read_html...
✅ Enhanced data saved with feature engineering.


In [31]:

# Reload the persisted history from disk and preview its first five rows
df = pd.read_csv("nifty_50_data_history.csv")
print(df.head(5))


                                 Company Name                        Industry  \
0  Adani EnterprisAdd toWatchlist | Portfolio                         Trading   
1      Adani PortsAdd toWatchlist | Portfolio        Transport Infrastructure   
2  Apollo HospitalAdd toWatchlist | Portfolio  Hospital & Healthcare Services   
3     Asian PaintsAdd toWatchlist | Portfolio                          Paints   
4        Axis BankAdd toWatchlist | Portfolio                  Bank - Private   

   Last Price    Chg  %Chg  Mkt Cap (Rs cr)  Scraped On Price Category  \
0      2507.1  45.95  1.87        289364.65  2025-05-15      Very High   
1      1402.0  31.00  2.26        302851.48  2025-05-15           High   
2      7000.0  83.00  1.20        100649.26  2025-05-15      Very High   
3      2319.3  35.65  1.56        222466.74  2025-05-15      Very High   
4      1209.3  13.75  1.15        374682.66  2025-05-15           High   

  Gain/Loss Trend Signal Day of Week  
0      Gain    Mild Gain    T

In [32]:
# Show the current working directory (the relative CSV path used above resolves against it)
import os
print(os.getcwd())


/content


In [33]:
# List the entries of the current working directory (relies on `os` imported in an earlier cell)
os.listdir()


['.config', 'nifty_50_data_history.csv', 'sample_data']

In [34]:
# Google Colab only: `google.colab` is not importable in other Jupyter environments.
# NOTE(review): presumably this triggers a browser download of the CSV — confirm.
from google.colab import files
files.download('nifty_50_data_history.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>