In [43]:
pip install yfinance

Note: you may need to restart the kernel to use updated packages.


In [49]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [54]:
class DataPipeline:
    def __init__(self):
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.original_data = None     # Backup of raw data
        self.removed_rows = None      # Track duplicates if needed

    # Load data
    def load_data(self, file_path):
        df = pd.read_csv(file_path)
        self.original_data = df.copy()   # Save raw copy
        print(f"Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
        return df

    # Clean data
    def clean_data(self, df, fill_strategy="mean"):
        # Remove duplicates
        before_rows = df.shape[0]
        df = df.drop_duplicates()
        after_rows = df.shape[0]
        if before_rows != after_rows:
            print(f"{before_rows - after_rows} duplicate rows removed.")

        # Fill numeric columns
        numeric_cols = df.select_dtypes(include=["number"]).columns
        for col in numeric_cols:
            if df[col].isnull().sum() > 0:
                if fill_strategy == "mean":
                    df[col].fillna(df[col].mean())
                elif fill_strategy == "median":
                    df[col].fillna(df[col].median())

        # Fill categorical columns
        cat_cols = df.select_dtypes(include=["object"]).columns
        for col in cat_cols:
            if df[col].isnull().sum() > 0:
                df[col].fillna("Unknown")

        return df

    # Encode categorical columns
    def encode_data(self, df):
        cat_cols = df.select_dtypes(include=["object"]).columns
        for col in cat_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            self.label_encoders[col] = le
        return df

    # Scale numeric columns (ignore Date)
    def scale_data(self, df):
        numeric_cols = df.select_dtypes(include=["number"]).columns.drop("Date", errors="ignore")
        df[numeric_cols] = self.scaler.fit_transform(df[numeric_cols])
        return df

    # Run complete pipeline
    def run(self, file_path, save_output=True):
        df = self.load_data(file_path)
        df = self.clean_data(df)
        df = self.encode_data(df)
        df = self.scale_data(df)

        if save_output:
            self.original_data.to_csv("original_data.csv", index=False)
            df.to_csv("processed_data.csv", index=False)
            print("📂 Saved 'original_data.csv' and 'processed_data.csv'")

        return df


In [55]:

import yfinance as yf

df = yf.download("RELIANCE.NS", start="2025-01-01", end="2025-01-03")
df.reset_index(inplace=True)  

df.to_csv("reliance_stock.csv", index=False)

pipeline = DataPipeline()
processed_df = pipeline.run("reliance_stock.csv")

print(processed_df.head())


  df = yf.download("RELIANCE.NS", start="2025-01-01", end="2025-01-03")
[*********************100%***********************]  1 of 1 completed

Data loaded: 3 rows, 6 columns
📂 Saved 'original_data.csv' and 'processed_data.csv'
   Date     Close      High       Low      Open    Volume
0     2  1.224745  1.224745  1.224745  1.224745  1.224745
1     0 -1.224745 -1.224745 -1.224745 -1.224745  0.000000
2     1  0.000000  0.000000  0.000000  0.000000 -1.224745



