In [4]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the dataset again
df = pd.read_csv('C:/Users/user/Desktop/DA/data/features.csv')

# Handling missing values (Only for numeric columns)
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Convert categorical variables to numerical (One-hot encoding)
df = pd.get_dummies(df, drop_first=True)

# Feature scaling (Normalize the data)
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df)

# Convert the scaled data back to a DataFrame with proper column names
df_scaled_df = pd.DataFrame(df_scaled, columns=df.columns)

# Ensure the output directory exists
output_dir = 'outputs/processed_data/'
os.makedirs(output_dir, exist_ok=True)

# Save processed data
df_scaled_df.to_csv(os.path.join(output_dir, 'processed_features.csv'), index=False)

# Display the processed data
df_scaled_df.head()



Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Date_2013-05-24,Date_2013-05-31,Date_2013-06-07,Date_2013-06-14,Date_2013-06-21,Date_2013-06-28,Date_2013-07-05,Date_2013-07-12,Date_2013-07-19,Date_2013-07-26
0,0.0,0.454046,0.0501,0.092613,0.034833,0.012958,0.048799,0.005595,0.826259,0.416032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.41926,0.038076,0.092613,0.034833,0.012958,0.048799,0.005595,0.827676,0.416032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.432259,0.021042,0.092613,0.034833,0.012958,0.048799,0.005595,0.828132,0.416032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.493592,0.044589,0.092613,0.034833,0.012958,0.048799,0.005595,0.828429,0.416032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.492402,0.076653,0.092613,0.034833,0.012958,0.048799,0.005595,0.828725,0.416032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Define the dataset path
data_path = 'data/features.csv'

# Check if the file exists, and create a placeholder if it doesn't
if not os.path.exists(data_path):
    print(f"File not found: {data_path}. Creating a placeholder file.")
    os.makedirs('data', exist_ok=True)
    placeholder_data = pd.DataFrame({
        'Store': [1, 2, 3],
        'Dept': [1, 2, 3],
        'Date': ['2023-01-01', '2023-01-02', '2023-01-03'],
        'Weekly_Sales': [1000, 1500, 2000],
        'CPI': [211.1, 211.4, 211.7],
        'Unemployment': [8.1, 8.2, 8.3],
        'MarkDown1': [None, 500, None],
        'MarkDown2': [100, None, 200]
    })
    placeholder_data.to_csv(data_path, index=False)
    print(f"Placeholder file created at {data_path}")

# Load the dataset
print(f"Loading dataset from {data_path}...")
df = pd.read_csv(data_path)

# Handling missing values for numeric columns
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Handling missing values for categorical columns (if any)
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df[col] = df[col].fillna('Unknown')  # Replace missing categorical values with 'Unknown'

# Convert categorical variables to numerical (One-hot encoding)
df = pd.get_dummies(df, drop_first=True)

# Feature scaling (Normalize the data)
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df)

# Convert the scaled data back to a DataFrame with proper column names
df_scaled_df = pd.DataFrame(df_scaled, columns=df.columns)

# Ensure the output directory exists
output_dir = 'outputs/processed_data/'
os.makedirs(output_dir, exist_ok=True)

# Save processed data
output_file = os.path.join(output_dir, 'processed_features.csv')
df_scaled_df.to_csv(output_file, index=False)

# Display the first few rows of the processed data
print("Processed Data Preview:")
print(df_scaled_df.head())

# Validation
print("\nMin and Max values after scaling:")
print(df_scaled_df.describe().loc[['min', 'max']])




File not found: data/features.csv. Creating a placeholder file.
Placeholder file created at data/features.csv
Loading dataset from data/features.csv...
Processed Data Preview:
   Store  Dept  Weekly_Sales  CPI  Unemployment  MarkDown1  MarkDown2  \
0    0.0   0.0           0.0  0.0           0.0        0.0        0.0   
1    0.5   0.5           0.5  0.5           0.5        0.0        0.5   
2    1.0   1.0           1.0  1.0           1.0        0.0        1.0   

   Date_2023-01-02  Date_2023-01-03  
0              0.0              0.0  
1              1.0              0.0  
2              0.0              1.0  

Min and Max values after scaling:
     Store  Dept  Weekly_Sales  CPI  Unemployment  MarkDown1  MarkDown2  \
min    0.0   0.0           0.0  0.0           0.0        0.0        0.0   
max    1.0   1.0           1.0  1.0           1.0        0.0        1.0   

     Date_2023-01-02  Date_2023-01-03  
min              0.0              0.0  
max              1.0              1.0 