In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [None]:
# Read the processed dataset written by the previous notebook
data = pd.read_csv(filepath_or_buffer='../data/processed/processed_data.csv')

# Sanity check: show the first five rows to confirm the file loaded correctly
print(data.head(n=5))


In [None]:
# Report how many values are missing in each column before imputation
print(data.isnull().sum())

# Fill missing numerical values with the mean of their column.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a
# non-numeric (e.g. string/categorical) column.
data = data.fillna(data.mean(numeric_only=True))

# Report the remaining missing values per column. Numeric columns should now
# show 0; non-numeric columns are not imputed here and may still contain NaNs.
print(data.isnull().sum())


In [None]:
# Encoder that maps each distinct label to an integer code
label_encoder = LabelEncoder()

# Example: replace a categorical column with its integer codes
# ('category_column' is a placeholder - substitute the actual column name)
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; for input features an ordinal or one-hot encoding may be a better fit.
encoded_categories = label_encoder.fit_transform(data['category_column'])
data['category_column'] = encoded_categories

# Alternative for several categorical columns: one-hot encode them with pd.get_dummies
# data = pd.get_dummies(data, columns=['category_column1', 'category_column2'])


In [None]:
# Example: create a new feature as the difference between two columns
# Replace 'column1' and 'column2' with actual column names
data['new_feature'] = data['column1'] - data['column2']

# Another example: create a lag feature for time series data, i.e. the value
# of the column in the previous row
# Replace 'target_column' with the actual column name
data['lag_feature'] = data['target_column'].shift(1)

# shift(1) leaves the first row without a lag value. Drop only the rows whose
# lag feature is missing; a bare dropna() would also discard every row that
# has a NaN in any other column.
data = data.dropna(subset=['lag_feature'])


In [None]:
# Columns to standardize (placeholders - replace with your actual column names)
numerical_columns = ['feature1', 'feature2', 'feature3']

# Standardize the selected columns and write the result back into the frame
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling - consider fitting
# on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_columns])
data[numerical_columns] = scaled_values


In [None]:
# Separate the 'target' column (the value to predict) from the predictors
y = data['target']
X = data.drop(columns=['target'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Optional: persist every split to its own CSV file, without the row index
split_outputs = [
    (X_train, '../data/processed/X_train.csv'),
    (X_test, '../data/processed/X_test.csv'),
    (y_train, '../data/processed/y_train.csv'),
    (y_test, '../data/processed/y_test.csv'),
]
for split, csv_path in split_outputs:
    split.to_csv(csv_path, index=False)


In [None]:
# When no train/test split is needed, write the whole processed frame to disk
# (row index omitted)
data.to_csv(path_or_buf='../data/processed/processed_data_final.csv', index=False)
