# Data Preprocessing

This notebook handles the preprocessing of raw data for the Multidisciplinary Deepfake Detection product. It includes steps for loading the raw data, cleaning it, encoding categorical features, normalizing numerical features, and splitting the data into training and testing sets.

In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os
import logging

# Set up file logging.
# logging.basicConfig(filename=...) opens the log file right away and raises
# FileNotFoundError when the parent directory is missing, so make sure the
# directory exists before configuring the handler.
os.makedirs('../logs', exist_ok=True)
logging.basicConfig(filename='../logs/data_preprocessing.log', level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# Load the project configuration
from src.config import Config

# Input and output file locations, both built from directories in Config
raw_data_path = os.path.join(Config.RAW_DATA_DIR, 'sample_data.csv')
processed_data_path = os.path.join(Config.PROCESSED_DATA_DIR, 'processed_data.csv')

logging.info("Data preprocessing started.")

# Read the raw CSV into a DataFrame and record its shape
logging.info("Loading raw data from {}.".format(raw_data_path))
data = pd.read_csv(raw_data_path)
logging.info("Raw data loaded successfully with shape {}.".format(data.shape))

# Discard every row that contains at least one missing value
logging.info("Dropping missing values.")
data = data.dropna()
logging.info("Missing values dropped. Data shape is now {}.".format(data.shape))

# Integer-encode each object-dtype column in place, storing the fitted
# encoder for every such column in label_encoders (keyed by column name).
logging.info("Encoding categorical features.")
label_encoders = {}
object_columns = data.select_dtypes(include=['object']).columns
for column in object_columns:
    le = LabelEncoder()
    encoded_values = le.fit_transform(data[column])
    data[column] = encoded_values
    label_encoders[column] = le
logging.info("Categorical features encoded successfully.")

# To split data into training and testing sets
# The split is done BEFORE scaling so that the scaler's mean/variance are
# computed from training rows only; fitting on the full dataset would leak
# test-set statistics into the training features.
logging.info("Splitting data into training and testing sets.")
X = data.drop('label', axis=1)
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=Config.RANDOM_SEED)
# Take independent copies so the column assignments below modify only the
# split frames and do not trigger pandas chained-assignment warnings.
X_train = X_train.copy()
X_test = X_test.copy()
logging.info("Data split completed. Training data shape: {}, Testing data shape: {}.".format(X_train.shape, X_test.shape))

# To normalize numerical features
# Only feature columns are standardized. The 'label' target is deliberately
# excluded: scaling it would turn class labels into arbitrary floats.
# Note that `data` itself is left unscaled; use X_train / X_test downstream.
logging.info("Normalizing numerical features.")
scaler = StandardScaler()
numerical_features = X_train.select_dtypes(include=[np.number]).columns
# Fit on the training split, then apply the same transformation to the test split.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
logging.info("Numerical features normalized successfully.")

# To save processed data
# NOTE(review): only the training split (X_train + y_train) is written here;
# X_test / y_test are not persisted by this cell — confirm this is intended.
logging.info("Saving processed data to {}.".format(processed_data_path))
processed_data = pd.concat([X_train, y_train], axis=1)
# DataFrame.to_csv does not create missing directories, so create the target
# directory first. The dirname is empty when the path has no directory part,
# in which case there is nothing to create.
output_dir = os.path.dirname(processed_data_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
processed_data.to_csv(processed_data_path, index=False)
logging.info("Processed data saved successfully.")

logging.info("Data preprocessing completed.")

# To display first few rows of the processed data
processed_data.head()