# 3. Pre-Processing <a id="data_wrangling"></a>

<a id="contents"></a>
# Table of Contents  
3.1. [Introduction](#introduction) <br>
3.2. [Imports](#imports)  <br>
3.3. [Data Processing](#process)<br>
3.4. [Data Splitting](#split)<br>
3.5. [Save Updated Data](#save)

## 3.1 Introduction<a id="introduction"></a>

The goal of this notebook is to create a cleaned development dataset to be used to complete the modeling step of my project.

## 3.2 Imports<a id="imports"></a>

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/Users/heatheradler/Documents/GitHub/Springboard/Springboard_Projects/Capstone 3/df_eda.csv')
print("Dataset loaded.")

Dataset loaded.


In [3]:
# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])
df.reset_index(drop=True, inplace=True)
df.drop(columns=['Unnamed: 0'], inplace=True)
df

Unnamed: 0,SourceDataset,RegionID,SizeRank,RegionName,StateName,Date,Value,Quarter
0,ZORI,394913,1,"New York, NY",NY,2015-01-31,2367.192976,2015Q1
1,ZORI,394913,1,"New York, NY",NY,2015-02-28,2382.571737,2015Q1
2,ZORI,394913,1,"New York, NY",NY,2015-03-31,2401.539081,2015Q1
3,ZORI,394425,50,"Buffalo, NY",NY,2015-01-31,805.691732,2015Q1
4,ZORI,394425,50,"Buffalo, NY",NY,2015-02-28,819.385346,2015Q1
...,...,...,...,...,...,...,...,...
9877,ZORDI,394326,607,"Amsterdam, NY",NY,2024-05-31,138.000000,2024Q2
9878,ZORDI,394504,629,"Cortland, NY",NY,2024-04-30,34.000000,2024Q2
9879,ZORDI,394504,629,"Cortland, NY",NY,2024-05-31,31.000000,2024Q2
9880,ZORDI,395084,784,"Seneca Falls, NY",NY,2024-04-30,65.000000,2024Q2


In [4]:
# Extract useful features from the Date column
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter

# Drop the original Date column
df = df.drop(columns=['Date'])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9882 entries, 0 to 9881
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SourceDataset  9882 non-null   object 
 1   RegionID       9882 non-null   int64  
 2   SizeRank       9882 non-null   int64  
 3   RegionName     9882 non-null   object 
 4   StateName      9882 non-null   object 
 5   Value          9882 non-null   float64
 6   Quarter        9882 non-null   int32  
 7   Year           9882 non-null   int32  
 8   Month          9882 non-null   int32  
dtypes: float64(1), int32(3), int64(2), object(3)
memory usage: 579.2+ KB


## 3.3 Data Pre-processing<a id="process"></a>

In [6]:
# Define the target variable column and feature columns
target_column = 'Value'
feature_columns = df.columns.difference([target_column, 'SourceDataset'])

In [7]:
# Identify numeric and categorical columns
numeric_features = df[feature_columns].select_dtypes(include=['int64', 'float64']).columns
categorical_features = df[feature_columns].select_dtypes(include=['object']).columns

In [8]:
# Preprocessing for numerical data
numeric_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [9]:
# Create preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Preprocess data
def preprocess_data(df, pipeline, target_column):
    try:
        # Separate features and target
        X = df[feature_columns]
        y = df[target_column]
        
        # Preprocess the features
        print("Preprocessing features...")
        X_preprocessed = pipeline.fit_transform(X)
        
        # Get feature names after preprocessing
        feature_names = numeric_features.tolist() + pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()
        
        # Debug: Print the feature names and their count
        print(f"Feature names: {feature_names}")
        print(f"Number of feature names: {len(feature_names)}")
        
        # Check the shape of the transformed features and feature names
        print(f"Shape of transformed features: {X_preprocessed.shape}")
        
        # Convert the preprocessed features back to DataFrame
        X_preprocessed_df = pd.DataFrame(X_preprocessed.toarray(), columns=feature_names)
        
        print(f"Preprocessing completed.")
        
        return X_preprocessed_df, y
    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None

## 3.4 Data Splitting

In [10]:
# Split data
def split_data(X, y):
    try:
        print("Splitting the data...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        print("Data splitting completed.")
        return X_train, X_test, y_train, y_test
    except Exception as e:
        print(f"An error occurred during data splitting: {e}")
        return None, None, None, None

## 3.5 Save Data

In [11]:
# Save data
def save_data(X_train, X_test, y_train, y_test):
    try:
        print("Saving the split data...")
        X_train.to_csv('X_train_combined.csv', index=False)
        X_test.to_csv('X_test_combined.csv', index=False)
        y_train.to_csv('y_train_combined.csv', index=False)
        y_test.to_csv('y_test_combined.csv', index=False)
        print("Data saved successfully.")
    except Exception as e:
        print(f"An error occurred during data saving: {e}")

# Preprocess the data
X_preprocessed_df, y = preprocess_data(df, pipeline, target_column)

# Ensure the preprocessing was successful before splitting the data
if X_preprocessed_df is not None and y is not None:
    # Split the data
    X_train, X_test, y_train, y_test = split_data(X_preprocessed_df, y)
    
    # Ensure the data splitting was successful before saving the data
    if X_train is not None and X_test is not None and y_train is not None and y_test is not None:
        # Save the split data
        save_data(X_train, X_test, y_train, y_test)
    else:
        print("Data splitting failed, skipping data saving.")
else:
    print("Preprocessing failed, skipping data splitting and saving.")

Preprocessing features...
Feature names: ['RegionID', 'SizeRank', 'RegionName_Albany, NY', 'RegionName_Amsterdam, NY', 'RegionName_Auburn, NY', 'RegionName_Batavia, NY', 'RegionName_Binghamton, NY', 'RegionName_Buffalo, NY', 'RegionName_Corning, NY', 'RegionName_Cortland, NY', 'RegionName_Elmira, NY', 'RegionName_Glens Falls, NY', 'RegionName_Gloversville, NY', 'RegionName_Hudson, NY', 'RegionName_Ithaca, NY', 'RegionName_Jamestown, NY', 'RegionName_Kingston, NY', 'RegionName_Malone, NY', 'RegionName_New York, NY', 'RegionName_Ogdensburg, NY', 'RegionName_Olean, NY', 'RegionName_Oneonta, NY', 'RegionName_Plattsburgh, NY', 'RegionName_Poughkeepsie, NY', 'RegionName_Rochester, NY', 'RegionName_Seneca Falls, NY', 'RegionName_Syracuse, NY', 'RegionName_Utica, NY', 'RegionName_Watertown, NY', 'StateName_NY']
Number of feature names: 30
Shape of transformed features: (9882, 30)
Preprocessing completed.
Splitting the data...
Data splitting completed.
Saving the split data...
Data saved succes