In [88]:
import pandas as pd
from datacleaning import clean_cab_data


        


In [66]:
# Load the rideshare CSV; the path is relative to the notebook's working directory.
data=pd.read_csv("../Data/rideshare_kaggle.csv")

In [62]:
# Inspect the column labels of the raw frame before any cleaning.
data.columns

Index(['id', 'timestamp', 'hour', 'day', 'month', 'datetime', 'timezone',
       'source', 'destination', 'cab_type', 'product_id', 'name', 'price',
       'distance', 'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperat

In [None]:
def feature_engineering(raw_df:pd.DataFrame)-> pd.DataFrame:
    """Run ``clean_cab_data`` on ``raw_df`` and return its result as-is.

    No engineered columns are added in this function; it is a thin wrapper
    around the cleaning step.
    """
    return clean_cab_data(raw_df)

In [67]:
# Keep a reference to the raw 'datetime' column (taken before clean_cab_data is applied).
time=data['datetime']

In [89]:
# Run the project's cleaning step on the raw frame; the result is inspected in the cells below.
new_fine=clean_cab_data(data)

In [90]:
# Column labels after cleaning, for comparison with the raw frame's columns.
new_fine.columns

Index(['id', 'hour', 'day', 'month', 'datetime', 'timezone', 'source',
       'destination', 'cab_type', 'product_id', 'name', 'price', 'distance',
       'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax', 'app

In [102]:

# Per-column dtypes of the cleaned frame.
new_fine.dtypes

id                                     object
hour                                    int64
day                                     int64
month                                   int64
datetime                               object
timezone                               object
source                                 object
destination                            object
cab_type                               object
product_id                             object
name                                   object
price                                 float64
distance                              float64
surge_multiplier                      float64
latitude                              float64
longitude                             float64
temperature                           float64
apparentTemperature                   float64
short_summary                          object
long_summary                           object
precipIntensity                       float64
precipProbability                 

In [103]:
# Check the dtype of 'datetime' after cleaning; the `.dt` accessor requires a datetime-like dtype.
new_fine['datetime'].dtype

dtype('O')

In [40]:
# 'datetime' is object dtype after cleaning, so it must be parsed before `.dt` can be used
# (calling `.dt` on an object column raises AttributeError). Unparseable values become NaT.
hour = pd.to_datetime(new_fine['datetime'], errors='coerce').dt.hour

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datacleaning import clean_cab_data

# -------------------------------
# TIME FEATURES
# -------------------------------
def add_time_features(df):
    """Add hour, weekday, weekend, rush-hour, season and daytime columns.

    The ``datetime`` column is first normalised to a datetime64 dtype:

    * values that are already datetime64 are kept unchanged;
    * numeric values (or strings that all parse as numbers) are read as
      Unix epoch seconds;
    * any other text is parsed as date/time strings.

    Values that cannot be parsed become ``NaT``; for those rows ``hour``,
    ``day_of_week`` and ``season`` are NaN and the 0/1 flags are 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``datetime`` converted and the columns ``hour``
        (overwritten if present), ``day_of_week``, ``is_weekend``,
        ``rush_hour``, ``season`` and ``is_daytime`` set.
    """
    df = df.copy()

    stamps = df['datetime']
    if not pd.api.types.is_datetime64_any_dtype(stamps):
        # Only use unit='s' when the values really are numbers. Passing a unit
        # for text timestamps makes pandas treat the strings as numbers, which
        # (depending on the pandas version) coerces every row to NaT.
        as_number = pd.to_numeric(stamps, errors='coerce')
        if as_number.notna().sum() == stamps.notna().sum():
            stamps = pd.to_datetime(as_number, unit='s', errors='coerce')
        else:
            stamps = pd.to_datetime(stamps, errors='coerce')
    df['datetime'] = stamps

    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek  # Monday=0 ... Sunday=6
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)
    # Dec-Feb -> 1 (winter), Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
    df['season'] = df['datetime'].dt.month % 12 // 3 + 1
    df['is_daytime'] = ((df['hour'] >= 6) & (df['hour'] <= 18)).astype(int)
    return df

# -------------------------------
# LOCATION FEATURES
# -------------------------------
def add_location_features(df):
    """Add integer-coded versions of the ``source`` and ``destination`` columns.

    Each column is cast to ``str`` and encoded with its own freshly fitted
    ``LabelEncoder``; the codes are stored in ``source_encoded`` and
    ``destination_encoded``. The encoders are not kept, and the input frame is
    left untouched (a copy is returned).
    """
    encoded = df.copy()
    for column in ('source', 'destination'):
        as_text = encoded[column].astype(str)
        encoded[column + "_encoded"] = LabelEncoder().fit_transform(as_text)
    return encoded

    

# -------------------------------
# CAB / PRODUCT FEATURES
# -------------------------------
def add_cab_features(df):
    """Add cab-type, surge and product-group features to a copy of ``df``.

    Columns written:

    * ``cab_type_encoded`` - label-encoded ``cab_type`` (cast to ``str``).
    * ``surge_flag`` - 1 where ``surge_multiplier`` is greater than 1, else 0.
    * ``product_group`` - "Premium" when the product ``name`` contains the
      substring "Black" (case-sensitive), otherwise "Standard".
    * ``product_group_encoded`` - label-encoded ``product_group``.
    """
    def _group_for(product_name):
        # Anything with "Black" in its name is treated as a premium product.
        if "Black" in str(product_name):
            return "Premium"
        return "Standard"

    out = df.copy()
    encoder = LabelEncoder()

    cab_labels = out['cab_type'].astype(str)
    out['cab_type_encoded'] = encoder.fit_transform(cab_labels)

    out['surge_flag'] = out['surge_multiplier'].gt(1).astype(int)

    out['product_group'] = out['name'].apply(_group_for)
    # The same encoder object is simply re-fitted on the new column.
    group_labels = out['product_group'].astype(str)
    out['product_group_encoded'] = encoder.fit_transform(group_labels)
    return out


# -------------------------------
# PRICE FEATURES
# -------------------------------
def add_price_features(df):
    """Add ``price_per_km`` and ``log_price`` when price and distance exist.

    Works on a copy of ``df``. If either ``price`` or ``distance`` is missing
    the copy is returned without new columns. A distance of 0 is treated as
    missing so the ratio becomes NaN instead of infinity.
    """
    result = df.copy()
    if any(needed not in result.columns for needed in ('price', 'distance')):
        return result

    fare = result['price']
    usable_distance = result['distance'].replace(0, np.nan)
    result['price_per_km'] = fare / usable_distance
    result['log_price'] = np.log1p(fare)  # log(1 + price)
    return result



# -------------------------------
# WEATHER FEATURES
# -------------------------------
def add_weather_features(df):
    """Add simple weather-derived features to a copy of ``df``.

    Every feature is optional and only created when its input column exists:

    * ``feels_like`` - ``apparentTemperature - temperature``.
    * ``precip_flag`` - 1 where ``precipIntensity`` is above 0.
    * ``wind_stress`` - ``windSpeed`` squared.
    * ``visibility_flag`` - 1 where ``visibility`` is below 5.
    * ``is_rain`` / ``is_clear`` / ``is_cloudy`` - 1 where the text summary
      contains "rain" / "clear" / "cloud" (case-insensitive).

    The text summary is read from the first of ``summary``,
    ``short_summary`` or ``long_summary`` that is present. Rows with a
    missing summary get 0 for all three text flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the applicable feature columns added.
    """
    df = df.copy()
    if 'temperature' in df.columns and 'apparentTemperature' in df.columns:
        df['feels_like'] = df['apparentTemperature'] - df['temperature']
    if 'precipIntensity' in df.columns:
        df['precip_flag'] = (df['precipIntensity'] > 0).astype(int)
    if 'windSpeed' in df.columns:
        df['wind_stress'] = df['windSpeed']**2
    if 'visibility' in df.columns:
        df['visibility_flag'] = (df['visibility'] < 5).astype(int)

    # The frames shown in this notebook carry 'short_summary'/'long_summary'
    # rather than 'summary'; 'summary' is still checked first so that frames
    # that do have it behave as before.
    summary_col = next(
        (name for name in ('summary', 'short_summary', 'long_summary')
         if name in df.columns),
        None,
    )
    if summary_col is not None:
        text = df[summary_col]
        # na=False: a missing summary counts as "no match" rather than
        # producing NaN, which cannot be cast to int.
        df['is_rain'] = text.str.contains("rain", case=False, na=False).astype(int)
        df['is_clear'] = text.str.contains("clear", case=False, na=False).astype(int)
        df['is_cloudy'] = text.str.contains("cloud", case=False, na=False).astype(int)
    return df


# -------------------------------
# SUN / MOON FEATURES
# -------------------------------
def add_sun_moon_features(df):
    """Add ``is_daytime`` and ``moon_brightness`` to a copy of ``df``.

    * ``hour`` is derived from ``datetime`` (which must then be a datetime64
      column) only when no ``hour`` column exists.
    * ``is_daytime`` is 1 for hours 6 through 18 inclusive, else 0.
    * ``moon_brightness`` is computed from ``moonPhase`` when that column is
      present, as ``(1 - cos(2*pi*phase)) / 2``. This yields 0 at phase 0,
      1 at phase 0.5 and 0 again at phase 1.
      NOTE(review): this assumes ``moonPhase`` is a lunation fraction with
      0 = new moon and 0.5 = full moon - confirm against the data source.
      Without a ``moonPhase`` column the feature is NaN; the earlier random
      placeholder made the output differ on every run.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the columns described above.
    """
    df = df.copy()
    if 'hour' not in df.columns:
        df['hour'] = df['datetime'].dt.hour
    df['is_daytime'] = ((df['hour'] >= 6) & (df['hour'] <= 18)).astype(int)

    if 'moonPhase' in df.columns:
        phase = pd.to_numeric(df['moonPhase'], errors='coerce')
        df['moon_brightness'] = (1 - np.cos(2 * np.pi * phase)) / 2
    else:
        # Make the gap explicit instead of feeding random noise downstream.
        df['moon_brightness'] = np.nan
    return df


# -------------------------------
# MASTER PIPELINE
# -------------------------------
def engineer_features(raw_data:pd.DataFrame)-> pd.DataFrame:
    """Clean the raw data, then apply every feature step in a fixed order.

    Order: time -> location -> cab/product -> price -> weather -> sun/moon.
    Each step receives the previous step's output and returns a new frame.
    """
    features = clean_cab_data(raw_data)
    steps = (
        add_time_features,
        add_location_features,
        add_cab_features,
        add_price_features,
        add_weather_features,
        add_sun_moon_features,
    )
    for step in steps:
        features = step(features)
    return features




In [124]:
# Run the full pipeline on the raw frame and list the resulting column labels.
engineer_features(data).columns

Index(['id', 'hour', 'day', 'month', 'datetime', 'timezone', 'source',
       'destination', 'cab_type', 'product_id', 'name', 'price', 'distance',
       'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax', 'app

In [None]:
#======================
# Original
#======================
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datacleaning import clean_cab_data





# -------------------------------
# TIME FEATURES
# -------------------------------
def add_time_features(df):
    """Add hour, weekday, weekend, rush-hour, season and daytime columns.

    ``datetime`` is converted to a datetime64 dtype before the ``.dt``
    accessor is used, because ``.dt`` raises ``AttributeError`` on an
    object-dtype column. Conversion rules:

    * datetime64 input is left unchanged;
    * numeric values (or strings that all parse as numbers) are read as
      Unix epoch seconds;
    * other text is parsed as date/time strings.

    Unparseable values become ``NaT``; those rows get NaN for ``hour``,
    ``day_of_week`` and ``season`` and 0 for the 0/1 flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the time feature columns set.
    """
    df = df.copy()

    when = df['datetime']
    if not pd.api.types.is_datetime64_any_dtype(when):
        numeric = pd.to_numeric(when, errors='coerce')
        every_value_numeric = numeric.notna().sum() == when.notna().sum()
        if every_value_numeric:
            when = pd.to_datetime(numeric, unit='s', errors='coerce')
        else:
            when = pd.to_datetime(when, errors='coerce')
        df['datetime'] = when

    df['hour'] = when.dt.hour
    df['day_of_week'] = when.dt.dayofweek  # Monday=0 ... Sunday=6
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)
    # Dec-Feb -> 1 (winter), Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
    df['season'] = when.dt.month % 12 // 3 + 1
    df['is_daytime'] = ((df['hour'] >= 6) & (df['hour'] <= 18)).astype(int)
    return df


# -------------------------------
# LOCATION FEATURES
# -------------------------------
def add_location_features(df):
    """Label-encode ``source`` and ``destination`` on a copy of ``df``.

    For each of the two columns the values are cast to ``str``, a new
    ``LabelEncoder`` is fitted on them, and the integer codes are written to
    ``<column>_encoded``. The fitted encoders are discarded.
    """
    frame = df.copy()
    for place_col in ('source', 'destination'):
        codes = LabelEncoder().fit_transform(frame[place_col].astype(str))
        frame[place_col + "_encoded"] = codes
    return frame

    


# -------------------------------
# CAB / PRODUCT FEATURES
# -------------------------------
def add_cab_features(df):
    """Add cab-type, surge and product-group columns to a copy of ``df``.

    * ``cab_type_encoded`` - label-encoded ``cab_type`` (cast to ``str``).
    * ``surge_flag`` - 1 where ``surge_multiplier`` exceeds 1, else 0.
    * ``product_group`` - "Premium" if the product ``name`` contains the
      substring "Black" (case-sensitive), otherwise "Standard".
    """
    def _tier(product_name):
        return "Premium" if "Black" in str(product_name) else "Standard"

    out = df.copy()
    cab_labels = out['cab_type'].astype(str)
    out['cab_type_encoded'] = LabelEncoder().fit_transform(cab_labels)
    out['surge_flag'] = out['surge_multiplier'].gt(1).astype(int)
    out['product_group'] = out['name'].apply(_tier)
    return out


# -------------------------------
# PRICE FEATURES
# -------------------------------
def add_price_features(df):
    """Derive ``price_per_km`` and ``log_price`` on a copy of ``df``.

    Both columns are only added when ``price`` and ``distance`` are present.
    Zero distances are replaced by NaN before dividing, so those rows get a
    NaN ratio. ``log_price`` is ``log(1 + price)``.
    """
    enriched = df.copy()
    if not {'price', 'distance'} <= set(enriched.columns):
        # Nothing to derive without both inputs.
        return enriched

    nonzero_distance = enriched['distance'].replace(0, np.nan)
    enriched['price_per_km'] = enriched['price'].div(nonzero_distance)
    enriched['log_price'] = np.log1p(enriched['price'])
    return enriched


# -------------------------------
# WEATHER FEATURES
# -------------------------------
def add_weather_features(df):
    """Add simple weather-derived features to a copy of ``df``.

    Each feature is created only when its input column exists:

    * ``feels_like`` - ``apparentTemperature - temperature``.
    * ``precip_flag`` - 1 where ``precipIntensity`` is above 0.
    * ``wind_stress`` - ``windSpeed`` squared.
    * ``visibility_flag`` - 1 where ``visibility`` is below 5.
    * ``is_rain`` / ``is_clear`` / ``is_cloudy`` - 1 where the text summary
      contains "rain" / "clear" / "cloud", ignoring case.

    The summary text comes from ``summary`` if present, otherwise from
    ``short_summary``, otherwise from ``long_summary``. A missing summary
    value yields 0 for the three text flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the applicable feature columns added.
    """
    df = df.copy()
    if 'temperature' in df.columns and 'apparentTemperature' in df.columns:
        df['feels_like'] = df['apparentTemperature'] - df['temperature']
    if 'precipIntensity' in df.columns:
        df['precip_flag'] = (df['precipIntensity'] > 0).astype(int)
    if 'windSpeed' in df.columns:
        df['wind_stress'] = df['windSpeed']**2
    if 'visibility' in df.columns:
        df['visibility_flag'] = (df['visibility'] < 5).astype(int)

    # The column lists printed in this notebook contain 'short_summary' and
    # 'long_summary' but no 'summary', so checking only 'summary' meant the
    # text flags were never produced.
    for candidate in ('summary', 'short_summary', 'long_summary'):
        if candidate in df.columns:
            text = df[candidate]
            for flag, keyword in (('is_rain', "rain"),
                                  ('is_clear', "clear"),
                                  ('is_cloudy', "cloud")):
                # na=False keeps missing summaries from becoming NaN, which
                # cannot be cast to int.
                df[flag] = text.str.contains(keyword, case=False, na=False).astype(int)
            break
    return df


# -------------------------------
# SUN / MOON FEATURES
# -------------------------------
def add_sun_moon_features(df):
    """Add ``is_daytime`` and ``moon_brightness`` to a copy of ``df``.

    * ``hour`` is taken from ``datetime`` (which must then be a datetime64
      column) only if the frame has no ``hour`` column.
    * ``is_daytime`` is 1 for hours 6 through 18 inclusive, else 0.
    * ``moon_brightness`` is ``(1 - cos(2*pi*moonPhase)) / 2`` when a
      ``moonPhase`` column exists (0 at phase 0, 1 at phase 0.5), and NaN
      otherwise.
      NOTE(review): assumes ``moonPhase`` is a lunation fraction where
      0 = new moon and 0.5 = full moon - confirm against the data source.

    The previous placeholder filled this column with fresh random numbers on
    every call, so repeated runs produced different feature values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the columns described above.
    """
    df = df.copy()
    if 'hour' not in df.columns:
        df['hour'] = df['datetime'].dt.hour
    df['is_daytime'] = ((df['hour'] >= 6) & (df['hour'] <= 18)).astype(int)

    if 'moonPhase' not in df.columns:
        # No phase data: mark the feature as missing rather than inventing it.
        df['moon_brightness'] = np.nan
        return df

    phase = pd.to_numeric(df['moonPhase'], errors='coerce')
    df['moon_brightness'] = (1 - np.cos(2 * np.pi * phase)) / 2
    return df


# -------------------------------
# MASTER PIPELINE
# -------------------------------
def engineer_features(raw_data:pd.DataFrame)-> pd.DataFrame:
    """Clean ``raw_data`` and run the feature steps one after another.

    Steps, in order: time, location, cab/product, price, weather, sun/moon.
    """
    cleaned = clean_cab_data(raw_data)
    with_time = add_time_features(cleaned)
    with_location = add_location_features(with_time)
    with_cab = add_cab_features(with_location)
    with_price = add_price_features(with_cab)
    with_weather = add_weather_features(with_price)
    return add_sun_moon_features(with_weather)
