In [2]:
import pandas as pd 
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
train.info()     # column names + data types + missing values count
train.shape      # number of rows and columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


(1458644, 11)

In [7]:
# -------------------------------
# STEP 1: Import & Audit the Data
# -------------------------------

import pandas as pd
import numpy as np
import os
os.chdir(r"C:\New folder7")  # change to folder where files are
print("Now in:", os.getcwd())

# 1) Load files (adjust paths if needed)
train_path = "train.csv"   # or r"C:\path\to\train.csv"
test_path  = "test.csv"    # or r"C:\path\to\test.csv"

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

print("Shapes -> train:", train.shape, " | test:", test.shape)

# 2) Quick peeks
print("\n=== TRAIN HEAD ===")
display(train.head(10))
print("\n=== TEST  HEAD ===")
display(test.head(10))

# 3) Column comparison (what’s only in train vs only in test)
train_cols = set(train.columns)
test_cols  = set(test.columns)
print("\nColumns only in TRAIN:", sorted(list(train_cols - test_cols)))
print("Columns only in TEST :", sorted(list(test_cols - train_cols)))

# 4) Dtypes & non-null counts
print("\n=== TRAIN INFO ===")
display(train.dtypes)
print("\nMissing values per column (TRAIN):")
display(train.isna().sum().sort_values(ascending=False))

print("\n=== TEST  INFO ===")
display(test.dtypes)
print("\nMissing values per column (TEST):")
display(test.isna().sum().sort_values(ascending=False))

# 5) Try to parse common datetime columns if present
dt_candidates = [c for c in train.columns if "time" in c.lower() or "date" in c.lower()]
for c in dt_candidates:
    try:
        train[c] = pd.to_datetime(train[c], errors="coerce", utc=True)
    except Exception: 
        pass
dt_candidates_test = [c for c in test.columns if "time" in c.lower() or "date" in c.lower()]
for c in dt_candidates_test:
    try:
        test[c] = pd.to_datetime(test[c], errors="coerce", utc=True)
    except Exception:
        pass

print("\nDatetime-like columns found (TRAIN):", dt_candidates)
print("Datetime-like columns found (TEST) :", dt_candidates_test)

# 6) Basic statistics (numerical columns)
print("\n=== TRAIN describe() (numeric) ===")
display(train.describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]).T)

# 7) Sanity checks for coordinates and duration (if present)
def has_any(cols, names):
    return any(name in cols for name in names)

lat_names  = ["pickup_latitude","dropoff_latitude","lat","latitude"]
lon_names  = ["pickup_longitude","dropoff_longitude","lon","longitude","lng"]

# Helper to find first matching column by candidates
def find_col(cols, candidates):
    for name in candidates:
        if name in cols:
            return name
    return None

train_cols_list = list(train.columns)
test_cols_list  = list(test.columns)

# Try to detect coordinate columns
pickup_lat  = find_col(train_cols_list, ["pickup_latitude","start_lat","lat_pickup","pickup_lat"])
pickup_lon  = find_col(train_cols_list, ["pickup_longitude","start_lon","lng_pickup","pickup_lon"])
dropoff_lat = find_col(train_cols_list, ["dropoff_latitude","end_lat","lat_dropoff","dropoff_lat"])
dropoff_lon = find_col(train_cols_list, ["dropoff_longitude","end_lon","lng_dropoff","dropoff_lon"])

coord_cols = [c for c in [pickup_lat,pickup_lon,dropoff_lat,dropoff_lon] if c is not None]
if coord_cols:
    # Valid ranges: lat in [-90, 90], lon in [-180, 180]
    def invalid_lat(s): return (~s.between(-90, 90)) | s.isna()
    def invalid_lon(s): return (~s.between(-180, 180)) | s.isna()

    invalid_counts = {}
    if pickup_lat:  invalid_counts[pickup_lat]  = invalid_lat(train[pickup_lat]).sum()
    if pickup_lon:  invalid_counts[pickup_lon]  = invalid_lon(train[pickup_lon]).sum()
    if dropoff_lat: invalid_counts[dropoff_lat] = invalid_lat(train[dropoff_lat]).sum()
    if dropoff_lon: invalid_counts[dropoff_lon] = invalid_lon(train[dropoff_lon]).sum()

    print("\nInvalid coordinate counts (TRAIN):", invalid_counts)

# 8) Target existence and quick checks
# Common target names for trip time
possible_targets = ["trip_duration","duration","travel_time","period_of_trip","trip_time_min","trip_time_sec"]
target_col = find_col(train_cols_list, possible_targets)

print("\nDetected target column:", target_col)

if target_col:
    # Negative/zero durations, extreme outliers
    neg_or_zero = (train[target_col] <= 0).sum()
    print(f"Non-positive durations in TRAIN ({target_col}):", neg_or_zero)

    # Basic outlier flagging by percentile
    q1 = train[target_col].quantile(0.25)
    q3 = train[target_col].quantile(0.75)
    iqr = q3 - q1
    hi = q3 + 3*iqr
    lo = max(q1 - 3*iqr, 0)
    outlier_hi = (train[target_col] > hi).sum()
    outlier_lo = (train[target_col] < lo).sum()
    print(f"Potential high outliers > {hi:.2f}: {outlier_hi}")
    print(f"Potential low  outliers < {lo:.2f}: {outlier_lo}")

# 9) Duplicate checks (by id if present, else full row)
id_col = find_col(train_cols_list, ["id","trip_id","ride_id"])
if id_col:
    dup_ids_train = train[id_col].duplicated().sum()
    print(f"\nDuplicate IDs in TRAIN ({id_col}):", dup_ids_train)
    if id_col in test.columns:
        dup_ids_test = test[id_col].duplicated().sum()
        print(f"Duplicate IDs in TEST  ({id_col}):", dup_ids_test)
else:
    print("\nNo obvious ID column found — skipping ID-duplicate check.")
    print("Row-level duplicates (TRAIN):", train.duplicated().sum())
    print("Row-level duplicates (TEST) :", test.duplicated().sum())

# 10) Quick null heatmap hint (optional visualization later):
# You can visualize with: train.isna().mean().sort_values(ascending=False).head(20)
print("\nTop 20 columns by missing ratio (TRAIN):")
display((train.isna().mean().sort_values(ascending=False).head(20) * 100).round(2).astype(str) + "%")

print("\nTop 20 columns by missing ratio (TEST):")
display((test.isna().mean().sort_values(ascending=False).head(20) * 100).round(2).astype(str) + "%")

print("\n✅ Step 1 audit complete. Review outputs above, then proceed to Step 2 (Feature Engineering).")




Now in: C:\New folder7
Shapes -> train: (1458644, 11)  | test: (625134, 9)

=== TRAIN HEAD ===


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,443
6,id1813257,1,2016-06-17 22:34:59,2016-06-17 22:40:40,4,-73.969017,40.757839,-73.957405,40.765896,N,341
7,id1324603,2,2016-05-21 07:54:58,2016-05-21 08:20:49,1,-73.969276,40.797779,-73.92247,40.760559,N,1551
8,id1301050,1,2016-05-27 23:12:23,2016-05-27 23:16:38,1,-73.999481,40.7384,-73.985786,40.732815,N,255
9,id0012891,2,2016-03-10 21:45:01,2016-03-10 22:05:26,1,-73.981049,40.744339,-73.973,40.789989,N,1225



=== TEST  HEAD ===


Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N
5,id0668992,1,2016-06-30 23:59:30,1,-73.991302,40.749798,-73.980515,40.786549,N
6,id1765014,1,2016-06-30 23:59:15,1,-73.97831,40.74155,-73.952072,40.717003,N
7,id0898117,1,2016-06-30 23:59:09,2,-74.012711,40.701527,-73.986481,40.719509,N
8,id3905224,2,2016-06-30 23:58:55,2,-73.992332,40.730511,-73.875618,40.875214,N
9,id1543102,2,2016-06-30 23:58:46,1,-73.993179,40.74876,-73.979309,40.761311,N



Columns only in TRAIN: ['dropoff_datetime', 'trip_duration']
Columns only in TEST : []

=== TRAIN INFO ===


id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object


Missing values per column (TRAIN):


id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64


=== TEST  INFO ===


id                     object
vendor_id               int64
pickup_datetime        object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
dtype: object


Missing values per column (TEST):


id                    0
vendor_id             0
pickup_datetime       0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
dtype: int64


Datetime-like columns found (TRAIN): ['pickup_datetime', 'dropoff_datetime']
Datetime-like columns found (TEST) : ['pickup_datetime']

=== TRAIN describe() (numeric) ===


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
vendor_id,1458644.0,1.53495,0.498777,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0
passenger_count,1458644.0,1.66453,1.314242,0.0,1.0,1.0,1.0,1.0,2.0,5.0,6.0,9.0
pickup_longitude,1458644.0,-73.973486,0.070902,-121.933342,-74.014317,-74.006866,-73.991867,-73.981743,-73.967331,-73.891582,-73.782227,-61.33553
pickup_latitude,1458644.0,40.750921,0.032881,34.359695,40.644825,40.708141,40.737347,40.754101,40.76836,40.788387,40.806599,51.88108
dropoff_longitude,1458644.0,-73.973416,0.070643,-121.933304,-74.015274,-74.00753,-73.991325,-73.979752,-73.963013,-73.920181,-73.790482,-61.33553
dropoff_latitude,1458644.0,40.7518,0.035891,32.181141,40.645271,40.699921,40.735885,40.754524,40.76981,40.797508,40.83675,43.92103
trip_duration,1458644.0,959.492273,5237.431724,1.0,87.0,180.0,397.0,662.0,1075.0,2104.0,3440.0,3526282.0



Invalid coordinate counts (TRAIN): {'pickup_latitude': np.int64(0), 'pickup_longitude': np.int64(0), 'dropoff_latitude': np.int64(0), 'dropoff_longitude': np.int64(0)}

Detected target column: trip_duration
Non-positive durations in TRAIN (trip_duration): 0
Potential high outliers > 3109.00: 20964
Potential low  outliers < 0.00: 0

Duplicate IDs in TRAIN (id): 0
Duplicate IDs in TEST  (id): 0

Top 20 columns by missing ratio (TRAIN):


id                    0.0%
vendor_id             0.0%
pickup_datetime       0.0%
dropoff_datetime      0.0%
passenger_count       0.0%
pickup_longitude      0.0%
pickup_latitude       0.0%
dropoff_longitude     0.0%
dropoff_latitude      0.0%
store_and_fwd_flag    0.0%
trip_duration         0.0%
dtype: object


Top 20 columns by missing ratio (TEST):


id                    0.0%
vendor_id             0.0%
pickup_datetime       0.0%
passenger_count       0.0%
pickup_longitude      0.0%
pickup_latitude       0.0%
dropoff_longitude     0.0%
dropoff_latitude      0.0%
store_and_fwd_flag    0.0%
dtype: object


✅ Step 1 audit complete. Review outputs above, then proceed to Step 2 (Feature Engineering).


In [8]:
# --------------------------------------
# STEP 2: Feature Engineering (Trip Time)
# --------------------------------------

import pandas as pd
import numpy as np

# ---------- CONFIG ----------
train_path = "train.csv"
test_path  = "test.csv"

# Candidate names so code adapts to your schema
CAND_ID          = ["id","trip_id","ride_id","row_id"]
CAND_TARGET      = ["trip_duration","duration","travel_time","period_of_trip",
                    "trip_time_min","trip_time_sec"]
CAND_PICKUP_LAT  = ["pickup_latitude","start_lat","lat_pickup","pickup_lat"]
CAND_PICKUP_LON  = ["pickup_longitude","start_lon","lng_pickup","pickup_lon","pickup_longitude"]
CAND_DROPOFF_LAT = ["dropoff_latitude","end_lat","lat_dropoff","dropoff_lat"]
CAND_DROPOFF_LON = ["dropoff_longitude","end_lon","lng_dropoff","dropoff_lon","dropoff_longitude"]
CAND_DT          = ["pickup_datetime","start_time","start_datetime","datetime","timestamp","trip_start_time"]

# ---------- HELPERS ----------
def find_col(cols, candidates):
    cols_lower = {c.lower(): c for c in cols}
    for name in candidates:
        if name.lower() in cols_lower:
            return cols_lower[name.lower()]
    return None

def to_datetime_col(df, candidates):
    c = find_col(df.columns, candidates)
    if c is None:
        return None
    df[c] = pd.to_datetime(df[c], errors="coerce", utc=True)
    return c

def haversine_km(lat1, lon1, lat2, lon2):
    # All args in degrees
    R = 6371.0088  # mean Earth radius in km
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlmb = np.radians(lon2 - lon1)
    a = np.sin(dphi/2)**2 + np.cos(phi1)*np.cos(phi2)*np.sin(dlmb/2)**2
    return 2*R*np.arcsin(np.sqrt(a))

def bearing_deg(lat1, lon1, lat2, lon2):
    # Forward azimuth (0-360)
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dlmb = np.radians(lon2 - lon1)
    y = np.sin(dlmb) * np.cos(phi2)
    x = np.cos(phi1)*np.sin(phi2) - np.sin(phi1)*np.cos(phi2)*np.cos(dlmb)
    brng = np.degrees(np.arctan2(y, x))
    return (brng + 360) % 360

def clip_series(s, lo=None, hi=None):
    if lo is not None: s = s.clip(lower=lo)
    if hi is not None: s = s.clip(upper=hi)
    return s

def build_time_features(df, dt_col):
    # hour, weekday, month, weekend + cyclical encodings
    df["hour"]       = df[dt_col].dt.hour
    df["weekday"]    = df[dt_col].dt.weekday   # 0=Mon
    df["month"]      = df[dt_col].dt.month
    df["is_weekend"] = df["weekday"].isin([5,6]).astype(int)

    # Cyclic transforms
    df["hour_sin"]   = np.sin(2*np.pi*df["hour"]/24.0)
    df["hour_cos"]   = np.cos(2*np.pi*df["hour"]/24.0)
    df["wday_sin"]   = np.sin(2*np.pi*df["weekday"]/7.0)
    df["wday_cos"]   = np.cos(2*np.pi*df["weekday"]/7.0)
    df["month_sin"]  = np.sin(2*np.pi*df["month"]/12.0)
    df["month_cos"]  = np.cos(2*np.pi*df["month"]/12.0)

def build_distance_features(df, p_lat, p_lon, d_lat, d_lon):
    # Straight line distance (Haversine)
    df["haversine_km"] = haversine_km(df[p_lat], df[p_lon], df[d_lat], df[d_lon])
    # Manhattan-like proxy on sphere: project deltas
    # (not exact road distance, but useful signal)
    df["lat_diff"] = (df[d_lat] - df[p_lat]).abs()
    df["lon_diff"] = (df[d_lon] - df[p_lon]).abs()
    # Scale lon_diff roughly by latitude to reflect km scale at that latitude
    mean_lat_rad = np.radians((df[p_lat] + df[d_lat]) / 2.0)
    km_per_deg_lat = 110.574
    km_per_deg_lon = 111.320 * np.cos(mean_lat_rad)
    man_lat_km = df["lat_diff"] * km_per_deg_lat
    man_lon_km = df["lon_diff"] * km_per_deg_lon
    df["manhattan_km"] = man_lat_km + man_lon_km
    # Bearing (direction)
    df["bearing_deg"] = bearing_deg(df[p_lat], df[p_lon], df[d_lat], df[d_lon])

def basic_quality_flags(df, p_lat, p_lon, d_lat, d_lon):
    # Valid geographic ranges
    df["flag_invalid_pickup"]  = (~df[p_lat].between(-90, 90)) | (~df[p_lon].between(-180, 180))
    df["flag_invalid_dropoff"] = (~df[d_lat].between(-90, 90)) | (~df[d_lon].between(-180, 180))
    df["flag_zero_distance"]   = (df["haversine_km"] == 0).astype(int)

# ---------- LOAD ----------
train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

# ---------- IDENTIFY KEY COLUMNS ----------
id_col      = find_col(train.columns, CAND_ID) or find_col(test.columns, CAND_ID)
target_col  = find_col(train.columns, CAND_TARGET)
pickup_lat  = find_col(train.columns, CAND_PICKUP_LAT)
pickup_lon  = find_col(train.columns, CAND_PICKUP_LON)
dropoff_lat = find_col(train.columns, CAND_DROPOFF_LAT)
dropoff_lon = find_col(train.columns, CAND_DROPOFF_LON)

# Parse a datetime column (if present)
dt_col_train = to_datetime_col(train, CAND_DT)
dt_col_test  = to_datetime_col(test,  CAND_DT)

print("Detected -> id:", id_col, "| target:", target_col, 
      "| pickup_lat:", pickup_lat, "| pickup_lon:", pickup_lon, 
      "| dropoff_lat:", dropoff_lat, "| dropoff_lon:", dropoff_lon, 
      "| datetime(train/test):", dt_col_train, dt_col_test)

# ---------- FEATURE ENGINEERING ----------
# 1) Time features (if datetime exists)
if dt_col_train: build_time_features(train, dt_col_train)
if dt_col_test:  build_time_features(test,  dt_col_test)

# 2) Distance/geospatial features (if coords exist)
if all([pickup_lat, pickup_lon, dropoff_lat, dropoff_lon]):
    build_distance_features(train, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    build_distance_features(test,  pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    basic_quality_flags(train, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    basic_quality_flags(test,  pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
else:
    # If no coords, try to use any existing 'distance_km' column as-is
    if "distance_km" not in train.columns:
        print("⚠️ No coordinate columns found; distance features skipped.")

# 3) Safe target prep (train only)
if target_col:
    # Convert minutes to seconds if name suggests, else leave; just ensure numeric
    train[target_col] = pd.to_numeric(train[target_col], errors="coerce")

    # Remove obviously invalid target rows (train only)
    # Keep a copy of raw target; also prepare log1p target for models that prefer it
    train["target_raw"] = train[target_col]

    # Non-positive durations are unusable
    mask_valid_t = train[target_col] > 0
    removed_nonpos = (~mask_valid_t).sum()
    train = train[mask_valid_t].copy()

    # Clip *extreme* outliers on the high side to stabilize models
    hi = train[target_col].quantile(0.995)  # 99.5th percentile
    train["target_clipped"] = clip_series(train[target_col], lo=None, hi=hi)
    train["target_log1p"]   = np.log1p(train["target_clipped"])

    print(f"Removed non-positive durations: {removed_nonpos}")
    print(f"Clipped high tail at 99.5%: {hi:.2f}")
else:
    print("⚠️ No target column detected in TRAIN — expected in train only; continuing.")

# 4) Minimal categorical handling (example: vendor_id if exists)
for cat_col in ["vendor_id","route_type","service_level"]:
    if cat_col in train.columns:
        train[cat_col] = train[cat_col].astype("category")
    if cat_col in test.columns:
        test[cat_col] = test[cat_col].astype("category")

# 5) Final column selection for modeling
#    Keep: ids (if exist), engineered features, and targets (train only)
keep_features = [
    # IDs
    id_col if id_col in (train.columns if id_col else []) else None,
    # Time
    "hour","weekday","month","is_weekend","hour_sin","hour_cos","wday_sin","wday_cos","month_sin","month_cos",
    # Geo
    "haversine_km","manhattan_km","bearing_deg","lat_diff","lon_diff",
    # Quality flags
    "flag_invalid_pickup","flag_invalid_dropoff","flag_zero_distance",
]
keep_features = [c for c in keep_features if c and c in train.columns]

# Add any categorical columns we coerced
for cat_col in ["vendor_id","route_type","service_level"]:
    if cat_col in train.columns:
        keep_features.append(cat_col)

# Add targets for train
train_targets = []
if "target_log1p" in train.columns: train_targets.append("target_log1p")
if target_col and target_col in train.columns: train_targets.append(target_col)

# Build final frames
train_fe = train[[c for c in keep_features if c in train.columns] + train_targets].copy()
test_fe_cols = [c for c in keep_features if c in test.columns]
test_fe  = test[test_fe_cols].copy()

# If ID exists but was not in keep_features, add it for merging/submission
if id_col and id_col in train.columns and id_col not in train_fe.columns:
    train_fe[id_col] = train[id_col]
if id_col and id_col in test.columns and id_col not in test_fe.columns:
    test_fe[id_col] = test[id_col]

# Reorder columns (ID first if present)
def reorder_with_id_first(df, id_col_name):
    if id_col_name and id_col_name in df.columns:
        cols = [id_col_name] + [c for c in df.columns if c != id_col_name]
        return df[cols]
    return df

train_fe = reorder_with_id_first(train_fe, id_col)
test_fe  = reorder_with_id_first(test_fe,  id_col)

# Save engineered datasets
train_fe_path = "train_fe.csv"
test_fe_path  = "test_fe.csv"
train_fe.to_csv(train_fe_path, index=False)
test_fe.to_csv(test_fe_path,  index=False)

print("✅ Feature engineering complete.")
print("Saved:", train_fe_path, "->", train_fe.shape, "|", test_fe_path, "->", test_fe.shape)

# Optional: quick preview
display(train_fe.head(10))
display(test_fe.head(10))


Detected -> id: id | target: trip_duration | pickup_lat: pickup_latitude | pickup_lon: pickup_longitude | dropoff_lat: dropoff_latitude | dropoff_lon: dropoff_longitude | datetime(train/test): pickup_datetime pickup_datetime
Removed non-positive durations: 0
Clipped high tail at 99.5%: 4139.00
✅ Feature engineering complete.
Saved: train_fe.csv -> (1458644, 22) | test_fe.csv -> (625134, 20)


Unnamed: 0,id,hour,weekday,month,is_weekend,hour_sin,hour_cos,wday_sin,wday_cos,month_sin,...,manhattan_km,bearing_deg,lat_diff,lon_diff,flag_invalid_pickup,flag_invalid_dropoff,flag_zero_distance,vendor_id,target_log1p,trip_duration
0,id2875421,17,0,3,0,-0.965926,-0.258819,0.0,1.0,1.0,...,1.73567,99.970196,0.002335,0.017525,False,False,0,2,6.122493,455
1,id2377394,0,6,6,1,0.0,1.0,-0.781831,0.62349,1.224647e-16,...,2.4278,242.846232,0.007412,0.019066,False,False,0,1,6.498282,663
2,id3858529,11,1,1,0,0.258819,-0.965926,0.781831,0.62349,0.5,...,8.173527,200.319835,0.053852,0.026306,False,False,0,2,7.661527,2124
3,id3504673,19,2,4,0,-0.965926,0.258819,0.974928,-0.222521,0.8660254,...,1.653332,187.2623,0.013252,0.002228,False,False,0,2,6.063785,429
4,id2181028,13,5,3,1,-0.258819,-0.965926,-0.974928,-0.222521,1.0,...,1.192833,179.473585,0.010689,0.00013,False,False,0,2,6.077642,435
5,id0801584,22,5,1,1,-0.5,0.866025,-0.974928,-0.222521,0.5,...,1.550674,315.004404,0.006989,0.009224,False,False,0,2,6.095825,443
6,id1813257,22,4,6,0,-0.5,0.866025,-0.433884,-0.900969,1.224647e-16,...,1.86994,47.505775,0.008057,0.011612,False,False,0,1,5.834811,341
7,id1324603,7,5,5,1,0.965926,-0.258819,-0.974928,-0.222521,0.5,...,8.061111,136.385396,0.03722,0.046806,False,False,0,2,7.3473,1551
8,id1301050,23,4,5,0,-0.258819,0.965926,-0.433884,-0.900969,0.5,...,1.772683,118.284067,0.005585,0.013695,False,False,0,1,5.545177,255
9,id0012891,21,3,3,0,-0.707107,0.707107,0.433884,-0.900969,1.0,...,5.726371,7.603538,0.04565,0.008049,False,False,0,2,7.111512,1225


Unnamed: 0,id,hour,weekday,month,is_weekend,hour_sin,hour_cos,wday_sin,wday_cos,month_sin,month_cos,haversine_km,manhattan_km,bearing_deg,lat_diff,lon_diff,flag_invalid_pickup,flag_invalid_dropoff,flag_zero_distance,vendor_id
0,id3004672,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,2.74643,2.898159,356.404776,0.024651,0.002045,False,False,0,1
1,id3505355,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,2.759243,3.090022,172.278835,0.02459,0.004395,False,False,0,1
2,id1217141,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,1.306157,1.842462,133.326248,0.00806,0.011276,False,False,0,1
3,id2150126,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,5.269095,7.141294,209.043167,0.041431,0.030357,False,False,0,2
4,id1598245,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,0.960843,1.351552,130.260381,0.005585,0.008705,False,False,0,1
5,id0668992,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,4.186278,4.973206,12.530495,0.036751,0.010788,False,False,0,1
6,id1765014,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,3.512619,4.927677,140.985013,0.024548,0.026237,False,False,0,1
7,id0898117,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,2.980958,4.201728,47.864234,0.017982,0.02623,False,False,0,1
8,id3905224,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,18.852197,25.835332,31.368065,0.144703,0.116714,False,False,0,2
9,id1543102,23,3,6,0,-0.258819,0.965926,0.433884,-0.900969,1.224647e-16,-1.0,1.820017,2.557361,39.930618,0.01255,0.01387,False,False,0,2


In [14]:
# ==========================================================
# STEP 3 — Modeling, Evaluation, Submission (final build)
# XGBoost via native API (no sklearn callbacks issues)
# Compatible with: xgboost 3.x, scikit-learn 1.6.x
# ==========================================================
import os, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# --------------------------
# (A) Settings & Paths
# --------------------------
BASE = os.getcwd()  # change to absolute path if needed, e.g. r"C:\New folder7"
train_fe_path = os.path.join(BASE, "train_fe.csv")
test_fe_path  = os.path.join(BASE, "test_fe.csv")
sample_path   = os.path.join(BASE, "sample_submission.csv")

FAST_DEBUG   = False# set to False for best accuracy on full data
N_ROWS_DEBUG = 200_000   # rows to sample in fast mode

print("CWD:", os.getcwd())
print("Expecting:", train_fe_path, "and", test_fe_path)

# --------------------------
# (B) Load engineered data
# --------------------------
train = pd.read_csv(train_fe_path)
test  = pd.read_csv(test_fe_path)
print("Loaded -> train:", train.shape, "| test:", test.shape)

# ----------------------------------
# (C) Detect ID and target columns
# ----------------------------------
CAND_ID     = ["id","trip_id","ride_id","row_id"]
CAND_TARGET = ["target_log1p","trip_duration","duration","travel_time",
               "period_of_trip","trip_time_min","trip_time_sec"]

def find_col(cols, candidates):
    low = {c.lower(): c for c in cols}
    for name in candidates:
        if name.lower() in low:
            return low[name.lower()]
    return None

id_col     = find_col(train.columns, CAND_ID) or find_col(test.columns, CAND_ID)
target_col = "target_log1p" if "target_log1p" in train.columns else find_col(train.columns, CAND_TARGET)
if target_col is None:
    raise ValueError("No target column found (expected 'target_log1p' or a duration column).")

use_log_target = (target_col == "target_log1p")
print(f"Detected -> id: {id_col} | target: {target_col} | use_log_target={use_log_target}")

# ---------------------------------------------------
# (D) Feature set (drop targets and non-feature cols)
# ---------------------------------------------------
drop_cols = {target_col, "target_raw", "target_clipped"}
if id_col: drop_cols.add(id_col)

feature_cols = [c for c in train.columns if c not in drop_cols and c in test.columns]
if not feature_cols:
    raise ValueError("No common feature columns between train and test after dropping targets/ID.")

# Optional sampling for speed
if FAST_DEBUG and len(train) > N_ROWS_DEBUG:
    train = train.sample(N_ROWS_DEBUG, random_state=42).reset_index(drop=True)
    print(f"FAST_DEBUG: sampled {len(train):,} rows")

X      = train[feature_cols].copy()
X_test = test[feature_cols].copy()
y      = train[target_col].values

# Dtype coercion: object -> numeric or category
for col in X.select_dtypes(include="object").columns:
    try:
        X[col]      = pd.to_numeric(X[col])
        X_test[col] = pd.to_numeric(X_test[col])
    except Exception:
        X[col]      = X[col].astype("category")
        X_test[col] = X_test[col].astype("category")

from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
print("Split -> X_tr:", X_tr.shape, " | X_val:", X_val.shape)

# ------------------------------------------------
# (E) Metrics helper (MAE/RMSE in original units)
# ------------------------------------------------
from sklearn.metrics import mean_absolute_error, mean_squared_error

def eval_scores(y_true_log, y_pred_log, use_log=True):
    if use_log:
        y_true = np.expm1(y_true_log)
        y_pred = np.expm1(y_pred_log)
    else:
        y_true = y_true_log
        y_pred = y_pred_log
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5

    return {"MAE": mae, "RMSE": rmse}

scores = {}
models  = {}

# --------------------------------------
# (F1) Random Forest — baseline
# --------------------------------------
from sklearn.ensemble import RandomForestRegressor
t0 = time.time()
rf = RandomForestRegressor(
    n_estimators=150 if FAST_DEBUG else 300,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_tr, y_tr)
rf_pred_val = rf.predict(X_val)
scores["RandomForest"] = eval_scores(y_val, rf_pred_val, use_log=use_log_target)
models["RandomForest"] = rf
print(f"RF: {scores['RandomForest']} | time={time.time()-t0:.1f}s")

# --------------------------------------
# (F2) LightGBM — gradient boosting
# --------------------------------------
try:
    import lightgbm as lgb
except Exception:
    print("Installing lightgbm...")
    import sys
    !{sys.executable} -m pip install -q lightgbm
    import lightgbm as lgb

t0 = time.time()
lgbm = lgb.LGBMRegressor(
    n_estimators=800 if FAST_DEBUG else 2000,
    learning_rate=0.10 if FAST_DEBUG else 0.05,
    num_leaves=48 if FAST_DEBUG else 64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
lgbm.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="l2",
    callbacks=[lgb.early_stopping(50 if FAST_DEBUG else 100),
               lgb.log_evaluation(100 if FAST_DEBUG else 200)]
)
lgbm_pred_val = lgbm.predict(X_val, num_iteration=lgbm.best_iteration_)
scores["LightGBM"] = eval_scores(y_val, lgbm_pred_val, use_log=use_log_target)
models["LightGBM"] = lgbm
print(f"LGBM: {scores['LightGBM']} | best_iter={lgbm.best_iteration_} | time={time.time()-t0:.1f}s")

# --------------------------------------
# (F3) XGBoost — native API (xgb.train)
# --------------------------------------
import xgboost as xgb

# Convert splits to DMatrix
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval   = xgb.DMatrix(X_val, label=y_val)

xgb_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.10 if FAST_DEBUG else 0.05,  # learning rate
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,                         # L2 reg
    "tree_method": "hist",                 # "gpu_hist" if you have a GPU
}

num_round = 800 if FAST_DEBUG else 2000
watchlist = [(dtrain, "train"), (dval, "eval")]

t0 = time.time()
bst = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=50 if FAST_DEBUG else 100,
    verbose_eval=100
)
best_iteration = bst.attributes().get('best_iteration')
xgb_pred_val = bst.predict(dval, iteration_range=(0, int(best_iteration) if best_iteration else 0))
scores["XGBoost"] = eval_scores(y_val, xgb_pred_val, use_log=use_log_target)
print(f"XGB: {scores['XGBoost']} | best_iter={best_iteration} | time={(time.time() - t0):.2f}s")



# --------------------------------------
# (G) Compare & simple ensemble
# --------------------------------------
print("\nValidation scores (lower is better):")
for k, v in scores.items():
    print(f"{k:12s} -> MAE: {v['MAE']:.4f} | RMSE: {v['RMSE']:.4f}")

# Simple ensemble (avg LGBM + XGB) — often best
if "LightGBM" in scores and "XGBoost" in scores:
    ens_pred_val = (lgbm_pred_val + xgb_pred_val) / 2.0
    scores["Ensemble_LGBM_XGB"] = eval_scores(y_val, ens_pred_val, use_log=use_log_target)
    print(f"Ensemble     -> MAE: {scores['Ensemble_LGBM_XGB']['MAE']:.4f} | RMSE: {scores['Ensemble_LGBM_XGB']['RMSE']:.4f}")

best_name = min(scores, key=lambda k: scores[k]["RMSE"])
print(f"\n🏆 Best model by RMSE: {best_name}")

# ------------------------------------------------------
# (H) Refit best on FULL train and predict on test
# ------------------------------------------------------
def refit_and_predict(name):
    if name == "RandomForest":
        m = RandomForestRegressor(
            n_estimators=150 if FAST_DEBUG else 300,
            max_depth=None, min_samples_leaf=2, n_jobs=-1, random_state=42
        ).fit(X, y)
        return m.predict(X_test)

    if name == "LightGBM":
        m = lgb.LGBMRegressor(
            n_estimators=int(lgbm.best_iteration_ or (800 if FAST_DEBUG else 2000)),
            learning_rate=0.10 if FAST_DEBUG else 0.05,
            num_leaves=48 if FAST_DEBUG else 64,
            subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
        ).fit(X, y)
        return m.predict(X_test)

    if name == "XGBoost":
        # Refit using native API with best iteration from validation
        dtrain_full = xgb.DMatrix(X, label=y)
        dtest       = xgb.DMatrix(X_test)
        best_iter   = int(getattr(bst, "best_iteration", 800 if FAST_DEBUG else 2000))

        bst_full = xgb.train(
            params=xgb_params,             # reuse same params
            dtrain=dtrain_full,
            num_boost_round=best_iter,
            verbose_eval=False
        )
        return bst_full.predict(dtest)

    if name == "Ensemble_LGBM_XGB":
        # Fit both on full data and average predictions
        m1 = lgb.LGBMRegressor(
            n_estimators=int(lgbm.best_iteration_ or (800 if FAST_DEBUG else 2000)),
            learning_rate=0.10 if FAST_DEBUG else 0.05,
            num_leaves=48 if FAST_DEBUG else 64,
            subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
        ).fit(X, y)

        dtrain_full = xgb.DMatrix(X, label=y)
        dtest       = xgb.DMatrix(X_test)
        best_iter   = int(getattr(bst, "best_iteration", 800 if FAST_DEBUG else 2000))
        bst_full    = xgb.train(params=xgb_params, dtrain=dtrain_full, num_boost_round=best_iter, verbose_eval=False)

        return (m1.predict(X_test) + bst_full.predict(dtest)) / 2.0

    raise ValueError("Unknown model name.")

pred_test = refit_and_predict(best_name)

# Inverse-transform if trained on log target
if use_log_target:
    pred_test = np.expm1(pred_test)

# Safety clip
pred_test = np.clip(pred_test, a_min=1e-6, a_max=None)

# -------------------------------------------
# (I) Build submission and save
# -------------------------------------------
if os.path.exists(sample_path):
    sample = pd.read_csv(sample_path)
    tgt_cols  = [c for c in sample.columns if c.lower() != "id"]
    sub_tgt   = tgt_cols[0] if tgt_cols else "trip_duration"
    if id_col and id_col in test.columns and "id" in sample.columns:
        out = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test})
        sub = sample.drop(columns=[sub_tgt], errors="ignore").merge(out, left_on="id", right_on=id_col, how="left")
        sub = sub[["id", sub_tgt]]
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test)), sub_tgt: pred_test})
else:
    sub_tgt = "trip_duration"
    if id_col and id_col in test.columns:
        sub = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test}).rename(columns={id_col: "id"})
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test)), sub_tgt: pred_test})

sub_path = os.path.join(BASE, "submission.csv")
sub.to_csv(sub_path, index=False)
print(f"\n✅ Done. Saved submission to: {sub_path}")
display(sub.head())


CWD: C:\New folder7
Expecting: C:\New folder7\train_fe.csv and C:\New folder7\test_fe.csv
Loaded -> train: (1458644, 22) | test: (625134, 20)
Detected -> id: id | target: target_log1p | use_log_target=True
Split -> X_tr: (1166915, 19)  | X_val: (291729, 19)
RF: {'MAE': 209.0416946675643, 'RMSE': 341.48042675457043} | time=3319.4s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.148876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1386
[LightGBM] [Info] Number of data points in the train set: 1166915, number of used features: 17
[LightGBM] [Info] Start training from score 6.461925
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l2: 0.158182
[400]	valid_0's l2: 0.157772
[600]	valid_0's l2: 0.157736
Early stopping, best iteration is:
[678]	valid_0's l2: 0.157686
LGBM: {'MAE': 203.72192715712453, 'RMSE': 336.7015529514449} | best_iter=678 | time=730.4s
[0]	train-rmse:0.749

KeyboardInterrupt: 

In [16]:
# ==========================================
# ENSEMBLE: XGBoost + LightGBM (validation -> full -> submission)
# ==========================================
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb

# ---- (A) تأكيد المتغيّرات الأساسية موجودة ----
needed = ["lgbm_pred_val","xgb_pred_val","y_val","use_log_target",
          "lgbm","bst","X","y","X_test","test","id_col","sample_path","BASE"]
missing = [n for n in needed if n not in globals()]
if missing:
    raise RuntimeError(f"Variables missing from session: {missing} — شغّل خلية التدريب أولًا.")

# ---- (B) اختيار وزن المزج على الـ validation ----
from sklearn.metrics import mean_absolute_error, mean_squared_error

def eval_scores_vec(y_true_log, y_pred_log, use_log=True):
    if use_log:
        y_true = np.expm1(y_true_log)
        y_pred = np.expm1(y_pred_log)
    else:
        y_true = y_true_log
        y_pred = y_pred_log
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    return mae, rmse

grid = np.linspace(0.0, 1.0, 21)  # وزن XGB من 0→1 بخطوة 0.05
best = {"w_xgb": None, "MAE": 1e18, "RMSE": 1e18}
rows = []
for w in grid:
    pred_val = w * xgb_pred_val + (1 - w) * lgbm_pred_val
    mae, rmse = eval_scores_vec(y_val, pred_val, use_log=use_log_target)
    rows.append((w, mae, rmse))
    if rmse < best["RMSE"]:
        best = {"w_xgb": float(w), "MAE": float(mae), "RMSE": float(rmse)}

res_df = pd.DataFrame(rows, columns=["w_xgb","MAE","RMSE"]).sort_values("RMSE")
print("Grid search over ensemble weight (validation):")
print(res_df.head(8).to_string(index=False))
print(f"\n🏆 Best weight for XGB on validation: w_xgb={best['w_xgb']:.2f} "
      f"→ MAE={best['MAE']:.2f}, RMSE={best['RMSE']:.2f}")

# ---- (C) إعادة تدريب النموذجين على كامل البيانات ----
# LightGBM full
lgb_n_estimators = int(getattr(lgbm, "best_iteration_", 0) or lgbm.get_params().get("n_estimators", 2000))
lgb_full = lgb.LGBMRegressor(**{
    **{k:v for k,v in lgbm.get_params().items() if k in
       ["learning_rate","num_leaves","subsample","colsample_bytree","random_state","n_jobs"]},
    "n_estimators": lgb_n_estimators
})
lgb_full.fit(X, y)
pred_lgb_test = lgb_full.predict(X_test, num_iteration=getattr(lgb_full, "best_iteration_", lgb_n_estimators))

# XGBoost full (native API) بنفس إعدادات bst وأفضل تكرار
FAST_DEBUG = globals().get("FAST_DEBUG", False)
eta = 0.10 if FAST_DEBUG else 0.05
xgb_params_full = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": eta,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "tree_method": "hist",  # غيّرها لـ "gpu_hist" لو عندك GPU
}
best_iter_attr = (bst.attributes().get("best_iteration") if hasattr(bst, "attributes") else None)
xgb_best_rounds = int(best_iter_attr) if best_iter_attr is not None else (800 if FAST_DEBUG else 2000)

dtrain_full = xgb.DMatrix(X, label=y)
dtest       = xgb.DMatrix(X_test)
bst_full = xgb.train(params=xgb_params_full, dtrain=dtrain_full, num_boost_round=xgb_best_rounds, verbose_eval=False)
pred_xgb_test = bst_full.predict(dtest)  # يستخدم كل الجولات المدربة

# ---- (D) المزج بالوزن الأفضل من الـ validation ----
w = best["w_xgb"]
pred_test_ens = w * pred_xgb_test + (1 - w) * pred_lgb_test

# عكس التحويل log1p إن وجد
if use_log_target:
    pred_test_ens = np.expm1(pred_test_ens)

pred_test_ens = np.clip(pred_test_ens, 1e-6, None)

# ---- (E) إنشاء ملف التسليم submission.csv ----
import os
if os.path.exists(sample_path):
    sample = pd.read_csv(sample_path)
    tgt_cols = [c for c in sample.columns if c.lower() != "id"]
    sub_tgt  = tgt_cols[0] if tgt_cols else "trip_duration"
    if id_col and id_col in test.columns and "id" in sample.columns:
        out = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test_ens})
        sub = sample.drop(columns=[sub_tgt], errors="ignore").merge(out, left_on="id", right_on=id_col, how="left")
        sub = sub[["id", sub_tgt]]
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test_ens)), sub_tgt: pred_test_ens})
else:
    sub_tgt = "trip_duration"
    if id_col and id_col in test.columns:
        sub = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test_ens}).rename(columns={id_col: "id"})
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test_ens)), sub_tgt: pred_test_ens})

sub_path = os.path.join(BASE, "submission_ensemble.csv")
sub.to_csv(sub_path, index=False)
print(f"\n✅ Ensemble submission saved to: {sub_path}")
try:
    display(sub.head())
except:
    print(sub.head())


Grid search over ensemble weight (validation):
 w_xgb        MAE       RMSE
  0.15 203.671870 336.675282
  0.10 203.683481 336.678161
  0.20 203.664443 336.678271
  0.05 203.700331 336.686916
  0.25 203.661532 336.687120
  0.00 203.721927 336.701553
  0.30 203.663498 336.701823
  0.35 203.670169 336.722372

🏆 Best weight for XGB on validation: w_xgb=0.15 → MAE=203.67, RMSE=336.68
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.242303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1386
[LightGBM] [Info] Number of data points in the train set: 1458644, number of used features: 17
[LightGBM] [Info] Start training from score 6.462139

✅ Ensemble submission saved to: C:\New folder7\submission_ensemble.csv


Unnamed: 0,id,trip_duration
0,id3004672,774.742354
1,id3505355,832.496301
2,id1217141,440.74645
3,id2150126,1029.464309
4,id1598245,337.084155


In [17]:
# ==========================================================
# Hyperparameter Tuning (LightGBM + XGBoost native) + Ensemble
# ==========================================================
import time, itertools, numpy as np, pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

def rmse_mae_from_logs(y_true_log, y_pred_log, use_log=True):
    if use_log:
        y_true = np.expm1(y_true_log); y_pred = np.expm1(y_pred_log)
    else:
        y_true = y_true_log; y_pred = y_pred_log
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    mae  = mean_absolute_error(y_true, y_pred)
    return rmse, mae

# ---------- (A) LightGBM tuning ----------
import lightgbm as lgb
ش
lgb_grid = {
    "learning_rate": [0.05, 0.075, 0.1],
    "num_leaves":    [48, 64, 96, 128],
    "min_child_samples": [20, 40, 80],
    "subsample":     [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
}

def lgb_eval(params):
    model = lgb.LGBMRegressor(
        n_estimators=3000,              # كبير + إيقاف مبكر
        learning_rate=params["learning_rate"],
        num_leaves=params["num_leaves"],
        min_child_samples=params["min_child_samples"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        random_state=42,
        n_jobs=-1
    )
    callbacks = [lgb.early_stopping(200), lgb.log_evaluation(200)]
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric="l2", callbacks=callbacks)
    pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
    rmse, mae = rmse_mae_from_logs(y_val, pred_val, use_log=use_log_target)
    return rmse, mae, model

print("▶ Tuning LightGBM...")
t0 = time.time()
lgb_results = []
for vals in itertools.product(*lgb_grid.values()):
    params = dict(zip(lgb_grid.keys(), vals))
    rmse, mae, model = lgb_eval(params)
    lgb_results.append((rmse, mae, params, int(model.best_iteration_)))
    print(f"LGBM {params} -> RMSE={rmse:.3f} | MAE={mae:.3f} | best_iter={int(model.best_iteration_)}")

lgb_results.sort(key=lambda r: r[0])
best_lgb_rmse, best_lgb_mae, best_lgb_params, best_lgb_iter = lgb_results[0]
print(f"\n🏆 Best LGBM: RMSE={best_lgb_rmse:.3f}, MAE={best_lgb_mae:.3f}, params={best_lgb_params}, best_iter={best_lgb_iter}, time={(time.time()-t0):.1f}s")

# ---------- (B) XGBoost tuning (native API: xgb.train) ----------
import xgboost as xgb

xgb_grid = {
    "eta":        [0.03, 0.05, 0.07, 0.10],  # learning_rate
    "max_depth":  [6, 7, 8],
    "min_child_weight": [1, 3, 5],
    "subsample":  [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "lambda":     [1.0, 1.5, 2.0],
}

def xgb_eval(params):
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval   = xgb.DMatrix(X_val, label=y_val)
    p = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "eta": params["eta"],
        "max_depth": params["max_depth"],
        "min_child_weight": params["min_child_weight"],
        "subsample": params["subsample"],
        "colsample_bytree": params["colsample_bytree"],
        "lambda": params["lambda"],
        "tree_method": "hist"
    }
    watch = [(dtrain,"train"), (dval,"eval")]
    bst = xgb.train(p, dtrain, num_boost_round=4000, evals=watch,
                    early_stopping_rounds=200, verbose_eval=False)
    best_iter = int(bst.attributes().get("best_iteration") or bst.best_iteration if hasattr(bst,"best_iteration") else bst.num_boost_round)
    pred_val = bst.predict(dval, iteration_range=(0, best_iter))
    rmse, mae = rmse_mae_from_logs(y_val, pred_val, use_log=use_log_target)
    return rmse, mae, p, best_iter

print("\n▶ Tuning XGBoost...")
t0 = time.time()
xgb_results = []
# نجرب عدد محدود (عشوائي بسيط) بدل كل التركيبات لتقليل الزمن:
import random
random.seed(42)
all_combos = list(itertools.product(*xgb_grid.values()))
random.shuffle(all_combos)
sampled = all_combos[:40]   # عدّل الرقم لو تريد تجارب أكثر/أقل

for vals in sampled:
    params = dict(zip(xgb_grid.keys(), vals))
    rmse, mae, p, best_iter = xgb_eval(params)
    xgb_results.append((rmse, mae, p, best_iter))
    print(f"XGB {params} -> RMSE={rmse:.3f} | MAE={mae:.3f} | best_iter={best_iter}")

xgb_results.sort(key=lambda r: r[0])
best_xgb_rmse, best_xgb_mae, best_xgb_params, best_xgb_iter = xgb_results[0]
print(f"\n🏆 Best XGB: RMSE={best_xgb_rmse:.3f}, MAE={best_xgb_mae:.3f}, "
      f"params={best_xgb_params}, best_iter={best_xgb_iter}, time={(time.time()-t0):.1f}s")

# ---------- (C) Refit both on FULL data ----------
print("\n▶ Refit best params on FULL train...")

# LightGBM full
lgb_full = lgb.LGBMRegressor(
    n_estimators=best_lgb_iter,
    learning_rate=best_lgb_params["learning_rate"],
    num_leaves=best_lgb_params["num_leaves"],
    min_child_samples=best_lgb_params["min_child_samples"],
    subsample=best_lgb_params["subsample"],
    colsample_bytree=best_lgb_params["colsample_bytree"],
    random_state=42, n_jobs=-1
).fit(X, y)
pred_lgb_test = lgb_full.predict(X_test, num_iteration=best_lgb_iter)

# XGBoost full
dtrain_full = xgb.DMatrix(X, label=y)
dtest       = xgb.DMatrix(X_test)
bst_full = xgb.train(best_xgb_params, dtrain_full, num_boost_round=best_xgb_iter, verbose_eval=False)
pred_xgb_test = bst_full.predict(dtest, iteration_range=(0, best_xgb_iter))

# ---------- (D) Find best ensemble weight on validation again ----------
# نحسب تنبؤات validation لأفضل إعدادات (للحصول على الوزن الأمثل)
# LGBM val
lgb_val_model = lgb.LGBMRegressor(
    n_estimators=best_lgb_iter,
    learning_rate=best_lgb_params["learning_rate"],
    num_leaves=best_lgb_params["num_leaves"],
    min_child_samples=best_lgb_params["min_child_samples"],
    subsample=best_lgb_params["subsample"],
    colsample_bytree=best_lgb_params["colsample_bytree"],
    random_state=42, n_jobs=-1
)
lgb_val_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric="l2", callbacks=[lgb.early_stopping(50)])
pred_lgb_val = lgb_val_model.predict(X_val, num_iteration=lgb_val_model.best_iteration_)

# XGB val
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval   = xgb.DMatrix(X_val, label=y_val)
bst_val = xgb.train(best_xgb_params, dtrain, num_boost_round=best_xgb_iter, evals=[(dval,"eval")], verbose_eval=False)
pred_xgb_val = bst_val.predict(dval, iteration_range=(0, best_xgb_iter))

weights = np.linspace(0,1,21)
best_w = (None, 1e18, 1e18)
for w in weights:
    pv = w*pred_xgb_val + (1-w)*pred_lgb_val
    rmse, mae = rmse_mae_from_logs(y_val, pv, use_log=use_log_target)
    if rmse < best_w[1]:
        best_w = (w, rmse, mae)
print(f"\n🏁 Best ensemble weight on validation after tuning: w_xgb={best_w[0]:.2f} "
      f"→ RMSE={best_w[1]:.2f}, MAE={best_w[2]:.2f}")

# ---------- (E) Build tuned ensemble predictions on TEST ----------
pred_test_ens = best_w[0]*pred_xgb_test + (1-best_w[0])*pred_lgb_test
if use_log_target:
    pred_test_ens = np.expm1(pred_test_ens)
pred_test_ens = np.clip(pred_test_ens, 1e-6, None)

# ---------- (F) Save submission ----------
import os
if os.path.exists(sample_path):
    sample = pd.read_csv(sample_path)
    tgt_cols = [c for c in sample.columns if c.lower() != "id"]
    sub_tgt  = tgt_cols[0] if tgt_cols else "trip_duration"
    if id_col and id_col in test.columns and "id" in sample.columns:
        out = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test_ens})
        sub = sample.drop(columns=[sub_tgt], errors="ignore").merge(out, left_on="id", right_on=id_col, how="left")
        sub = sub[["id", sub_tgt]]
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test_ens)), sub_tgt: pred_test_ens})
else:
    sub_tgt = "trip_duration"
    if id_col and id_col in test.columns:
        sub = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test_ens}).rename(columns={id_col: "id"})
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test_ens)), sub_tgt: pred_test_ens})

sub_path = os.path.join(BASE, "submission_ensemble_tuned.csv")
sub.to_csv(sub_path, index=False)
print(f"\n✅ Tuned ensemble submission saved to: {sub_path}")
try:
    display(sub.head())
except:
    print(sub.head())


▶ Tuning LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.269345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1386
[LightGBM] [Info] Number of data points in the train set: 1166915, number of used features: 17
[LightGBM] [Info] Start training from score 6.461925
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 0.158381
[400]	valid_0's l2: 0.157898
[600]	valid_0's l2: 0.157701
Early stopping, best iteration is:
[568]	valid_0's l2: 0.157684
LGBM {'learning_rate': 0.05, 'num_leaves': 48, 'min_child_samples': 20, 'subsample': 0.7, 'colsample_bytree': 0.7} -> RMSE=337.857 | MAE=204.512 | best_iter=568
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.125291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1386
[LightGBM] [Info] Number of data points in the train set: 1166915

KeyboardInterrupt: 

In [19]:
# ==========================================
# Final Ensemble (XGBoost + LightGBM) with best weight
# ==========================================
import numpy as np, pandas as pd, os, time
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import xgboost as xgb

# --- تحقق من المتغيّرات الأساسية ---
need = ["X_tr","X_val","y_tr","y_val","X","y","X_test","test",
        "id_col","use_log_target","sample_path","BASE","lgbm","bst","xgb_params"]
missing = [n for n in need if n not in globals()]
if missing:
    raise RuntimeError(f"متغيّرات ناقصة في الجلسة: {missing}\nشغّل خلايا التدريب أولًا.")

def eval_rmse_mae(y_true_log, y_pred_log, use_log=True):
    if use_log:
        y_true = np.expm1(y_true_log); y_pred = np.expm1(y_pred_log)
    else:
        y_true = y_true_log; y_pred = y_pred_log
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    mae  = mean_absolute_error(y_true, y_pred)
    return rmse, mae

# ---------- (1) تنبؤات validation من النموذجين ----------
# LightGBM val
lgb_best_iter = int(getattr(lgbm, "best_iteration_", 0) or lgbm.get_params().get("n_estimators", 2000))
pred_lgb_val  = lgbm.predict(X_val, num_iteration=lgb_best_iter)

# XGBoost val (native API): استخرج best_iteration من الخصائص
best_iter_attr = (bst.attributes().get("best_iteration") if hasattr(bst, "attributes") else None)
xgb_best_iter  = int(best_iter_attr) if best_iter_attr is not None else 0
dval           = xgb.DMatrix(X_val, label=y_val)
pred_xgb_val   = bst.predict(dval, iteration_range=(0, xgb_best_iter)) if xgb_best_iter > 0 else bst.predict(dval)

# ---------- (2) البحث عن أفضل وزن على validation ----------
weights = np.linspace(0, 1, 21)  # 0.00, 0.05, ..., 1.00
best_w = None; best_rmse = 1e18; best_mae = 1e18
rows = []
for w in weights:
    mix = w*pred_xgb_val + (1-w)*pred_lgb_val
    rmse, mae = eval_rmse_mae(y_val, mix, use_log=use_log_target)
    rows.append((w, rmse, mae))
    if rmse < best_rmse:
        best_rmse, best_mae, best_w = rmse, mae, w

ens_tbl = pd.DataFrame(rows, columns=["w_xgb","RMSE","MAE"]).sort_values("RMSE")
print("Top ensemble weights on validation:")
print(ens_tbl.head(6).to_string(index=False))
print(f"\n🏆 Best weight: w_xgb={best_w:.2f} → RMSE={best_rmse:.2f}, MAE={best_mae:.2f}")

# ---------- (3) إعادة التدريب على كامل البيانات ----------
print("\nإعادة تدريب LightGBM على كامل البيانات...")
lgb_full = lgb.LGBMRegressor(
    n_estimators=lgb_best_iter,
    learning_rate=lgbm.get_params().get("learning_rate", 0.05),
    num_leaves=lgbm.get_params().get("num_leaves", 64),
    min_child_samples=lgbm.get_params().get("min_child_samples", 20),
    subsample=lgbm.get_params().get("subsample", 0.8),
    colsample_bytree=lgbm.get_params().get("colsample_bytree", 0.8),
    random_state=42, n_jobs=-1
).fit(X, y)
pred_lgb_test = lgb_full.predict(X_test, num_iteration=lgb_best_iter)

print("إعادة تدريب XGBoost (native) على كامل البيانات...")
dtrain_full = xgb.DMatrix(X, label=y)
dtest       = xgb.DMatrix(X_test)
# استخدم نفس الإعدادات التي درّبت بها bst من قبل
xgb_refit_rounds = xgb_best_iter if xgb_best_iter > 0 else xgb_params.get("num_boost_round", 2000)
bst_full = xgb.train(params=xgb_params, dtrain=dtrain_full, num_boost_round=int(xgb_refit_rounds), verbose_eval=False)
pred_xgb_test = bst_full.predict(dtest, iteration_range=(0, int(xgb_refit_rounds)))

# ---------- (4) المزج بالوزن الأفضل ----------
pred_test_ens = best_w*pred_xgb_test + (1-best_w)*pred_lgb_test
if use_log_target:
    pred_test_ens = np.expm1(pred_test_ens)
pred_test_ens = np.clip(pred_test_ens, 1e-6, None)

# ---------- (5) بناء ملف التسليم ----------
if os.path.exists(sample_path):
    sample = pd.read_csv(sample_path)
    tgt_cols = [c for c in sample.columns if c.lower() != "id"]
    sub_tgt  = tgt_cols[0] if tgt_cols else "trip_duration"
    if id_col and id_col in test.columns and "id" in sample.columns:
        out = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test_ens})
        sub = sample.drop(columns=[sub_tgt], errors="ignore").merge(out, left_on="id", right_on=id_col, how="left")
        sub = sub[["id", sub_tgt]]
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test_ens)), sub_tgt: pred_test_ens})
else:
    sub_tgt = "trip_duration"
    if id_col and id_col in test.columns:
        sub = pd.DataFrame({id_col: test[id_col].values, sub_tgt: pred_test_ens}).rename(columns={id_col: "id"})
    else:
        sub = pd.DataFrame({"id": np.arange(len(pred_test_ens)), sub_tgt: pred_test_ens})

out_path = os.path.join(BASE, "submission_ensemble_final.csv")
sub.to_csv(out_path, index=False)
print(f"\n✅ Ensemble submission saved to: {out_path}")
try:
    display(sub.head())
except:
    print(sub.head())


Top ensemble weights on validation:
 w_xgb       RMSE        MAE
  0.15 336.675282 203.671870
  0.10 336.678161 203.683481
  0.20 336.678271 203.664443
  0.05 336.686916 203.700331
  0.25 336.687120 203.661532
  0.00 336.701553 203.721927

🏆 Best weight: w_xgb=0.15 → RMSE=336.68, MAE=203.67

إعادة تدريب LightGBM على كامل البيانات...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.221655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1386
[LightGBM] [Info] Number of data points in the train set: 1458644, number of used features: 17
[LightGBM] [Info] Start training from score 6.462139
إعادة تدريب XGBoost (native) على كامل البيانات...

✅ Ensemble submission saved to: C:\New folder7\submission_ensemble_final.csv


Unnamed: 0,id,trip_duration
0,id3004672,774.742326
1,id3505355,832.496301
2,id1217141,440.746434
3,id2150126,1029.46426
4,id1598245,337.084151


In [20]:
import pandas as pd

sub = pd.read_csv(r"C:\New folder7\submission_ensemble_final.csv")
print(sub.head())
print(sub.columns)


          id  trip_duration
0  id3004672     774.742326
1  id3505355     832.496301
2  id1217141     440.746434
3  id2150126    1029.464260
4  id1598245     337.084151
Index(['id', 'trip_duration'], dtype='object')


In [21]:
print(sub.isnull().sum())


id               0
trip_duration    0
dtype: int64


In [22]:
print(sub['trip_duration'].min(), sub['trip_duration'].max())


6.220870498656546 4428.192878671506


In [53]:
!pip install meteostat pytz lightgbm --quiet!



Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: --quiet!


In [55]:
import os
import pandas as pd
import numpy as np
import pytz
from meteostat import Hourly, Point

# -------- إعداد المسارات --------
DATA_DIR = r"C:\New folder7"
TEST_PATH = os.path.join(DATA_DIR, "test.csv")                         # يجب أن يحتوي id, pickup_datetime
SUB_PATH  = os.path.join(DATA_DIR, "submission_ensemble_final.csv")    # يحتوي id, trip_duration (المتنبأ)
OUT_PATH  = os.path.join(DATA_DIR, "submission_with_weather.csv")      # الناتج

# -------- قراءة الملفات --------
test = pd.read_csv(TEST_PATH)
sub  = pd.read_csv(SUB_PATH)

assert "id" in test.columns and "pickup_datetime" in test.columns, "test.csv يجب أن يحوي الأعمدة: id, pickup_datetime"
assert "id" in sub.columns, "ملف التنبؤ يجب أن يحوي العمود id"

# -------- مناطق زمنية --------
NY_TZ  = pytz.timezone("America/New_York")
UTC_TZ = pytz.UTC

# -------- تجهيز وقت الالتقاط في test --------
test["pickup_datetime"] = pd.to_datetime(test["pickup_datetime"], errors="coerce")

# لو العمود بدون منطقة زمنية ⇒ نلصّق نيويورك، ثم نأخذ رأس الساعة
def to_ny_hourly(dt_series):
    s = dt_series.copy()
    if s.dt.tz is None:
        s = s.dt.tz_localize(NY_TZ, nonexistent="shift_forward", ambiguous="NaT")
    else:
        s = s.dt.tz_convert(NY_TZ)
    return s.dt.floor("H")

test["pickup_hour_ny"] = to_ny_hourly(test["pickup_datetime"])

# ===== نطاق الطقس الذي سنطلبه من meteostat (UTC) =====
# نشتق نفس الأوقات لكن بــ UTC (مع ضمان الوعي بالمنطقة الزمنية)
def to_utc_hourly(dt_series):
    s = dt_series.copy()
    if s.dt.tz is None:
        s = s.dt.tz_localize(NY_TZ, nonexistent="shift_forward", ambiguous="NaT")
    return s.dt.tz_convert(UTC_TZ).dt.floor("H")

pickup_hour_utc = to_utc_hourly(test["pickup_datetime"])
start_utc = pickup_hour_utc.min()
end_utc   = pickup_hour_utc.max()

# meteostat لا يقبل tz-aware في بعض المقارنات الداخلية ⇒ نحول إلى naive (بدون tzinfo)
start_utc_naive = start_utc.tz_localize(None).to_pydatetime()
end_utc_naive   = end_utc.tz_localize(None).to_pydatetime()

print("Fetching weather (UTC naive):", start_utc_naive, "→", end_utc_naive)

# ===== جلب الطقس =====
# نستخدم محطة Central Park لتمثيل طقس نيويورك
station = Point(40.7812, -73.9665)
wx = Hourly(station, start_utc_naive, end_utc_naive).fetch().reset_index()  # عمود time قد يكون naive أو aware

# --- ضمان أن عمود time «واعٍ» بــ UTC (لو كان naive نجعله UTC) ---
if wx["time"].dt.tz is None:
    wx["time"] = wx["time"].dt.tz_localize(UTC_TZ)

# --- تحويل الوقت إلى نيويورك وأخذ رأس الساعة للدمج ---
keep = ["time","temp","prcp","snow","wdir","wspd","pres","rhum"]
wx = wx[keep].copy()
wx["pickup_hour_ny"] = wx["time"].dt.tz_convert(NY_TZ).dt.floor("H")
wx.drop(columns=["time"], inplace=True)

print("Weather rows:", len(wx))

# ===== الدمج على ساعة الالتقاط =====
test_w = test.merge(wx, on="pickup_hour_ny", how="left")

# تعبئة فراغات بسيطة
for c in ["temp","prcp","snow","wdir","wspd","pres","rhum"]:
    if c in test_w.columns:
        test_w[c] = test_w[c].fillna(method="ffill").fillna(method="bfill")

# أعلام مبسطة (اختياري)
test_w["is_rain"] = (test_w["prcp"].fillna(0) > 0).astype(int)
test_w["is_snow"] = (test_w["snow"].fillna(0) > 0).astype(int)

# ===== دمج أعمدة الطقس مع ملف التنبؤ حسب id =====
sub_w = sub.merge(
    test_w[["id","temp","prcp","snow","wdir","wspd","pres","rhum","is_rain","is_snow"]],
    on="id", how="left"
)

# حفظ الناتج
sub_w.to_csv(OUT_PATH, index=False)
print(f"\n✅ Saved: {OUT_PATH}")
print(sub_w.head())


Fetching weather (UTC naive): 2016-01-01 05:00:00 → 2016-07-01 03:00:00
Weather rows: 4367

✅ Saved: C:\New folder7\submission_with_weather.csv
          id  trip_duration  temp  prcp  snow   wdir  wspd    pres  rhum  \
0  id3004672     774.742326  24.4   0.0  <NA>  160.0   9.4  1016.0  62.0   
1  id3505355     832.496301  24.4   0.0  <NA>  160.0   9.4  1016.0  62.0   
2  id1217141     440.746434  24.4   0.0  <NA>  160.0   9.4  1016.0  62.0   
3  id2150126    1029.464260  24.4   0.0  <NA>  160.0   9.4  1016.0  62.0   
4  id1598245     337.084151  24.4   0.0  <NA>  160.0   9.4  1016.0  62.0   

   is_rain  is_snow  
0        0        0  
1        0        0  
2        0        0  
3        0        0  
4        0        0  


In [62]:
pip install meteostat pytz lightgbm xgboost--quiet!


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: 'xgboost--quiet!': Expected end or semicolon (after name and no valid version specifier)
    xgboost--quiet!
                  ^


In [3]:
!pip install meteostat pytz lightgbm xgboost --quiet



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# ===================== الإعدادات والحزم =====================
import os, numpy as np, pandas as pd, pytz, warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from meteostat import Hourly, Point
import lightgbm as lgb
import xgboost as xgb

warnings.filterwarnings("ignore")

# مسارات الملفات (عدّل المسار عند الحاجة)
DATA_DIR   = r"C:\New folder7"
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")
SUB_OUT    = os.path.join(DATA_DIR, "submission_ensemble_weather.csv")

# المنطقة الزمنية ونقطة الطقس (سنترال بارك، نيويورك)
NY_TZ  = pytz.timezone("America/New_York")
UTC_TZ = pytz.UTC
STATION = Point(40.7812, -73.9665)

# ===================== دوال مساعدة =====================
def to_ny_hourly(dt_series):
    s = pd.to_datetime(dt_series, errors="coerce")
    s = s.dt.tz_localize(NY_TZ, nonexistent="shift_forward", ambiguous="NaT") if s.dt.tz is None else s.dt.tz_convert(NY_TZ)
    return s.dt.floor("H")

def to_utc_hourly(dt_series):
    s = pd.to_datetime(dt_series, errors="coerce")
    if s.dt.tz is None:
        s = s.dt.tz_localize(NY_TZ, nonexistent="shift_forward", ambiguous="NaT")
    return s.dt.tz_convert(UTC_TZ).dt.floor("H")

def fetch_weather_for_range(start_utc_hour, end_utc_hour):
    # meteostat يفضّل naive داخليًا
    start_naive = start_utc_hour.tz_localize(None).to_pydatetime()
    end_naive   = end_utc_hour.tz_localize(None).to_pydatetime()
    w = Hourly(STATION, start_naive, end_naive).fetch().reset_index()
    if w["time"].dt.tz is None:
        w["time"] = w["time"].dt.tz_localize(UTC_TZ)
    keep = ["time","temp","prcp","snow","wdir","wspd","pres","rhum"]
    w = w[keep].copy()
    w["pickup_hour_ny"] = w["time"].dt.tz_convert(NY_TZ).dt.floor("H")
    w.drop(columns=["time"], inplace=True)
    return w

def attach_weather(df, weather):
    out = df.copy()
    out["pickup_hour_ny"] = to_ny_hourly(out["pickup_datetime"])
    out = out.merge(weather, on="pickup_hour_ny", how="left")
    # تعبئة بسيطة
    for c in ["temp","prcp","snow","wdir","wspd","pres","rhum"]:
        if c in out.columns:
            out[c] = out[c].fillna(method="ffill").fillna(method="bfill")
    out["is_rain"] = (out["prcp"].fillna(0) > 0).astype(int)
    out["is_snow"] = (out["snow"].fillna(0) > 0).astype(int)
    return out

def eval_scores(y_true, y_pred, is_log=False):
    y_t = np.expm1(y_true) if is_log else y_true
    y_p = np.expm1(y_pred) if is_log else y_pred
    mae  = mean_absolute_error(y_t, y_p)
    rmse =mean_squared_error(y_true, y_pred) ** 0.5
    return mae, rmse

# ===================== 1) قراءة البيانات =====================
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

assert "pickup_datetime" in train.columns and "pickup_datetime" in test.columns, "يجب وجود pickup_datetime في train/test"

target_col = "target_log1p" if "target_log1p" in train.columns else "trip_duration"
id_col = "id" if "id" in test.columns else None
use_log = (target_col == "target_log1p")

# ===================== 2) جلب الطقس ودمجه =====================
train_hour_utc = to_utc_hourly(train["pickup_datetime"])
test_hour_utc  = to_utc_hourly(test["pickup_datetime"])
start_utc = min(train_hour_utc.min(), test_hour_utc.min())
end_utc   = max(train_hour_utc.max(), test_hour_utc.max())

print("⛅ Fetching weather UTC:", start_utc, "→", end_utc)
weather = fetch_weather_for_range(start_utc, end_utc)
print("Weather shape:", weather.shape)

train_w = attach_weather(train, weather)
test_w  = attach_weather(test,  weather)

# ===================== 3) تقسيم train إلى train/val =====================
trn_df, val_df = train_test_split(train_w, test_size=0.2, random_state=42, shuffle=True)

drop_train_cols = [c for c in [target_col, id_col, "pickup_datetime", "pickup_hour_ny"] if c in trn_df.columns]
drop_val_cols   = [c for c in [target_col, id_col, "pickup_datetime", "pickup_hour_ny"] if c in val_df.columns]

X_tr = trn_df.drop(columns=drop_train_cols).select_dtypes(include=[np.number])
y_tr = trn_df[target_col].values
X_val = val_df.drop(columns=drop_val_cols).select_dtypes(include=[np.number])
y_val = val_df[target_col].values

# توحيد الأعمدة بين train/val
common_cols = sorted(set(X_tr.columns) & set(X_val.columns))
X_tr, X_val = X_tr[common_cols], X_val[common_cols]
print("Features used:", len(common_cols))

# ===================== 4) LightGBM مع إيقاف مبكر =====================
lgbm = lgb.LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.05,
    num_leaves=64,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
lgbm.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="l2",
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)]
)
lgb_best_iter = int(getattr(lgbm, "best_iteration_", 0) or 5000)
pred_val_lgb = lgbm.predict(X_val, num_iteration=lgb_best_iter)
mae_lgb, rmse_lgb = eval_scores(y_val, pred_val_lgb, use_log)
print(f"\nLGBM  → MAE={mae_lgb:.2f} | RMSE={rmse_lgb:.2f} | best_iter={lgb_best_iter}")

# ===================== 5) XGBoost (واجهة أصلية xgboost.train) =====================
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dvalid = xgb.DMatrix(X_val, label=y_val)

xgb_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "seed": 42
}
watchlist = [(dtrain, "train"), (dvalid, "valid")]
bst = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=6000,
    evals=watchlist,
    early_stopping_rounds=200,
    verbose_eval=200
)

# التنبؤ على validation
best_iter = getattr(bst, "best_iteration", None)
best_ntree_limit = getattr(bst, "best_ntree_limit", None)
pred_val_xgb = bst.predict(dvalid, ntree_limit=best_ntree_limit) if best_ntree_limit else bst.predict(dvalid)

mae_xgb, rmse_xgb = eval_scores(y_val, pred_val_xgb, use_log)
print(f"\nXGB (native) → MAE={mae_xgb:.2f} | RMSE={rmse_xgb:.2f} | best_iter={best_iter}")

# ===================== 6) اختيار وزن التجميعة على validation =====================
ws = np.arange(0.05, 0.96, 0.05)  # وزن XGB
best_w = None
for w in ws:
    blend = w * pred_val_xgb + (1 - w) * pred_val_lgb
    mae_b, rmse_b = eval_scores(y_val, blend, use_log)
    if best_w is None or rmse_b < best_w[1]:
        best_w = (w, rmse_b, mae_b)
print(f"\n★ Best ensemble weight on val: w_xgb={best_w[0]:.2f} → RMSE={best_w[1]:.2f}, MAE={best_w[2]:.2f}")

# ===================== 7) تدريب نهائي على كامل train والتنبؤ على test =====================
drop_all_cols  = [c for c in [target_col, id_col, "pickup_datetime", "pickup_hour_ny"] if c in train_w.columns]
X_all = train_w.drop(columns=drop_all_cols).select_dtypes(include=[np.number])[common_cols]
y_all = train_w[target_col].values

# LightGBM كامل
lgbm_full = lgb.LGBMRegressor(
    n_estimators=lgb_best_iter if lgb_best_iter>0 else 3000,
    learning_rate=0.05, num_leaves=64, min_child_samples=20,
    subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
)
lgbm_full.fit(X_all, y_all)

# XGBoost كامل بنفس أفضل عدد جولات
best_rounds = int(best_iter) if isinstance(best_iter, int) and best_iter else 3000
dall  = xgb.DMatrix(X_all,  label=y_all)
bst_full = xgb.train(
    params=xgb_params,
    dtrain=dall,
    num_boost_round=best_rounds,
    verbose_eval=False
)

# تجهيز test
drop_test_cols = [c for c in [id_col, "pickup_datetime", "pickup_hour_ny"] if c in test_w.columns]
X_test = test_w.drop(columns=drop_test_cols).select_dtypes(include=[np.number])
X_test = X_test[common_cols]
dtest = xgb.DMatrix(X_test)

pred_lgb_t = lgbm_full.predict(X_test)
pred_xgb_t = bst_full.predict(dtest)

w = best_w[0]
pred_blend = w * pred_xgb_t + (1 - w) * pred_lgb_t
if use_log:
    pred_blend = np.expm1(pred_blend)

# ===================== 8) حفظ ملف التسليم =====================
sub = pd.DataFrame({
    "id": test[id_col] if id_col else np.arange(len(test)),
    "trip_duration": pred_blend
})
sub.to_csv(SUB_OUT, index=False)
print(f"\n✅ Saved ensemble submission: {SUB_OUT}")
print(sub.head())


⛅ Fetching weather UTC: 2016-01-01 05:00:00+00:00 → 2016-07-01 03:00:00+00:00
Weather shape: (4367, 8)
Features used: 15
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.800492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1611
[LightGBM] [Info] Number of data points in the train set: 1166915, number of used features: 13
[LightGBM] [Info] Start training from score 959.273585
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 1.11545e+07
Early stopping, best iteration is:
[12]	valid_0's l2: 1.04712e+07

LGBM  → MAE=536.06 | RMSE=3235.93 | best_iter=12
[0]	train-rmse:5562.34838	valid-rmse:3252.87813
[200]	train-rmse:3021.09939	valid-rmse:3525.78242
[203]	train-rmse:3018.78873	valid-rmse:3525.53109

XGB (native) → MAE=442.98 | RMSE=3526.50 | best_iter=4

★ Best ensemble weight on val: w_xgb=0.05 → RMSE=3234.