In [None]:
import pandas as pd
import numpy as np


# Read the raw trip records; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('nyc_taxi_data.csv')


In [None]:
# Preview the first rows; kept as the cell's last bare expression so the notebook renders the frame.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2.0,2016-02-29 16:40:21,2016-02-29 16:47:01,1.0,-73.953918,40.778873,-73.963875,40.771164,N,400.0
1,id0889885,1.0,2016-03-11 23:35:37,2016-03-11 23:53:57,2.0,-73.988312,40.731743,-73.994751,40.694931,N,1100.0
2,id0857912,2.0,2016-02-21 17:59:33,2016-02-21 18:26:48,2.0,-73.997314,40.721458,-73.948029,40.774918,N,1635.0
3,id3744273,2.0,2016-01-05 09:44:31,2016-01-05 10:03:32,6.0,-73.96167,40.75972,-73.956779,40.780628,N,1141.0
4,id0232939,1.0,2016-02-17 06:42:23,2016-02-17 06:56:31,1.0,-74.01712,40.708469,-73.988182,40.740631,N,848.0


In [None]:
# Print the column list with non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357942 entries, 0 to 357941
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  357942 non-null  object 
 1   vendor_id           357942 non-null  float64
 2   pickup_datetime     357942 non-null  object 
 3   dropoff_datetime    357942 non-null  object 
 4   passenger_count     357942 non-null  float64
 5   pickup_longitude    357942 non-null  float64
 6   pickup_latitude     357942 non-null  float64
 7   dropoff_longitude   357942 non-null  float64
 8   dropoff_latitude    357942 non-null  float64
 9   store_and_fwd_flag  357942 non-null  object 
 10  trip_duration       357942 non-null  float64
dtypes: float64(7), object(4)
memory usage: 30.0+ MB


In [None]:
import pandas as pd
import numpy as np


# Keep a random 25% of the rows (row count truncated to an integer).
SAMPLE_FRACTION = 0.25
rows_to_keep = int(len(df) * SAMPLE_FRACTION)

# A fixed random_state makes the sampled subset identical on every run;
# the index is rebuilt so the subset is numbered 0..n-1.
reduced_df = df.sample(n=rows_to_keep, random_state=42).reset_index(drop=True)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly: wrapping it in print() emitted a stray "None" line.
reduced_df.info()

# Persist the subset for the following cells; index=False avoids writing the
# positional index as an extra column.
reduced_df.to_csv('nyc_taxi_data_new.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89485 entries, 0 to 89484
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  89485 non-null  object 
 1   vendor_id           89485 non-null  float64
 2   pickup_datetime     89485 non-null  object 
 3   dropoff_datetime    89485 non-null  object 
 4   passenger_count     89485 non-null  float64
 5   pickup_longitude    89485 non-null  float64
 6   pickup_latitude     89485 non-null  float64
 7   dropoff_longitude   89485 non-null  float64
 8   dropoff_latitude    89485 non-null  float64
 9   store_and_fwd_flag  89485 non-null  object 
 10  trip_duration       89485 non-null  float64
dtypes: float64(7), object(4)
memory usage: 7.5+ MB
None


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from math import radians, cos, sin, asin, sqrt
from datetime import datetime

# Read the down-sampled trip file
df = pd.read_csv('nyc_taxi_data_new.csv')

# Parse both timestamp columns from strings into datetime values
for timestamp_col in ('pickup_datetime', 'dropoff_datetime'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

# Derive calendar features from the pickup timestamp
pickup_time = df['pickup_datetime'].dt
df['pickup_day_of_week'] = pickup_time.dayofweek
df['pickup_hour'] = pickup_time.hour

# Calculate Haversine distance
def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in kilometres between two points on the Earth.

    All four arguments are in decimal degrees. They may be plain floats or
    equally shaped NumPy arrays / pandas Series, in which case the distance
    is computed element-wise and an array-like of the same shape is returned.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s).
    lon2, lat2 : longitude and latitude of the second point(s).

    Returns
    -------
    Distance(s) in kilometres, using an Earth radius of 6371 km.
    """
    # Convert decimal degrees to radians (NumPy ufuncs accept scalars and arrays alike)
    lon1, lat1, lon2, lat2 = (np.radians(value) for value in (lon1, lat1, lon2, lat2))
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can nudge `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would put arcsin outside its domain; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r

# Column order matches haversine's (lon1, lat1, lon2, lat2) positional arguments,
# so each row can be unpacked straight into the call.
coordinate_cols = ['pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude']
df['haversine_distance'] = df[coordinate_cols].apply(lambda coords: haversine(*coords), axis=1)

# Remove the identifier, the raw timestamps and the store-and-forward flag,
# then discard any row that still contains a missing value
unused_columns = ['id', 'pickup_datetime', 'dropoff_datetime', 'store_and_fwd_flag']
df = df.drop(columns=unused_columns).dropna()

# Target is the trip duration; every remaining column is a feature
y = df['trip_duration']
X = df.drop(columns=['trip_duration'])

# Hold out 20% of the rows for testing (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics on the training rows only, then apply them to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)





In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

# Random forest baseline; fit() returns the fitted estimator itself
rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)

# Gradient boosting baseline
gb_model = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
).fit(X_train, y_train)

# Write both fitted estimators to disk; the second dump stays the cell's
# last expression, so its return value is what the cell displays
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(gb_model, 'gradient_boosting_model.pkl')


['gradient_boosting_model.pkl']

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert the scaled training features and the training targets to float32 tensors
X_train_tensor, y_train_tensor = (
    torch.tensor(values, dtype=torch.float32)
    for values in (X_train, y_train.values)
)

# Pair features with targets and serve them in shuffled mini-batches of 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Define GAN components
class Generator(nn.Module):
    """Fully connected network mapping ``input_dim`` inputs to ``output_dim`` outputs.

    Layer stack: Linear(input_dim, 128) -> ReLU -> Linear(128, 256) -> ReLU
    -> Linear(256, output_dim), with no activation on the output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        layers = []
        in_features = input_dim
        # Hidden layers, each followed by a ReLU
        for hidden_units in (128, 256):
            layers.append(nn.Linear(in_features, hidden_units))
            layers.append(nn.ReLU())
            in_features = hidden_units
        layers.append(nn.Linear(in_features, output_dim))
        # Kept under the attribute name ``model`` so state_dict keys remain ``model.<index>.*``
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        return self.model(x)

class Discriminator(nn.Module):
    """Fully connected network scoring an ``input_dim``-wide vector with one value in (0, 1).

    Layer stack: Linear(input_dim, 256) -> ReLU -> Linear(256, 128) -> ReLU
    -> Linear(128, 1) -> Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        in_features = input_dim
        # Hidden layers, each followed by a ReLU
        for hidden_units in (256, 128):
            layers.append(nn.Linear(in_features, hidden_units))
            layers.append(nn.ReLU())
            in_features = hidden_units
        # Single output unit squashed by a sigmoid
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())
        # Kept under the attribute name ``model`` so state_dict keys remain ``model.<index>.*``
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the sigmoid output."""
        return self.model(x)

# Hyperparameters
input_dim = X_train.shape[1]  # width of the noise vector == number of feature columns
output_dim = 1                # the generator emits one value per sample (the target)
lr = 0.0002
num_epochs = 200

# Seed PyTorch so weight initialisation, batch shuffling and noise draws repeat
# from run to run (the rest of the notebook already uses random_state=42).
torch.manual_seed(42)

# The features are standardised but the target is in raw seconds. In the recorded
# run every epoch logged "D Loss: 100.0, G Loss: 0.0": BCELoss clamps its log terms
# at -100, so this is a fully saturated discriminator with no gradient left.
# Most likely cause: the raw-scale target dominating the discriminator input.
# Train the GAN on a standardised target and undo the scaling when sampling.
target_mean = y_train_tensor.mean()
target_std = y_train_tensor.std().clamp_min(1e-8)  # guard against a constant target

# Initialize models
generator = Generator(input_dim, output_dim)
discriminator = Discriminator(input_dim + output_dim)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=lr)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

# Loss function (the discriminator already ends in a Sigmoid, so plain BCE applies)
criterion = nn.BCELoss()

# Training loop
for epoch in range(num_epochs):
    for real_data, real_labels in train_loader:
        batch_size = real_data.size(0)

        # Real data: features joined with the standardised target
        real_labels = (real_labels.view(-1, 1) - target_mean) / target_std
        real_input = torch.cat((real_data, real_labels), dim=1)
        real_target = torch.ones(batch_size, 1)

        # Fake data: one generator pass per batch, reused by both updates below
        noise = torch.randn(batch_size, input_dim)
        generated_labels = generator(noise)
        # detach() keeps the discriminator update from back-propagating into the generator
        fake_input = torch.cat((noise, generated_labels.detach()), dim=1)
        fake_target = torch.zeros(batch_size, 1)

        # Train Discriminator
        optimizer_D.zero_grad()
        real_loss = criterion(discriminator(real_input), real_target)
        fake_loss = criterion(discriminator(fake_input), fake_target)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

        # Train Generator: it is rewarded when the discriminator labels its output as real
        optimizer_G.zero_grad()
        g_input = torch.cat((noise, generated_labels), dim=1)
        g_loss = criterion(discriminator(g_input), real_target)
        g_loss.backward()
        optimizer_G.step()

    # Losses reported are those of the last mini-batch of the epoch
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, D Loss: {d_loss.item()}, G Loss: {g_loss.item()}')

# Save the GAN models
torch.save(generator.state_dict(), 'gan_generator.pth')
torch.save(discriminator.state_dict(), 'gan_discriminator.pth')

# Generate synthetic data (one synthetic row per training row)
with torch.no_grad():
    synthetic_noise = torch.randn(X_train.shape[0], input_dim)
    # Map the generator output from standardised units back to the original target scale
    synthetic_labels = (generator(synthetic_noise) * target_std + target_mean).numpy()
    synthetic_data = np.hstack((synthetic_noise.numpy(), synthetic_labels))

# NOTE(review): the synthetic "features" are the Gaussian noise itself, not samples
# shaped like real trips; only the label comes from the generator. Confirm this is
# the intended augmentation scheme.
# Augment original data with synthetic data
augmented_X_train = np.vstack((X_train, synthetic_data[:, :-1]))
augmented_y_train = np.hstack((y_train, synthetic_data[:, -1]))


Epoch 0, D Loss: 100.0, G Loss: 0.0
Epoch 10, D Loss: 100.0, G Loss: 0.0
Epoch 20, D Loss: 100.0, G Loss: 0.0
Epoch 30, D Loss: 100.0, G Loss: 0.0
Epoch 40, D Loss: 100.0, G Loss: 0.0
Epoch 50, D Loss: 100.0, G Loss: 0.0
Epoch 60, D Loss: 100.0, G Loss: 0.0
Epoch 70, D Loss: 100.0, G Loss: 0.0
Epoch 80, D Loss: 100.0, G Loss: 0.0
Epoch 90, D Loss: 100.0, G Loss: 0.0
Epoch 100, D Loss: 100.0, G Loss: 0.0
Epoch 110, D Loss: 100.0, G Loss: 0.0
Epoch 120, D Loss: 100.0, G Loss: 0.0
Epoch 130, D Loss: 100.0, G Loss: 0.0
Epoch 140, D Loss: 100.0, G Loss: 0.0
Epoch 150, D Loss: 100.0, G Loss: 0.0
Epoch 160, D Loss: 100.0, G Loss: 0.0
Epoch 170, D Loss: 100.0, G Loss: 0.0
Epoch 180, D Loss: 100.0, G Loss: 0.0
Epoch 190, D Loss: 100.0, G Loss: 0.0


In [None]:
# Fit fresh copies of both regressors on the augmented training set
aug_rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
).fit(augmented_X_train, augmented_y_train)

aug_gb_model = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
).fit(augmented_X_train, augmented_y_train)

# Predict on the held-out test split with the baseline and augmented models
rf_preds = rf_model.predict(X_test)
aug_rf_preds = aug_rf_model.predict(X_test)

gb_preds = gb_model.predict(X_test)
aug_gb_preds = aug_gb_model.predict(X_test)


def _print_errors(label, predictions):
    """Print the RMSE and then the MAE of ``predictions`` against ``y_test``."""
    print(f"{label} RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))
    print(f"{label} MAE:", mean_absolute_error(y_test, predictions))


# Same reporting order as before: each baseline followed by its augmented counterpart
_print_errors("Random Forest", rf_preds)
_print_errors("Augmented Random Forest", aug_rf_preds)
_print_errors("Gradient Boosting", gb_preds)
_print_errors("Augmented Gradient Boosting", aug_gb_preds)


Random Forest RMSE: 3321.537292269749
Random Forest MAE: 436.8600738304324
Augmented Random Forest RMSE: 3297.732187650002
Augmented Random Forest MAE: 432.1356295794523
Gradient Boosting RMSE: 3346.478895037519
Gradient Boosting MAE: 434.9024194471636
Augmented Gradient Boosting RMSE: 3314.4142261292677
Augmented Gradient Boosting MAE: 429.877135057753
