# In [None]:
#1 Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('uber.csv')

# Report how many values are missing in each column, then keep only the rows
# that are fully populated (imputing the gaps would be the alternative).
missing_per_column = df.isnull().sum()
print(missing_per_column)
df = df.dropna()

# Derive calendar features from the pickup timestamp, then discard the raw
# timestamp together with the 'key' column.
pickup_ts = pd.to_datetime(df['pickup_datetime'])
df = df.assign(
    pickup_hour=pickup_ts.dt.hour,
    pickup_day=pickup_ts.dt.day,
    pickup_month=pickup_ts.dt.month,
    pickup_dayofweek=pickup_ts.dt.dayofweek,
).drop(columns=['pickup_datetime', 'key'])


# Outlier handling
# Box plot of the fare before filtering, for a visual before/after comparison.
sns.boxplot(x=df['fare_amount'])
plt.show()

# Keep a row only when every one of these columns lies strictly within
# 3 standard deviations of that column's mean (absolute z-score < 3).
from scipy import stats

outlier_columns = ['fare_amount', 'pickup_latitude', 'pickup_longitude',
                   'dropoff_latitude', 'dropoff_longitude']
z_scores = np.abs(stats.zscore(df[outlier_columns]))
# .copy() detaches the filtered frame from the one it was sliced from, so
# columns added to `df` afterwards do not raise SettingWithCopyWarning.
df = df[(z_scores < 3).all(axis=1)].copy()

# Same box plot after the outlier rows have been removed.
sns.boxplot(x=df['fare_amount'])
plt.show()


# Correlation heat map across all remaining columns
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

# 5. Define the haversine function and use it to convert the pickup/dropoff longitude and latitude into a trip distance
def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """Return the great-circle distance between two points on a sphere.

    Implements the haversine formula with NumPy, so the arguments may be
    scalars or array-likes (e.g. pandas Series) and are combined elementwise.
    Note the argument order: longitude comes before latitude.

    Args:
        lon1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lon2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.
        radius: Radius of the sphere; the result is expressed in the same
            unit. Defaults to 6371, the Earth's radius in kilometres.

    Returns:
        The distance(s) along the sphere's surface, in the unit of ``radius``.
    """
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # For (near-)antipodal points floating-point rounding can push `a` a hair
    # above 1, which would turn sqrt(1 - a) into NaN; cap it at 1. NaNs coming
    # from the inputs themselves still propagate through np.minimum.
    a = np.minimum(a, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

# Trip length from the pickup/dropoff coordinates (haversine takes lon before lat)
df['distance'] = haversine(
    lon1=df['pickup_longitude'],
    lat1=df['pickup_latitude'],
    lon2=df['dropoff_longitude'],
    lat2=df['dropoff_latitude'],
)


# Split the data into train/test sets and standardise the features
from sklearn.preprocessing import StandardScaler

# The fare is the target; every other column of df is used as a feature.
# NOTE(review): any identifier-like column still present in df (e.g. a saved
# row index) becomes a feature here — confirm the column list is intended.
y = df['fare_amount']
X = df.drop(columns='fare_amount')

# Hold out 20% of the rows for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Train the linear model
# Fit a linear regression on the standardised training features, then predict
# fares for the standardised test features.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)


# Train the random forest
# 100 trees with a fixed seed, fitted on the unscaled training features, then
# used to predict fares for the unscaled test features.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

def evaluate_model(y_true, y_pred, model_name):
    """Print the RMSE and R-squared of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label printed in the report header.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(squared_error)
    r2 = r2_score(y_true, y_pred)

    report_lines = (
        f'{model_name} Performance:',
        f'Root Mean Squared Error (RMSE): {rmse:.2f}',
        f'R-Squared (R2): {r2:.2f}\n',
    )
    print(*report_lines, sep='\n')

# Report the metrics for both models (linear regression first)
for model_label, predictions in (('Linear Regression', y_pred_lr),
                                 ('Random Forest', y_pred_rf)):
    evaluate_model(y_test, predictions, model_label)


# Predicted vs. actual fares; the red diagonal marks perfect predictions
fare_min, fare_max = min(y_test), max(y_test)

plt.figure(figsize=(10, 6))
for model_label, predictions in (('Random Forest', y_pred_rf),
                                 ('Linear Regression', y_pred_lr)):
    plt.scatter(y_test, predictions, label=model_label, alpha=0.5)
plt.plot([fare_min, fare_max], [fare_min, fare_max], color='red', linewidth=2)
plt.legend()
plt.title('Actual vs Predicted Fare Amount')
plt.xlabel('Actual Fare Amount')
plt.ylabel('Predicted Fare Amount')
plt.show()



# Function to predict price for new locations
def predict_uber_price(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, passengers, pickuptime):
    """Predict the fare of a single ride with both trained models.

    Relies on module-level state: the ``haversine`` function, the fitted
    ``scaler``, the fitted models ``lr`` and ``rf``, and the training frame
    ``X_train`` (used for the feature names and their order).

    Args:
        pickup_lat: Pickup latitude in degrees.
        pickup_lon: Pickup longitude in degrees.
        dropoff_lat: Dropoff latitude in degrees.
        dropoff_lon: Dropoff longitude in degrees.
        passengers: Number of passengers.
        pickuptime: Pickup time, in any form ``pd.to_datetime`` accepts.

    Returns:
        Tuple ``(lr_price, rf_price)`` with the linear-regression and
        random-forest predictions.

    Raises:
        ValueError: If the models were trained on a column that cannot be
            derived from the arguments of this function.
    """
    # Calculate distance (haversine takes longitude before latitude)
    distance = haversine(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)
    pickup_datetime = pd.to_datetime(pickuptime)

    # Every feature this function knows how to derive, keyed by the column
    # name used in the training frame.
    features = {
        'pickup_longitude': pickup_lon,
        'pickup_latitude': pickup_lat,
        'dropoff_longitude': dropoff_lon,
        'dropoff_latitude': dropoff_lat,
        # NOTE(review): 'passenger_count' is the assumed name of the passenger
        # column in uber.csv — confirm against the dataset.
        'passenger_count': passengers,
        'pickup_hour': pickup_datetime.hour,
        'pickup_day': pickup_datetime.day,
        'pickup_month': pickup_datetime.month,
        'pickup_dayofweek': pickup_datetime.dayofweek,
        'distance': distance,
    }

    # The models expect exactly the training columns, in the training order.
    training_columns = list(X_train.columns)
    underivable = [col for col in training_columns if col not in features]
    if underivable:
        raise ValueError(
            "Cannot build a prediction row: the models were trained on "
            f"columns that cannot be derived from the ride details: {underivable}"
        )
    X_pred = pd.DataFrame([features], columns=training_columns)

    # Get predictions. The linear model was fitted on standardised features,
    # so its input must go through the same scaler; the random forest was
    # fitted on the raw features.
    lr_price = lr.predict(scaler.transform(X_pred))[0]
    rf_price = rf.predict(X_pred)[0]

    print("\nPrediction Results:")
    print(f"Distance: {distance:.2f} km")
    print(f"Linear Regression Prediction: ${lr_price:.2f}")
    print(f"Random Forest Prediction: ${rf_price:.2f}")

    return lr_price, rf_price

# Example prediction for one hard-coded ride

print("\n6. Example Prediction:")

sample_ride = {
    'pickup_lat': 40.7614,
    'pickup_lon': -73.99981689,
    'dropoff_lat': 40.73835373,
    'dropoff_lon': -73.99951172,
    'passengers': 1,
    'pickuptime': "2023-11-11 15:30:00",
}
sample_prices = predict_uber_price(**sample_ride)