In [7]:
# Install pandas and scikit-learn into the environment of the running kernel.
%pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
# Install necessary packages into the environment of the running kernel.
# Each %pip line is a separate pip invocation; pip notes that a kernel
# restart may be needed before newly installed packages can be used.
%pip install pandas
%pip install scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

# Load the dataset from a path relative to the working directory.
# Any failure is reported once and then re-raised so the cell stops here
# instead of continuing without a usable `data` frame.
data_path = 'preprocessed_dataset.csv'
try:
    if os.path.exists(data_path):
        data = pd.read_csv(data_path)
    else:
        raise FileNotFoundError(f"Dataset file not found at {data_path}")
    record_count = len(data)
    if record_count == 0:
        raise ValueError("Dataset is empty")
except Exception as load_error:
    print(f"Error loading data: {load_error}")
    raise
else:
    print(f"Successfully loaded {record_count} records")

# Display the first few rows to verify it loaded correctly
print(data.head())

# Column dtypes and non-null counts. DataFrame.info() writes its report
# itself and returns None, so it is called directly; wrapping it in print()
# would add a stray "None" line to the output.
data.info()

# Get summary statistics for numerical columns
print(data.describe())

# Check for missing values (count per column)
print(data.isnull().sum())

# Drop every row that contains at least one missing value
data = data.dropna()

print("Column Names:", data.columns)
print("First 5 Rows:\n", data.head())

# Fail fast if a column that the later steps index directly is absent.
# This replaces a check that converted data['Date']: column lookup is
# case-sensitive and the rest of the notebook uses the lower-case 'date'
# column, so on such a frame that check only ever printed a swallowed
# KeyError and carried on.
required_columns = [
    'days_left', 'date', 'departure_time', 'arrival_time',
    'source_city', 'destination_city', 'class', 'airline', 'price',
]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Remove rows where 'days_left' is "Unknown".
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
data = data[data['days_left'] != "Unknown"].copy()

# Convert 'days_left' to a nullable integer; anything that is not numeric
# is coerced to a missing value rather than raising.
data['days_left'] = pd.to_numeric(data['days_left'], errors='coerce').astype('Int64')

# Parse 'date' as day-month-year; strings that do not match become NaT ...
data['date'] = pd.to_datetime(data['date'], format="%d-%m-%Y", errors='coerce')
# ... and are then imputed with the median of the dates that did parse.
data['date'] = data['date'].fillna(data['date'].median())

# Extract date features
data['Day'] = data['date'].dt.day
data['Month'] = data['date'].dt.month
data['Year'] = data['date'].dt.year

# The raw date is no longer needed once its parts are extracted.
# Reassigning (instead of inplace=True) keeps the step free of in-place
# mutation of a possibly shared frame.
data = data.drop(columns=['date'])

# Handle non-time values in 'departure_time' and 'arrival_time':
# named day-parts are replaced by a representative clock time.
time_mappings = {
    "Evening": "18:00",
    "Morning": "08:00",
    "Night": "22:00",
    "Early_Morning": "05:00"
}
data['departure_time'] = data['departure_time'].replace(time_mappings)
data['arrival_time'] = data['arrival_time'].replace(time_mappings)

# Convert time columns to hours. Values that are not in HH:MM form after the
# replacement above (for example a label missing from time_mappings) are
# coerced to NaT, which yields a missing hour.
departure_parsed = pd.to_datetime(data['departure_time'], format='%H:%M', errors='coerce')
arrival_parsed = pd.to_datetime(data['arrival_time'], format='%H:%M', errors='coerce')
data['departure_time_hour'] = departure_parsed.dt.hour
data['arrival_time_hour'] = arrival_parsed.dt.hour

# Rows with an unparseable time are dropped below. Report how many and which
# raw values caused it, so that silent data loss from an incomplete
# time_mappings table is visible in the output.
unparsed_rows = departure_parsed.isna() | arrival_parsed.isna()
if unparsed_rows.any():
    unparsed_values = sorted(
        set(data.loc[departure_parsed.isna(), 'departure_time'].astype(str))
        | set(data.loc[arrival_parsed.isna(), 'arrival_time'].astype(str))
    )
    print(
        f"Dropping {int(unparsed_rows.sum())} rows with unrecognised time values "
        f"(showing up to 10): {unparsed_values[:10]}"
    )

# Drop rows without a usable hour, then drop the original time columns
data = (
    data
    .dropna(subset=['departure_time_hour', 'arrival_time_hour'])
    .drop(columns=['departure_time', 'arrival_time'])
)

# One-hot encode categorical features
categorical_columns = ['source_city', 'destination_city', 'class', 'airline']
data = pd.get_dummies(data, columns=categorical_columns)

# Convert all columns that should be numeric.
# errors='coerce' turns every non-numeric value into NaN, so a leftover text
# column would silently become entirely NaN. Coercions are therefore reported,
# and feature columns that end up with no usable values are removed instead
# of being passed on to scaling and training.
all_nan_columns = []
for col in data.columns:
    missing_before = int(data[col].isna().sum())
    data[col] = pd.to_numeric(data[col], errors='coerce')
    missing_after = int(data[col].isna().sum())
    if missing_after > missing_before:
        print(f"Column '{col}': {missing_after - missing_before} non-numeric values coerced to NaN")
    # The target column is never dropped here; later steps index it directly.
    if col != 'price' and len(data) > 0 and missing_after == len(data):
        all_nan_columns.append(col)

if all_nan_columns:
    print(f"Dropping columns with no numeric values: {all_nan_columns}")
    data = data.drop(columns=all_nan_columns)

# Prepare Data for Training
# Numeric feature columns (the target 'price' excluded) are the ones scaled.
numerical_cols = [
    col for col in data.select_dtypes(include=['number']).columns if col != 'price'
]
X = data.drop(columns=['price'])
y = data['price']

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
# The scaler is fitted on the training rows only and then applied to the test
# rows. Fitting it on the full dataset before splitting would let test-set
# statistics (mean / standard deviation) leak into the training features.
# NOTE(review): `data` and `X` themselves are left unscaled by this step;
# only X_train / X_test hold scaled values.
scaler = StandardScaler()
X_train = X_train.copy()  # own the split frames before assigning into them
X_test = X_test.copy()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Train Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate Model on the held-out test rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Example Prediction
# One hand-written flight expressed in the model's feature space.
# The route is Mumbai -> Delhi; source and destination must differ (using the
# same city for both does not describe a real flight).
new_flight = {
    'days_left': 5,
    'duration': 2,
    'stops': 0,
    'departure_time_hour': 18,
    'arrival_time_hour': 18,
    'Day': 15,
    'Month': 7,
    'Year': 2024,
    'source_city_Delhi': 0,
    'source_city_Mumbai': 1,
    'source_city_Bangalore': 0,
    'source_city_Chennai': 0,
    'destination_city_Delhi': 1,
    'destination_city_Mumbai': 0,
    'destination_city_Bangalore': 0,
    'destination_city_Chennai': 0,
    'class_Economy': 1,
    'class_Business': 0,
    'airline_AirIndia': 1,
    'airline_Indigo': 0,
    'airline_SpiceJet': 0
}

# reindex() below discards every key that is not a training column and fills
# every training column that is not a key with 0. Surface the discarded keys
# so a mis-spelled feature name does not silently change the prediction.
unknown_features = [name for name in new_flight if name not in X.columns]
if unknown_features:
    print(f"Warning: ignoring features not seen during training: {unknown_features}")

# Convert to DataFrame with exactly the training columns, in training order
new_flight_df = pd.DataFrame([new_flight])
new_flight_df = new_flight_df.reindex(columns=X.columns, fill_value=0)

# Scale numerical features with the scaler fitted earlier
new_flight_df[numerical_cols] = scaler.transform(new_flight_df[numerical_cols])

# Predict Price
predicted_price = model.predict(new_flight_df)[0]
print(f"Predicted Price: {predicted_price}")

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Latitude/OneDrive/Attachments/Desktop/PROJECT @KILL/preprocessed_dataset.csv'