In [6]:
'''import pandas as pd
from sklearn.cluster import KMeans
import heapq
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Load your dataframe - Replace 'your_data.csv' with your actual file
df = pd.read_csv('train.csv')

# 1. Clustering based on Latitude and Longitude
def perform_clustering(df, n_clusters=5):
    """Clusters data based on source and destination coordinates."""
    coordinates = df[['source_lat', 'source_long', 'destination_lat', 'destination_long']].values
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # Explicitly set n_init
    df['cluster'] = kmeans.fit_predict(coordinates)
    return df

df = perform_clustering(df)

# 2. A* Pathfinding (Simplified - using Haversine distance)
def haversine(lat1, lon1, lat2, lon2):
    """Calculates the Haversine distance between two points on Earth."""
    R = 6371  # Radius of Earth in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

def a_star(df, start_city, end_city, cluster_id):
    """Finds a path between two cities within a cluster using A*."""
    cluster_data = df[df['cluster'] == cluster_id]
    
    # Create a graph of cities within the cluster
    graph = {}
    for index, row in cluster_data.iterrows():
        city_name = row['city_x']  # Or use a combined city name if needed
        graph[city_name] = {
            'lat': row['source_lat'],
            'lon': row['source_long'],
            'neighbors': {}
        }
    
    # Populate neighbors (simplified - connect to all other cities in the cluster)
    for city1 in graph:
        for city2 in graph:
            if city1 != city2:
                lat1 = graph[city1]['lat']
                lon1 = graph[city1]['lon']
                lat2 = graph[city2]['lat']
                lon2 = graph[city2]['lon']
                distance = haversine(lat1, lon1, lat2, lon2)
                graph[city1]['neighbors'][city2] = distance
    
    # A* implementation
    open_set = [(0, start_city)]  # (f_score, city)
    came_from = {}
    g_score = {city: float('inf') for city in graph}
    g_score[start_city] = 0
    f_score = {city: float('inf') for city in graph}
    f_score[start_city] = haversine(graph[start_city]['lat'], graph[start_city]['lon'],
                                     graph[end_city]['lat'], graph[end_city]['lon'])
    
    while open_set:
        f, current_city = heapq.heappop(open_set)
        
        if current_city == end_city:
            path = []
            while current_city in came_from:
                path.append(current_city)
                current_city = came_from[current_city]
            path.append(start_city)
            path.reverse()
            return path
        
        for neighbor, distance in graph[current_city]['neighbors'].items():
            temp_g_score = g_score[current_city] + distance
            if temp_g_score < g_score[neighbor]:
                came_from[neighbor] = current_city
                g_score[neighbor] = temp_g_score
                f_score[neighbor] = temp_g_score + haversine(graph[neighbor]['lat'], graph[neighbor]['lon'],
                                                             graph[end_city]['lat'], graph[end_city]['lon'])
                heapq.heappush(open_set, (f_score[neighbor], neighbor))
    
    return None  # No path found

# Example usage of A*
cluster_id = 0  # Select a cluster
start_city = df[df['cluster'] == cluster_id]['city_x'].iloc[0]  # Example start city
end_city = df[df['cluster'] == cluster_id]['city_x'].iloc[100]    # Example end city
path = a_star(df, start_city, end_city, cluster_id)

if path:
    print(f"A* Path from {start_city} to {end_city} (Cluster {cluster_id}): {path}")
else:
    print(f"No path found from {start_city} to {end_city} (Cluster {cluster_id})")

# 3. XGBoost Model for Time and Distance Estimation
def train_xgboost_model(df, features, target):
    """Trains an XGBoost model for a given target variable."""
    X = df[features]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror',  # Specify objective function
                                 n_estimators=100,  # Number of boosting rounds
                                 learning_rate=0.1,
                                 max_depth=5,
                                 random_state=42)
    
    xgb_model.fit(X_train, y_train)
    
    y_pred = xgb_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"XGBoost RMSE ({target}): {rmse:.4f}")
    
    return xgb_model

# Define features for XGBoost
features = ['osrm_time', 'start_scan_to_end_scan', 'osrm_distance', 'segment_actual_time', 'segment_osrm_time', 'segment_osrm_distance', 'source_lat', 'source_long', 'destination_lat', 'destination_long']

# Train models for actual_time and actual_distance_to_destination
time_model = train_xgboost_model(df, features, 'actual_time')
distance_model = train_xgboost_model(df, features, 'actual_distance_to_destination')

# Example Prediction
example_data = df[features].iloc[[0]]  # Use the first row as an example
predicted_time = time_model.predict(example_data)[0]
predicted_distance = distance_model.predict(example_data)[0]

print(f"Predicted Time: {predicted_time:.2f}")
print(f"Predicted Distance: {predicted_distance:.2f}")
'''

'import pandas as pd\nfrom sklearn.cluster import KMeans\nimport heapq\nimport xgboost as xgb\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\nimport numpy as np\n\n# Load your dataframe - Replace \'your_data.csv\' with your actual file\ndf = pd.read_csv(\'train.csv\')\n\n# 1. Clustering based on Latitude and Longitude\ndef perform_clustering(df, n_clusters=5):\n    """Clusters data based on source and destination coordinates."""\n    coordinates = df[[\'source_lat\', \'source_long\', \'destination_lat\', \'destination_long\']].values\n    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # Explicitly set n_init\n    df[\'cluster\'] = kmeans.fit_predict(coordinates)\n    return df\n\ndf = perform_clustering(df)\n\n# 2. A* Pathfinding (Simplified - using Haversine distance)\ndef haversine(lat1, lon1, lat2, lon2):\n    """Calculates the Haversine distance between two points on Earth."""\n    R = 6371  # Radius of Ear

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import euclidean
from xgboost import XGBRegressor
import joblib
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score

ddf = pd.read_csv("nR.csv")
#ddf = ddf.drop(['data'], axis=1)
df = ddf[ddf['data'] == 'training']
#df.isnull().sum() #verifying that the data is consistent
ddf#df.describe()

Unnamed: 0,data,source_name,destination_name,od_start_time,od_end_time,start_scan_to_end_scan,actual_distance_to_destination,actual_time,osrm_time,osrm_distance,...,postcode_y,city_x,city_y,county_x,county_y,Sstate,Dstate,loc_formatted_x,loc_formatted_y,haversine_distance
0,training,Anand_VUNagar_DC (Gujarat),Khambhat_MotvdDPP_D (Gujarat),2018-09-20 03:21:32.418600,2018-09-20 04:47:45.236797,86.0,10.435660,14.0,11.0,11.9653,...,388600.0,Anand,Khambhat,Anand City Taluka,Khambhat Taluka,Gujarat,Gujarat,"Anand, GJ, India","Khambhat, GJ, India",43.973301
1,training,Khambhat_MotvdDPP_D (Gujarat),Anand_Vaghasi_IP (Gujarat),2018-09-20 04:47:45.236797,2018-09-20 06:36:55.627764,109.0,10.403038,15.0,11.0,12.1171,...,388320.0,Khambhat,Vaghasi,Khambhat Taluka,Anand Rural Taluka,Gujarat,Gujarat,"Khambhat, GJ, India","Vaghasi, GJ, India",46.440528
2,training,Bhiwandi_Mankoli_HB (Maharashtra),Pune_Tathawde_H (Maharashtra),2018-09-23 06:42:06.021680,2018-09-23 11:44:28.365845,302.0,23.194334,38.0,24.0,26.8622,...,400603.0,Mankoli,Thane,Bhiwandi Taluka,Thane,Maharashtra,Maharashtra,"Mankoli, MH, India","Maharashtra Bank, Thane - 400603, MH, India",8.579809
3,training,LowerParel_CP (Maharashtra),Mumbai_Chndivli_PC (Maharashtra),2018-09-14 15:42:46.437249,2018-09-14 17:31:45.368791,108.0,9.355852,46.0,11.0,11.4344,...,,Thane,Mumbai,Thane,,Maharashtra,Maharashtra,"Maharashtra Bank, Thane - 400603, MH, India","Mumbai, MH, India",19.120215
4,training,Bangalore_Nelmngla_H (Karnataka),Bengaluru_Bomsndra_HB (Karnataka),2018-09-13 20:44:19.424489,2018-09-13 23:59:56.061158,195.0,23.635811,30.0,30.0,28.9765,...,560001.0,Bengaluru,Bengaluru,Bangalore North,Bangalore North,Karnataka,Karnataka,"Bengaluru, KA, India","Bengaluru, KA, India",0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,training,Noida_Surajpur_DC (Uttar Pradesh),Noida_Sec-83_DC (Uttar Pradesh),2018-09-24 02:42:45.423895,2018-09-24 06:18:55.981290,216.0,10.677769,32.0,15.0,13.3895,...,201301.0,Noida,Noida,Gautam Buddha Nagar,Gautam Buddha Nagar,Uttar Pradesh,Uttar Pradesh,"Dadri (Noida), UP, India","B1, Sec-8, Noida - 201301, UP, India",2.322455
2770,test,Kaghaznagar_Central_DPP_1 (Telangana),Karimnagar_KamnHbRD_I (Telangana),2018-09-30 04:32:44.757914,2018-09-30 08:55:48.335278,263.0,22.180996,38.0,18.0,23.3351,...,,Kaghaznagar,Telangana,Kagaznagar mandal,Warangal,Telangana,Telangana,"Kaghaznagar, TG, India","Telangana, WA, India",150.692864
2771,training,Chandigarh_Mehmdpur_H (Punjab),Naraingarh_Ward2DPP_D (Haryana),2018-09-12 01:06:44.904707,2018-09-12 03:55:15.023521,168.0,9.920661,16.0,9.0,11.6130,...,,Sehke,Naraingarh,Sangrur,Ambala,Punjab,Haryana,"Chandigarh, Sehke, SR, India","Naraingarh, AM, India",110.392131
2772,training,Mumbai_Sanpada_I (Maharashtra),Mumbai_Ghansoli_DC (Maharashtra),2018-09-26 04:06:47.129213,2018-09-26 12:41:37.272375,514.0,9.993929,34.0,12.0,15.5114,...,,Navi Mumbai,Navi Mumbai,Thane Taluka,Thane Taluka,Maharashtra,Maharashtra,"Sanpada Station Road, Sanpada, Navi Mumbai - 4...","Ghansoli Bus Depot, Ghansoli, Navi Mumbai, Tha...",6.483400


In [8]:
try:
    print("Using GPU-accelerated KMeans from cuml")
    kmeans = KMeans(n_clusters=5, random_state=108)
    kmeans.fit(df[['Sstate', 'Dsstate']])
    df['cluster'] = kmeans.labels_
except:
    print("Error occured")
print("K-means clustering completed")

# xgboost
print("Training XGBoost model ...")
features = ['osrm_time','haversine_distance','start_scan_to_end_scan', 'osrm_distance', 'segment_actual_time','segment_osrm_time','segment_osrm_distance']
X = df[features]
y = df[['actual_distance_to_destination','actual_time']]
model = XGBRegressor(random_state=108)  # Enable GPU support
model.fit(X, y)
print("XGBoost model training completed")
# Assuming 'model' is your trained model
#joblib.dump(model, 'route-optmiz.joblib')

#train calc
xtrp = model.predict(X)
trmse = np.sqrt(mean_squared_error(y, xtrp))
tr2 = r2_score(y, xtrp)
#testing
td = ddf[ddf['data'] == 'test']
X_te = td[features]
y_te = td[['actual_distance_to_destination','actual_time']]
y_pred = model.predict(X_te)
rmse = np.sqrt(mean_squared_error(y_te, y_pred))
r2 = r2_score(y_te, y_pred)
#joblib.dump(model, 'route-optmiz.joblib')
print(f"Model Evaluation:\nRMSE: {rmse:.10f}\nR² Score: {r2:.10f}\n\n")
print(f"Train RMSE: {trmse:.4f}, Test RMSE: {rmse:.4f}")
print(f"Train R²: {tr2:.4f}, Test R²: {r2:.4f}")

Using GPU-accelerated KMeans from cuml
Error occured
K-means clustering completed
Training XGBoost model ...
XGBoost model training completed
Model Evaluation:
RMSE: 13.2522028405
R² Score: 0.9354277849


Train RMSE: 0.2970, Test RMSE: 13.2522
Train R²: 0.9993, Test R²: 0.9354
