In [None]:
#Read Files And Make New CSV

import pandas as pd
import os

#Order Directory

order_directory = "E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\order_data"
columns = ["order_id", "driver_id", "passenger_id", "start_region_hash", "dest_region_hash", "price", "time"]

# Read every "order_data_*" file into its own frame and concatenate ONCE at the
# end. Concatenating inside the loop re-copies all rows read so far on every
# iteration (quadratic) and, when the accumulator starts out as an empty frame,
# triggers pandas' FutureWarning about empty entries in concat.
# sorted() makes the row order independent of the OS directory listing order.
order_frames = []
for filename in sorted(os.listdir(order_directory)):
    if not filename.startswith("order_data_"):
        continue  # ignore anything in the directory that is not an order dump
    input_file = os.path.join(order_directory, filename)
    # Files are read as tab-separated with explicit column names (no header row assumed).
    order_frames.append(pd.read_csv(input_file, delimiter='\t', names=columns))

count = len(order_frames)
print(f"Loaded {count} order files from {order_directory}")

if order_frames:
    all_data = pd.concat(order_frames, ignore_index=True)
else:
    # No matching files: keep the original behaviour of producing an empty
    # frame that still carries the expected columns.
    all_data = pd.DataFrame(columns=columns)

total_entries = len(all_data)
print(f"The total number of entries in all data files is: {total_entries}")

output_file = os.path.join("E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\", "order.csv")
all_data.to_csv(output_file, index=False)

print(f"Data from all files has been stored to {output_file}")

# Cluster Directory
# Convert the tab-separated region map (read with explicit column names, so no
# header row is assumed) into a comma-separated file with a header row.

map_columns = ["region_hash", "region_id"]
map_directory = "E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\cluster_map"

# Resolve both ends of the conversion up front: source file, then destination.
map_input_file = os.path.join(map_directory, "cluster_map")
map_output_file = os.path.join("E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\", "cluster.csv")

data = pd.read_csv(map_input_file, sep='\t', names=map_columns)
data.to_csv(map_output_file, index=False)

print(f"Data from all files has been stored to {map_output_file}")


In [None]:

import csv

# Function to convert time string to time slot
def time_to_slot(time_str):
    """Map a timestamp string to its 1-based ten-minute slot of the day.

    Args:
        time_str: Whitespace-separated string whose second token is a clock
            time of the form ``H:M:S`` (e.g. ``"2016-01-01 13:37:23"``).
            Seconds are ignored.

    Returns:
        int: ``hour * 6 + minute // 10 + 1``; 00:00 maps to slot 1 and
        23:59 maps to slot 144.

    Raises:
        IndexError: if the string has no second whitespace-separated token.
        ValueError: if the clock token does not have exactly three
            ``:``-separated parts, or hour/minute are not integers.
    """
    clock = time_str.split()[1]
    hh, mm, _ss = clock.split(':')
    # Six ten-minute slots per hour, plus the slot offset within the hour.
    return int(hh) * 6 + int(mm) // 10 + 1

# Load cluster data: region_hash -> region_id (ids are kept as the strings read from the CSV)
cluster_data = {}
# newline='' is what the csv module expects for files handed to its readers/writers.
with open('E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\cluster.csv', 'r', newline='') as cluster_file:
    cluster_reader = csv.DictReader(cluster_file)
    for row in cluster_reader:
        cluster_data[row['region_hash']] = row['region_id']

# Distinct region ids in first-seen order. Iterating cluster_data.values()
# directly would emit the same region several times in the output if more than
# one hash maps to the same id.
region_ids = list(dict.fromkeys(cluster_data.values()))

# Initialize demand and supply dictionaries: one zeroed counter per region and
# per ten-minute slot (1..144, matching the range produced by time_to_slot).
demand = {}
supply = {}
for region_id in region_ids:
    demand[region_id] = {slot: 0 for slot in range(1, 145)}
    supply[region_id] = {slot: 0 for slot in range(1, 145)}

# Process order data
with open('E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\order.csv', 'r', newline='') as order_file:
    order_reader = csv.DictReader(order_file)
    for row in order_reader:
        # Orders whose start region is not in the cluster map are skipped.
        start_region_id = cluster_data.get(row['start_region_hash'])
        if start_region_id:
            time_slot = time_to_slot(row['time'])
            # Every order counts as demand; counts are pooled across all dates
            # because only the time-of-day slot is used as the key.
            demand[start_region_id][time_slot] += 1
            # An order counts as supply only when its driver_id field is non-empty.
            # NOTE(review): assumes unanswered orders have an empty driver_id
            # in order.csv -- TODO confirm against the raw data.
            if row['driver_id']:
                supply[start_region_id][time_slot] += 1

# Calculate demand-supply gap and write to predict.csv
with open('E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\predict.csv', 'w', newline='') as output_file:
    fieldnames = ['Region', 'Time Slot', 'Demand', 'Supply', 'Demand-Supply Gap']
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()

    # One row per (region, slot), including slots with no orders at all.
    for region_id in region_ids:
        for slot in range(1, 145):
            demand_count = demand[region_id][slot]
            supply_count = supply[region_id][slot]
            gap = demand_count - supply_count
            writer.writerow({
                'Region': region_id,
                'Time Slot': slot,
                'Demand': demand_count,
                'Supply': supply_count,
                'Demand-Supply Gap': gap
            })


In [None]:
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
warnings.filterwarnings("ignore", category=UserWarning)

# Load the data
newdata = pd.read_csv("E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\predict.csv")

# Prepare the features and target variable
X = newdata[['Region', 'Time Slot']]
y = newdata['Demand-Supply Gap']

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training R^2 Score:", train_score)
print("Testing R^2 Score:", test_score)

# Score every (region, time slot) combination with ONE predict call.
# The grid is passed as a DataFrame carrying the same column names (in the same
# order) the model was fitted on, so scikit-learn does not emit its
# "X does not have valid feature names" UserWarning, and the per-row overhead
# of thousands of single-sample predict calls is avoided.
region_range = range(1, 67)
time_slot_range = range(1, 145)
grid_points = [(region, time_slot) for region in region_range for time_slot in time_slot_range]
grid = pd.DataFrame(grid_points, columns=['Region', 'Time Slot'])
grid_predictions = model.predict(grid)

# Keep the (region, time_slot, predicted_gap) tuple layout, with plain Python
# ints for the first two entries.
predictions = [
    (region, time_slot, predicted_gap)
    for (region, time_slot), predicted_gap in zip(grid_points, grid_predictions)
]

# "Best" is the combination with the smallest predicted gap; on ties min()
# keeps the first one in grid order.
best_prediction = min(predictions, key=lambda x: x[2])
print("Best Region and Time Slot:", best_prediction[:2])
print("Predicted Demand-Supply Gap:", best_prediction[2])


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings("ignore", category=UserWarning)

# Load the data
newdata = pd.read_csv("E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\predict.csv")

FEATURE_COLUMNS = ['Region', 'Hour', 'DayOfWeek']
SLOTS_PER_HOUR = 6  # time slots are 1-based ten-minute buckets (see time_to_slot)

# NOTE(review): both the reported MAE and the exported predictions are divided
# by this factor; the reason is not visible in this notebook -- TODO confirm.
PREDICTION_SCALE = 10


def add_time_features(frame):
    """Return a copy of ``frame`` with 'Hour' and 'DayOfWeek' derived from 'Time Slot'.

    The same function is used for the training data and for the prediction
    grid so both see identically defined features (previously training used
    ``slot % 24`` while prediction used ``(slot - 1) % 24``).

    Args:
        frame: DataFrame with an integer 'Time Slot' column (1-based).

    Returns:
        DataFrame: ``frame`` plus 'Hour' (0..23 for slots 1..144) and 'DayOfWeek'.
    """
    slot_index = frame['Time Slot'] - 1
    return frame.assign(
        # Slot 1 starts at 00:00 and there are six slots per hour.
        Hour=slot_index // SLOTS_PER_HOUR,
        # NOTE(review): this is 0 for every slot in 1..144, so the feature is
        # constant and carries no information. The input has no date column to
        # derive a real day-of-week from -- TODO decide whether to drop it.
        DayOfWeek=slot_index // (24 * 7),
    )


# Feature Engineering: derive hour and day-of-week features from the Time Slot
newdata = add_time_features(newdata)

# Prepare the features and target variable
X = newdata[FEATURE_COLUMNS]
y = newdata['Demand-Supply Gap']

# Initialize the regressor (Linear Regression)
regressor = LinearRegression()

# Train the model
regressor.fit(X, y)

# Calculate Mean Absolute Error (MAE) on the training data itself (no hold-out split here)
y_pred = regressor.predict(X)
mae = mean_absolute_error(y, y_pred)
print("Mean Absolute Error (MAE):", mae/PREDICTION_SCALE)

# Generate predictions for every region with every time slot in a single
# predict call; the grid carries the fitted feature names, which avoids
# scikit-learn's feature-name UserWarning and thousands of one-row calls.
grid = pd.DataFrame(
    [(region, time_slot) for region in range(1, 67) for time_slot in range(1, 145)],  # Assuming 144 time slots
    columns=['Region', 'Time Slot'],
)
grid = add_time_features(grid)
grid_pred = regressor.predict(grid[FEATURE_COLUMNS])

predictions_df = pd.DataFrame({
    'Region ID': grid['Region'].astype(str),
    'Time slot': '2016-01-23-' + grid['Time Slot'].astype(str),  # Assuming a specific date
    'Prediction value': grid_pred / PREDICTION_SCALE,
})
# Kept as a list of row dicts for any later cell that still expects `predictions`.
predictions = predictions_df.to_dict('records')

print(predictions_df)
predictions_df.to_csv('E:\\PHAASHT\\Semester 6\\AI\\Assignment3\\training_data\\output.csv', index=False)


# Predicted vs. actual gap. The red line plots y_pred against itself, i.e. the
# y = x diagonal on which a perfect prediction would fall -- it is not a fitted
# regression line, so it is labelled accordingly.
plt.scatter(y_pred, y, color='blue', label='Actual')
plt.plot(y_pred, y_pred, color='red', label='Perfect prediction (y = x)')
plt.xlabel('Predicted Demand-Supply Gap')
plt.ylabel('Actual Demand-Supply Gap')
plt.title('Linear Regression Analysis')
plt.legend()
plt.show()

