In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.linear_model import LinearRegression
#from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR,LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive, requesting a remount each time the
# cell runs.
drive.mount("/content/drive", force_remount=True)

# Folder the CSV inputs are read from; file names are joined onto it with
# os.path.join in the loading cell.
Data_Path = "/content/drive/MyDrive/"


Mounted at /content/drive


In [None]:
def _read_drive_csv(file_name):
    """Read the CSV called ``file_name`` from the folder given by ``Data_Path``."""
    return pd.read_csv(os.path.join(Data_Path, file_name))


# Training features, training targets, and the features to predict on.
train_data_csv = _read_drive_csv("train_examples.csv")
train_labels = _read_drive_csv("train_labels.csv")
test_data_csv = _read_drive_csv("test_examples.csv")


In [None]:
# Print the (rows, columns) shape of each loaded table, in load order.
for frame in (train_data_csv, train_labels, test_data_csv):
    print(frame.shape)

(400000, 12)
(400000, 2)
(100000, 12)


In [None]:
# feature_10 and feature_5 are removed from both tables; neither appears in
# the feature lists handed to the preprocessor.
# The frames are reassigned rather than modified in place, and columns that
# are already gone are tolerated, so re-running this cell does not raise
# KeyError.
DROPPED_FEATURES = ["feature_10", "feature_5"]
train_data_csv = train_data_csv.drop(columns=DROPPED_FEATURES, errors="ignore")
test_data_csv = test_data_csv.drop(columns=DROPPED_FEATURES, errors="ignore")

In [None]:
# feature_0 is parsed as "month-day hour:minute:second" text (the format has
# no year part) and turned into datetimes in both tables, so that the hour
# can be read from it afterwards.
TIMESTAMP_FORMAT = '%m-%d %H:%M:%S'
for frame in (train_data_csv, test_data_csv):
    frame["feature_0"] = pd.to_datetime(frame["feature_0"], format=TIMESTAMP_FORMAT)

def is_rush_hour(row):
    """Flag whether a row's ``feature_0`` timestamp falls in a rush-hour window.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["feature_0"]``; that value must expose ``.hour``.

    Returns
    -------
    int
        1 when the hour lies in [7, 10) or [15, 19), otherwise 0.
    """
    hour_of_day = row["feature_0"].hour
    in_morning_window = 7 <= hour_of_day < 10
    in_afternoon_window = 15 <= hour_of_day < 19
    return 1 if (in_morning_window or in_afternoon_window) else 0

# Replace the parsed timestamp with a 0/1 rush-hour flag, using the same hour
# windows as is_rush_hour (hours 7-9 and 15-18 inclusive).
# The whole column is processed at once through the .dt accessor instead of
# calling a Python function once per row, which is slow on tables this large.
for frame in (train_data_csv, test_data_csv):
    hour_of_day = frame["feature_0"].dt.hour
    # Series.between is inclusive on both ends by default.
    in_rush_window = hour_of_day.between(7, 9) | hour_of_day.between(15, 18)
    frame["feature_0"] = in_rush_window.astype(int)

In [None]:
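# NOTE: feature_5 is dropped from both tables in an earlier cell, so the check
# below only works if it is run before that drop.
#unique_train_values = train_data_csv['feature_5'].unique()
#unique_test_values = test_data_csv['feature_5'].unique()

#print(unique_train_values, unique_test_values)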



In [None]:
# Columns that are one-hot encoded; feature_0 is the 0/1 rush-hour flag
# computed in an earlier cell.
categorical_features = ['feature_0', 'feature_1']
# Columns that are passed through StandardScaler.
numerical_features = ['feature_2', 'feature_3', 'feature_4', 'feature_6', 'feature_7', 'feature_8', 'feature_9']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that shows up only in the
        # validation or test rows is encoded as all zeros instead of making
        # transform() raise an error.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

In [None]:
# Model inputs: every training column except the row identifier.
X = train_data_csv.drop(columns=['id'])

# Target values.
# NOTE(review): rows of train_labels are paired with rows of train_data_csv by
# position here, not joined on a key - confirm both files share the same order.
y = train_labels['duration']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# End-to-end estimator: scale/encode the columns with `preprocessor`, then fit
# a linear support-vector regressor on the result.
# random_state is fixed (same seed as the train/validation split) because the
# LinearSVR solver uses pseudo-random numbers; without a seed, repeated fits
# are not guaranteed to give the same model.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('svr', LinearSVR(random_state=42)),
    ]
)


In [None]:
# Fit the preprocessing + regression pipeline on the training portion only.
model.fit(X=X_train, y=y_train)

In [None]:

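# NOTE: disabled hyper-parameter search. `param_grid` is not defined in the
# cells shown here and must be created before these lines are re-enabled.
# GridSearchCV refits the best estimator on the data it was given by default,
# so the final best_model.fit call repeats work already done.
#grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_absolute_error',cv=5)
#grid_search.fit(X_train, y_train)

#best_model = grid_search.best_estimator_
#best_model.fit(X_train,y_train)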

In [None]:
# Score the fitted pipeline on the held-out validation rows.
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)

print(f"Mean Absolute Error on validation set: {mae}")

Mean Absolute Error on validation set: 495.435829461524


In [None]:
# Predict on the test table using the same inputs as training: every column
# except the row identifier.
X_test = test_data_csv.drop(columns=['id'])
test_preds = model.predict(X_test)



In [None]:
# Two-column submission table: the test row ids next to their predicted
# durations.
output = test_data_csv[['id']].assign(duration=test_preds)

print(output)

# Write the table without the index column.
output.to_csv('submission.csv', index=False)

          id     duration
0          0   240.087469
1          1   420.628224
2          2   516.024850
3          3   297.125474
4          4   818.282818
...      ...          ...
99995  99995   161.577270
99996  99996   305.482787
99997  99997   237.548456
99998  99998  1173.277173
99999  99999  1328.166536

[100000 rows x 2 columns]
