### Importing Libraries

In [None]:
pip install seaborn scikit-learn tensorflow

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [None]:
df = pd.read_csv("/kaggle/input/cost-prediction-for-logistic-company-2023w-aml1413/train.csv")

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
def find_unique(df, col):
    """Return the distinct values of column ``col`` in ``df``.

    The values are returned as produced by ``Series.unique`` (order of
    first appearance, missing values included).
    """
    column = df[col]
    return column.unique()

In [None]:
find_unique(df, "packageType")

In [None]:
find_unique(df, "type")

In [None]:
find_unique(df, "exWeatherTag")

In [None]:
def fill_nans(df, col, value):
    """Return column ``col`` of ``df`` with missing entries replaced by ``value``.

    The result is a new Series; the caller is expected to assign it back
    to the frame.
    """
    column = df[col]
    return column.fillna(value)

In [None]:
# Missing weather tags are replaced with the placeholder label "normal".
df["exWeatherTag"] = df["exWeatherTag"].fillna("normal")

In [None]:
find_unique(df, "exWeatherTag")

In [None]:
# Missing `type` entries are replaced with the placeholder label "normal".
df["type"] = df["type"].fillna("normal")

In [None]:
# Missing `packageType` entries are replaced with the placeholder label "normal".
df["packageType"] = df["packageType"].fillna("normal")

In [None]:
df['cost'].describe()

In [None]:
sns.histplot(df['cost'])

In [None]:
# Box plot of the target along the x-axis. The series is passed as `x=`
# explicitly: a bare positional argument is interpreted as `x` by older
# seaborn releases but as `data` by newer ones, which can flip the
# orientation and leave the "Cost" label on the wrong axis.
sns.boxplot(x=df['cost'])
plt.title("Boxplot of Cost")
plt.xlabel("Cost")

In [None]:
df

In [None]:
# q1=df['cost'].quantile(0.25)
# q3=df['cost'].quantile(0.75)
# IQR=q3-q1
# df_cost = df[~((df['cost']<(q1-1.5*IQR)) | (df['cost']>(q3+1.5*IQR)))]

In [None]:
# sns.histplot(df['cost'])

In [None]:
# df_cost['cost'].describe()

In [None]:
# df=df_cost.copy()

In [None]:
# First "-"-separated field of `date`; the value stays a string.
df['year'] = [stamp.split("-")[0] for stamp in df['date']]

In [None]:
# Second "-"-separated field of `date`; the value stays a string.
df['month'] = [stamp.split("-")[1] for stamp in df['date']]

In [None]:
# Third "-"-separated field of `date`; the value stays a string.
df['day'] = [stamp.split("-")[2] for stamp in df['date']]

In [None]:
# `date` has been split into year / month / day, so the raw column is removed.
df.drop(columns=['date'], inplace=True)

In [None]:
# Columns that are label-encoded later on (note that `year` is treated as a
# category, not as a number).
categorical_cols = [
    'dayPart',
    'exWeatherTag',
    'originLocation',
    'destinationLocation',
    'carrier',
    'type',
    'packageType',
    'year',
]

In [None]:
df['dayPart'].unique()

In [None]:
df['originLocation'].unique()

In [None]:
df['destinationLocation'].unique()

In [None]:
df['carrier'].unique()

In [None]:
df[categorical_cols]

#### Using Label Encoder for converting categorical data into numbers for train data

In [None]:
# Snapshot of the frame taken before the categorical columns are overwritten,
# so the raw labels remain available afterwards.
df_enocder = df.copy()

encoder = LabelEncoder()

# Overwrite each categorical column with integer codes. The single encoder is
# re-fitted for every column, so only the last column's mapping is retained
# on `encoder` once the loop finishes.
for column_name in categorical_cols:
    df[column_name] = encoder.fit_transform(df[column_name])

In [None]:
df

In [None]:
# df_dummy = df.copy()

# df_dummies = pd.DataFrame()

# for i in categorical_cols:
#     df_temp=pd.get_dummies(df[i],drop_first=True,prefix=i)
#     df_dummies=pd.concat([df_dummies, df_temp], axis=1)
# df_dummies
# df.drop(categorical_cols, inplace=True, axis=1)
# df_dummy=pd.concat([df, df_dummy], axis=1)
# df_dummy

In [None]:
# Remove the `trip` column from the training frame before modelling.
df.drop(columns=['trip'], inplace=True)

In [None]:
# Annotated heatmap of the pairwise correlations between the frame's columns.
heat_fig, heat_ax = plt.subplots(figsize=(20, 15))
sns.heatmap(df.corr(), annot=True, linewidths=.5, ax=heat_ax)

In [None]:
col = list(df.columns)

In [None]:
len(col)

In [None]:
# Rescale distance by a factor of 1/100.
# NOTE: this cell is not idempotent - running it again divides by 100 again.
# Any frame passed to a model trained on `df` needs the same rescaling.
df['distance'] = df['distance'].div(100)

### Defining Target and Feature Variables

In [None]:
# The `cost` column is the regression target; every other column is a
# candidate feature.
y = df['cost']
X = df.drop(columns=['cost'])

## Feature Selection
### 1) Variance Threshold

In [None]:
# Variance-based filter with a threshold of 0.1: features whose variance is
# below the threshold are removed.
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)

# Boolean mask aligned with X.columns; True marks a column that is kept
kept_mask = selector.get_support()

# Names of the low-variance columns that did not pass the filter
selected_cols = [name for name, kept in zip(X.columns, kept_mask) if not kept]

# Feature matrix restricted to the columns that passed the filter
X_vt = X.drop(columns=selected_cols)

### 2) Recursive Feature Elimination using Decision Tree

In [None]:

# Recursive feature elimination driven by a decision tree, keeping 8 features.
tree = DecisionTreeRegressor()
rfe = RFE(estimator=tree, n_features_to_select=8)
rfe.fit(X, y)

# Walk the columns together with their support flag and rank. The loop
# variable is deliberately not called `col`, so the notebook-level `col`
# list of column names is not overwritten by this cell.
selected_col = []
for feature, keep, rank in zip(X.columns, rfe.support_, rfe.ranking_):
    if keep:
        selected_col.append(feature)
    print(f"{feature} selected={keep} rank={rank}")

# Feature matrix restricted to the columns RFE retained
X_rfe = X[selected_col]

### Decision Tree Regressor model training using the Variance Threshold features

In [None]:
# Hold out 33% of the variance-threshold feature set for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_vt, y, test_size=0.33)

dec_tree = DecisionTreeRegressor()

# Hyper-parameter grid searched exhaustively below
parameters = dict(
    max_depth=[3, 5, None],
    max_features=['sqrt', 'log2', None],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[2, 3, 10],
)

# error_score="raise" makes a failing fit abort the search instead of being
# scored and skipped.
clf_GS_vt = GridSearchCV(dec_tree, parameters, error_score="raise")
clf_GS_vt.fit(X_train, y_train)

In [None]:
clf_GS_vt.score(X_train,y_train)

In [None]:
clf_GS_vt.score(X_test,y_test)

In [None]:
clf_GS_vt.best_params_

In [None]:
clf_GS_vt.best_score_

In [None]:
X_train.columns

### Decision Tree Regressor model training using the RFE features

In [None]:
# Hold out 33% of the RFE-selected feature set for evaluation.
X1_train, X1_test, y1_train, y1_test = train_test_split(X_rfe, y, test_size=0.33)

dec_tree = DecisionTreeRegressor()

# Hyper-parameter grid searched exhaustively below
parameters = dict(
    max_depth=[3, 5, None],
    max_features=['sqrt', 'log2', None],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[2, 3, 10],
)

# error_score="raise" makes a failing fit abort the search instead of being
# scored and skipped.
clf_GS_rfe = GridSearchCV(dec_tree, parameters, error_score="raise")

clf_GS_rfe.fit(X1_train, y1_train)

In [None]:
clf_GS_rfe.best_score_

In [None]:
clf_GS_rfe.best_params_

In [None]:
clf_GS_rfe.score(X1_train,y1_train)

In [None]:
clf_GS_rfe.score(X1_test,y1_test)

### Test Dataset

In [None]:
df_test=pd.read_csv("/kaggle/input/cost-prediction-for-logistic-company-2023w-aml1413/test.csv")

In [None]:
# Split each test date once on "-" and spread the pieces over year / month /
# day (all kept as strings), then discard the raw column.
date_parts = [stamp.split("-") for stamp in df_test['date']]
df_test['year'] = [parts[0] for parts in date_parts]
df_test['month'] = [parts[1] for parts in date_parts]
df_test['day'] = [parts[2] for parts in date_parts]
df_test.drop(columns=['date'], inplace=True)

#### Using Label Encoder for converting categorical data into numbers for test data

In [None]:
# Encode the test categoricals with the SAME label -> integer mapping that the
# training data received. Re-fitting a LabelEncoder on the test set assigns
# codes from the test set's own sorted labels, which need not line up with
# the training codes, so the encoder is fitted on the raw training labels
# kept in `df_enocder` and only applied to the test column.
#
# NOTE: run this cell once on the raw test frame; re-running it would try to
# map already-encoded integers and mark every row as unseen (-1).

# Columns whose missing values were replaced with "normal" in the training data
nan_filled_cols = ("exWeatherTag", "type", "packageType")

for i in categorical_cols:
    if i not in df_test.columns:
        continue

    # Mirror the training-time NaN handling before encoding
    if i in nan_filled_cols:
        df_test[i] = fill_nans(df_test, i, "normal")

    encoder = LabelEncoder()
    encoder.fit(df_enocder[i])
    code_of = {label: code for code, label in enumerate(encoder.classes_)}

    # Labels never seen during training have no code; mark them with -1
    # instead of failing.
    df_test[i] = df_test[i].map(code_of).fillna(-1).astype(int)

In [None]:
# Select the variance-threshold feature set from the test frame. The copy
# keeps the rescaling below from touching `df_test` itself.
df_test_vt = df_test[X_vt.columns].copy()

# The model was trained on distance / 100, so the test distances must be put
# on the same scale before predicting.
if 'distance' in df_test_vt.columns:
    df_test_vt['distance'] = df_test_vt['distance'] / 100

y_pred_vt = clf_GS_vt.predict(df_test_vt)

In [None]:
# Submission frame: trip identifier next to the predicted cost, written to
# CSV without the index column.
sub_df_vt = pd.DataFrame({'trip': df_test['trip'], 'cost': y_pred_vt})
sub_df_vt.to_csv("submission-vt.csv", index=False)
sub_df_vt

In [None]:
# Select the RFE feature set from the test frame. The copy keeps the
# rescaling below from touching `df_test` itself.
df_test_rfe = df_test[X_rfe.columns].copy()

# The model was trained on distance / 100, so the test distances must be put
# on the same scale before predicting.
if 'distance' in df_test_rfe.columns:
    df_test_rfe['distance'] = df_test_rfe['distance'] / 100

y_pred_rfe = clf_GS_rfe.predict(df_test_rfe)

In [None]:
# Submission frame: trip identifier next to the predicted cost, written to
# CSV without the index column.
sub_df_rfe = pd.DataFrame({'trip': df_test['trip'], 'cost': y_pred_rfe})
sub_df_rfe.to_csv("submission-rfe.csv", index=False)
sub_df_rfe

### Model Training using Random Forest with recursively selected features

In [None]:
rfr = RandomForestRegressor()

# Hold out 33% of the RFE-selected feature set for evaluation.
X2_train, X2_test, y2_train, y2_test = train_test_split(X_rfe, y, test_size=0.33)

# Hyper-parameter grid searched exhaustively below
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, 30],
    max_features=['sqrt', 'log2'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validated grid search over the forest's hyper-parameters
clf_GS_rfr = GridSearchCV(rfr, param_grid, cv=3)

clf_GS_rfr.fit(X2_train, y2_train)

In [None]:
clf_GS_rfr.best_score_

In [None]:
clf_GS_rfr.best_params_

In [None]:
clf_GS_rfr.score(X2_train,y2_train)

In [None]:
clf_GS_rfr.score(X2_test,y2_test)

In [None]:
# Select the RFE feature set from the test frame. The copy keeps the
# rescaling below from touching `df_test` itself.
df_test_rfr = df_test[X_rfe.columns].copy()

# The forest was trained on distance / 100, so the test distances must be put
# on the same scale before predicting.
if 'distance' in df_test_rfr.columns:
    df_test_rfr['distance'] = df_test_rfr['distance'] / 100

y_pred_rfr = clf_GS_rfr.predict(df_test_rfr)

In [None]:
# Submission frame: trip identifier next to the predicted cost, written to
# CSV without the index column.
sub_df_rfr = pd.DataFrame({'trip': df_test['trip'], 'cost': y_pred_rfr})
sub_df_rfr.to_csv("submission-rfr.csv", index=False)
sub_df_rfr

In [None]:
# Vertical box plot of distance. The column is read from `X` (X_rfe is a
# column subset of X) so the cell does not fail with a KeyError when RFE did
# not retain `distance`, and it is passed as `y=` so the orientation matches
# the y-axis label regardless of seaborn version.
plt.figure(figsize=(20,10))
sns.boxplot(y=X['distance'])
plt.title("Boxplot of Distance")
plt.ylabel("Distance")
plt.xlabel("Boxplot")

In [None]:
# Vertical box plot of weight. The column is read from `X` (X_rfe is a column
# subset of X) so the cell does not fail with a KeyError when RFE did not
# retain `weight`, and it is passed as `y=` so the orientation matches the
# y-axis label regardless of seaborn version.
plt.figure(figsize=(20,10))
sns.boxplot(y=X['weight'])
plt.title("Boxplot of Weight")
plt.ylabel("Weight")
plt.xlabel("Boxplot")

In [None]:
# Histogram of weight. The column is read from `X` (X_rfe is a column subset
# of X) so the cell does not fail with a KeyError when RFE did not retain
# `weight`.
plt.figure(figsize=(15,5))
sns.histplot(x=X['weight'])
plt.title("Histogram of Weight")
plt.ylabel("Count")

### ANN

#### Creating an ANN model with 4 hidden layers using the ReLU activation function

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

seq_model = Sequential()

# First dense layer: one unit per input feature; it also declares the input
# width of the network.
n_features = X1_train.shape[1]
seq_model.add(Dense(n_features, kernel_initializer='normal', activation='relu', input_dim=n_features))

# Progressively narrower ReLU layers
for width in (512, 256, 128, 64):
    seq_model.add(Dense(width, kernel_initializer='normal', activation='relu'))

# Single linear unit producing the regression output
seq_model.add(Dense(1, kernel_initializer='normal', activation='linear'))

In [None]:
seq_model.compile(loss = 'mean_squared_error', optimizer='adam', metrics=['mse'])
seq_model.summary()

### Converting data into float and numpy array 

In [None]:
X1_train = np.asarray(X1_train).astype(np.float32)
y1_train = np.asarray(y1_train).astype(np.float32)

### Splitting the train data into training and validation datasets

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Carve a validation set (20%) out of the training split; fixed seed for a
# repeatable split.
X2_train, X2_val, y2_train, y2_val = train_test_split(X1_train, y1_train, random_state=42, test_size=0.2)

# Fit the scaler on the training portion ONLY. Every other split is merely
# transformed with the training min/max; re-fitting on the hold-out data
# would put it on a different scale than the one the network is trained on.
rs = MinMaxScaler()
X2_train_rs = rs.fit_transform(X2_train)
X1_test_rs = rs.transform(np.asarray(X1_test).astype(np.float32))

#### Using Early Stopping and ReduceLROnPlateau 

In [None]:
# Callbacks come from tensorflow.keras, the same package the model's layers
# were imported from.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop after 10 epochs without val_loss improvement and restore the best weights
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)
# Halve the learning rate after 5 stagnant epochs, but never below 1e-4
rp = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.0001)

# The network is trained on min-max scaled inputs, so the validation inputs
# must pass through the same scaler; otherwise val_loss (which drives both
# callbacks) is computed on inputs from a different scale.
X2_val_scaled = rs.transform(X2_val)

history = seq_model.fit(X2_train_rs, y2_train, validation_data=(X2_val_scaled, y2_val), epochs=100, batch_size=32, callbacks=[es, rp])

In [None]:
seq_model.evaluate(X2_train_rs, y2_train)

In [None]:
# Convert the hold-out split (not the training split) to float32 arrays so
# the evaluation that follows really measures unseen data.
X1_test = np.asarray(X1_test).astype(np.float32)
y1_test = np.asarray(y1_test).astype(np.float32)

In [None]:
# The network was trained on min-max scaled inputs, so the evaluation inputs
# go through the same scaler instead of being fed in raw.
seq_model.evaluate(rs.transform(X1_test), y1_test)

In [None]:
# Training loss per epoch
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'])
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(['train'], loc='upper right')
plt.show()

In [None]:
# Validation loss per epoch
val_fig, val_ax = plt.subplots()
val_ax.plot(history.history['val_loss'])
val_ax.set_title('Validation Data Loss')
val_ax.set_xlabel('Epoch')
val_ax.set_ylabel('Loss')
val_ax.legend(['val'], loc='upper right')
plt.show()

In [None]:
df_test_seq=df_test[X_rfe.columns]

In [None]:
# Apply the training-time distance / 100 rescaling to the ANN test features.
# `assign` builds a new frame instead of writing into a slice of `df_test`,
# which avoids pandas' SettingWithCopy warning and leaves `df_test` untouched.
df_test_seq = df_test_seq.assign(distance=df_test_seq['distance'] / 100)

In [None]:
df_test_seq_1 = np.asarray(df_test_seq).astype(np.float32)

In [None]:
# Only transform: re-fitting the scaler on the test set would rescale it with
# its own min/max, which differs from the scale the network was trained on.
df_test_seq_rs = rs.transform(np.asarray(df_test_seq).astype(np.float32))

In [None]:
y_pred_seq = seq_model.predict(df_test_seq_rs)

In [None]:
# Submission frame: trip identifier next to the predicted cost.
# The Keras prediction comes back as an (n, 1) array; it is flattened
# explicitly so the column assignment does not depend on pandas accepting a
# two-dimensional single-column array.
sub_df_seq = pd.DataFrame()
sub_df_seq['trip'] = df_test['trip']
sub_df_seq['cost'] = np.asarray(y_pred_seq).ravel()
sub_df_seq.to_csv("submission-seq.csv", index=False)
sub_df_seq

### Conclusion

The decision tree regressor employing variance threshold test had the lowest RMSE score of 2.54039, which indicates that it had the greatest performance in predicting the target variable among the tested models, according to the test findings.

The ANN sequential model with 4 hidden layers had the highest (worst) RMSE score, 33.65638, so it performed poorly at predicting the target variable.

The decision tree regressor and the random forest regressor trained on the features chosen by recursive feature elimination had RMSE values of 3.16044 and 7.95678, respectively, which fall between these two extremes. The variance-threshold decision tree regressor, however, outperformed both of these models.

The decision tree regressor utilising the variance threshold test is therefore the most effective model among those tested for predicting the target variable.

All of these conclusions are based on the latest submission scores in the Kaggle competition. Because that RMSE is derived from only 35% of the test dataset, these cannot be considered final scores.
