In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available datasets are visible in the output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Milestone 1 (1)**: Data has been loaded

In [None]:
#STEPS for the project
#1. EDA + Preprocessing
#2. Extract input and output columns (X&y)
#3. Train test split
#4. Pipeline
#5. Train the model + Hyperparameter Tuning
#6. Evaluate the model/model selection

In [None]:
# Load the competition training set into the working DataFrame `trdf`.
trdf = pd.read_csv("/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv")

# EDA

**Milestone 1 (2)**:Shape of train.csv => (175000, 17)

In [None]:
trdf.shape  # (number of rows, number of columns) of the training DataFrame

**Milestone 1 (3)**: numerical as well as categorical columns are present in the dataset.

In [None]:
trdf.head()  # first 5 rows

In [None]:
trdf.sample(5)  # 5 randomly drawn rows (no seed is passed, so the rows differ between runs)

In [None]:
trdf.info()  # per-column dtype and non-null count of the training DataFrame

**Milestone 1 (5)**: Null values are present.

In [None]:
trdf.isnull().sum()  # number of null values in each column

In [None]:
trdf.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the training DataFrame

The minimum trip distance is 0, and some of the amount columns contain negative values.

The mean trip distance is about 5, while the maximum is more than 1.35 lakh (135,000)!

Total amount max is 587, min is a negative value which could be due to offer/credit

In [None]:
trdf.duplicated().sum()  # number of fully duplicated rows; the author observed none

**Univariate Analysis**

In [None]:
import seaborn as sns
# NOTE: this binds the top-level matplotlib package (not matplotlib.pyplot)
# to the name `plt`, which is why later cells call plt.pyplot.<function>.
import matplotlib as plt
# Univariate analysis: share of trips per VendorID as a pie chart, with the
# percentage of each slice printed to two decimals.
trdf['VendorID'].value_counts().plot(kind='pie',autopct='%.2f') 

In [None]:
# Bar chart of the number of trips for each passenger_count value.
trdf['passenger_count'].value_counts().plot(kind='bar') 
# Author's observation: most passengers are travelling alone.

In [None]:
# Bar chart of the number of trips for each payment_type value.
trdf['payment_type'].value_counts().plot(kind='bar')
# Author's observation: credit card is the most preferred payment type.

In [None]:
# Kernel-density estimate of the target variable, total_amount.
trdf['total_amount'].plot.density(color='blue')
# Restrict the x axis to a symmetric window around zero.
plt.pyplot.xlim(-600, 600)
plt.pyplot.show()

Normal Distribution for total_amount

In [None]:
# Horizontal boxplot of trip_distance as loaded (this cell precedes the cleaning step).
sns.boxplot(trdf['trip_distance'], orient='horizontal')
plt.pyplot.title('A boxplot depicting trip distance distribution')

In [None]:
# Print the median and the mean of trip_distance, separated by a comma.
trip_dist = trdf['trip_distance']
print(trip_dist.median(), ",", trip_dist.mean())

In [None]:
# The 10 largest trip_distance values, to inspect the upper tail for outliers.
trdf['trip_distance'].nlargest(10)

Outliers are present

In [None]:
# The 10 smallest trip_distance values, to inspect the lower tail.
trdf['trip_distance'].nsmallest(10)

trip distance shouldn't be zero!

In [None]:
# Mark implausible trip distances as missing (NaN); the preprocessing
# pipeline's imputer fills missing values later.

# Zero-distance trips. The result is assigned back to the column instead of
# calling replace(..., inplace=True) on trdf['trip_distance']: an in-place
# call on a column selection mutates an intermediate object, which pandas
# warns about and which does not update trdf at all under Copy-on-Write.
trdf['trip_distance'] = trdf['trip_distance'].replace(0, np.nan)

# Extremely long trips. 9673 is the cut-off the author picked (presumably
# from the nlargest(10) output above -- confirm before reusing on new data).
# NOTE: `median` is computed here but no later cell reads it; the values set
# to NaN are filled by the pipeline's imputer, not by this median.
median = trdf.loc[trdf['trip_distance'] < 9673, 'trip_distance'].median()
trdf.loc[trdf.trip_distance >= 9673 , 'trip_distance'] = np.nan

In [None]:
# Re-check the 10 smallest distances after zeros were replaced with NaN.
trdf['trip_distance'].nsmallest(10)

In [None]:
# Same horizontal boxplot, redrawn after zero and extreme distances were set to NaN.
sns.boxplot(trdf['trip_distance'], orient='horizontal')
plt.pyplot.title('A boxplot depicting trip distance distribution')
# Author's observation: the plot now shows more sensible distances.

In [None]:
# Frequency of each RatecodeID value.
trdf['RatecodeID'].value_counts()
# Author's observation: only 6 distinct values.

In [None]:
# Frequency of each congestion_surcharge value.
trdf['congestion_surcharge'].value_counts()
# Author's observation: 3 distinct values, one of them negative.

In [None]:
# Frequency of each improvement_surcharge value.
trdf['improvement_surcharge'].value_counts()

In [None]:
# Frequency of each Airport_fee value.
trdf['Airport_fee'].value_counts()

In [None]:
# Frequency of each tolls_amount value.
trdf['tolls_amount'].value_counts()
# Author's observation: the amount is 0 for most trips.

In [None]:
# Parse the pickup and drop-off timestamp columns into pandas datetime dtype.
import datetime
for dt_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    trdf[dt_col] = pd.to_datetime(trdf[dt_col])

In [None]:
# Display the pickup column to confirm the datetime conversion.
trdf['tpep_pickup_datetime']

In [None]:
# Latest pickup timestamps (nlargest defaults to 5 values).
trdf['tpep_pickup_datetime'].nlargest() 

In [None]:
# Earliest pickup timestamps (nsmallest defaults to 5 values).
trdf['tpep_pickup_datetime'].nsmallest()
# Author's observation: pickups range from 28 June (3:30 PM) to 1 July 2023 (1 AM).

In [None]:
# Earliest drop-off timestamps (nsmallest defaults to 5 values).
trdf['tpep_dropoff_datetime'].nsmallest()

In [None]:
trdf.select_dtypes(exclude="number")  # view of the non-numeric columns only
# Author's observation: some drop-off times occur before the pickup time!

In [None]:
# Names of the non-numeric columns.
trdf.select_dtypes(exclude="number").columns

In [None]:
# Drop the pickup and drop-off timestamp columns; they are not used as features.
# errors='ignore' makes the cell safe to re-run: without it, a second execution
# raises KeyError because the columns are already gone from trdf.
trdf = trdf.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], errors='ignore')

**Checking Correlation b/w numerical columns**

In [None]:
# Correlation of every numeric column with the target.
# Ctrdf is trdf without the two non-numeric columns; drop() already returns a
# new DataFrame, so trdf itself is left untouched.
Ctrdf = trdf.drop(columns=['store_and_fwd_flag', 'payment_type'])
Ctrdf.corr()['total_amount']
# Author's reading of the output:
# - high positive correlation with trip_distance, airport fee, tip amount and tolls amount
# - also significant positive correlation with improvement surcharge and extra
# - significant negative correlation with congestion surcharge,
#   and also with passenger count and VendorID
# - about 10 significant features in total

total_amount has a high positive correlation with trip_distance.

# Bivariate Analysis

In [None]:
# Scatter plot of total_amount against trip_distance.
sns.scatterplot(x=trdf['trip_distance'],y=trdf['total_amount'])
# Author's observation: total amount seems to increase with distance for most points.

In [None]:
# Scatter plot of total_amount against tip_amount.
sns.scatterplot(x=trdf['tip_amount'],y=trdf['total_amount'])

In [None]:
# Scatter plot of total_amount against tolls_amount.
sns.scatterplot(x=trdf['tolls_amount'],y=trdf['total_amount'])
# Author's observation: total amount visibly increases with tolls.

In [None]:
# Bar plot of total_amount for each congestion_surcharge value.
sns.barplot(x=trdf['congestion_surcharge'],y=trdf['total_amount'])

When the congestion surcharge is negative, the total amount is negative as well.

In [None]:
# Scatter plot of total_amount against the pickup location ID.
sns.scatterplot(x=trdf['PULocationID'],y=trdf['total_amount'])

In [None]:
# Scatter plot of total_amount against the drop-off location ID.
sns.scatterplot(x=trdf['DOLocationID'],y=trdf['total_amount'])

In [None]:
# Scatter plot of total_amount against store_and_fwd_flag.
sns.scatterplot(x=trdf['store_and_fwd_flag'],y=trdf['total_amount'])

In [None]:
# Heatmap of the pairwise correlation matrix of Ctrdf (trdf without the two non-numeric columns).
sns.heatmap(Ctrdf.corr())

No Multicollinearity.

# Feature Matrix and Target

In [None]:
# Feature matrix: every remaining column of trdf except the target.
X = trdf.drop(columns=['total_amount'])
X.info()  # dtype and non-null count of each feature column

In [None]:
# Target vector: the total_amount column.
y = trdf['total_amount']

# Train Test Split

In [None]:
# Hold out 10% of the rows as a validation set, with a fixed seed so the split
# is reproducible. Author's note: test_size = 0.1 worked better than 0.2.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state = 42 )

In [None]:
X_train.info()  # dtype and non-null count of each column in the training split

# **Preprocessing Pipeline**

**missing value (Simple imputer)--->ordinal encoding(non numerical cols)--->scaling--->feature selection**

In [None]:
# Imputer: replaces NaN with the most frequent value of the column.
# NOTE(review): the pipeline applies this to every column, so continuous
# columns such as trip_distance are filled with their mode as well.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
#Encoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
# The encoder runs after the imputer step of the pipeline, so the columns to
# encode are addressed by position rather than by name. Positions 4 and 7 are
# presumably store_and_fwd_flag and payment_type (per the step name) --
# TODO confirm against the column order shown by X.info().
# handle_unknown/unknown_value: a category that only appears in the validation
# or test data is encoded as -1 instead of raising ValueError in transform().
# All other columns are passed through unchanged.
oe = ColumnTransformer(
    [('oe_flag_payment',
      OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
      [4,7])],
    remainder='passthrough')


In [None]:
# Scaler: rescales each feature linearly to the range [0, 13].
# NOTE(review): the upper bound 13 is the author's choice; its rationale is not recorded here.
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler(feature_range=(0,13))

In [None]:
#Feature Selection
from functools import partial
from sklearn.feature_selection import SelectKBest, mutual_info_regression
# Keep the 10 features with the highest mutual information with the target.
# mutual_info_regression is stochastic (it adds small random noise to
# continuous features), so without a seed the scores -- and potentially the
# selected features -- can change from run to run. Binding random_state makes
# the selection reproducible, matching the random_state=42 used elsewhere.
fselect = SelectKBest(score_func=partial(mutual_info_regression, random_state=42), k=10)

In [None]:
# Preprocessing pipeline: impute -> ordinal-encode -> scale -> select features.
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[('imp', imp), ('oe', oe), ('scale', scale), ('fselect', fselect)])
# Fit every step on the training split only; y_train is needed by the
# supervised feature-selection step.
pipe.fit(X_train, y_train)

In [None]:
# Replace X_train with its transformed version (output of the fitted pipeline).
# NOTE: this rebinds X_train, so the cell must be run exactly once per kernel
# session -- running it again would feed already-transformed data to the pipeline.
X_train = pipe.transform(X_train) #Transforming X_train

# Model Training and Hyperparameter tuning

# Decision Tree

In [None]:
# Disabled experiment: the grid search for a DecisionTreeRegressor is kept
# inside a string literal so it is not executed. The scores recorded for "DT"
# in the markdown below came from an earlier run of this code.
"""
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

dtree = DecisionTreeRegressor(random_state=42)

params= {'min_samples_leaf':[2,4,6,8,10],
         'max_depth':[3,4,5,6]}
gscv_dt = GridSearchCV(dtree,
                    params,
                    cv=5)
gscv_dt.fit(X_train,y_train)
"""

# Random Forest

In [None]:
# Disabled experiment: the grid search for a RandomForestRegressor is kept
# inside a string literal so it is not executed. The scores recorded for "RF"
# in the markdown below came from an earlier run of this code.
"""
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
rforest= RandomForestRegressor(random_state=42)
#n_estimators =
params = {"n_estimators":range(7,15)}
gscv_rf=GridSearchCV(rforest,
                  params)
gscv_rf.fit(X_train,y_train)
"""

# XGBoost

In [None]:

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor 

# Base estimator with a fixed seed for reproducible boosting.
xgb_reg = XGBRegressor(random_state=42)

# Hyperparameter grid: 3 ensemble sizes x 4 tree depths = 12 candidates.
params = {
    'n_estimators': [10, 20, 50],
    'max_depth': [3, 5, 7, 10],
}

# Exhaustive search with 5-fold cross-validation on the transformed training data.
gscv_xgb = GridSearchCV(estimator=xgb_reg, param_grid=params, cv=5)
gscv_xgb.fit(X_train, y_train)


In [None]:
# R2 score on the training set. Only the XGBoost search is active; the
# commented lines are the equivalent calls for the disabled DT / RF searches.
#gscv_dt.score(X_train,y_train) #R2 Score for training set
#gscv_rf.score(X_train,y_train)
gscv_xgb.score(X_train,y_train)

DT:0.9088827936067406

RF:0.9884626602418145

XGB: 0.9690063831403725

In [None]:
# Apply the pipeline fitted on the training split to the validation split.
# NOTE: this rebinds X_val, so the cell must be run exactly once per kernel session.
X_val = pipe.transform(X_val) # Transforming the X_val

In [None]:
# R2 score on the validation set. Only the XGBoost search is active; the
# commented lines are the equivalent calls for the disabled DT / RF searches.
#gscv_dt.score(X_val,y_val) #Validation set R2 score
#gscv_rf.score(X_val,y_val)
gscv_xgb.score(X_val,y_val)

DT: 0.921430051545187

RF: 0.9518799012690147

XGB:0.9607999361966141

In [None]:
#gscv_xgb.best_score_

In [None]:
# Estimator refitted with the best hyperparameters found by the grid search.
#gscv_dt.best_estimator_
#gscv_rf.best_estimator_
gscv_xgb.best_estimator_

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
#gscv_dt.best_params_
#gscv_rf.best_params_
gscv_xgb.best_params_

# Model Evaluation

In [None]:
# Predictions on the (already transformed) validation set from the best XGBoost model.
#y_val_pred = gscv_dt.predict(X_val) 
#y_val_pred = gscv_rf.predict(X_val)
y_val_pred = gscv_xgb.predict(X_val)

In [None]:
# Error metrics of the active model on the validation set.
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

val_mse = mean_squared_error(y_val, y_val_pred)  # reused for both MSE and RMSE
print("MAE:", mean_absolute_error(y_val, y_val_pred))
print("MSE", val_mse)
print("RMSE", np.sqrt(val_mse))  # root mean squared error
print("R2:", r2_score(y_val, y_val_pred))
# Author's conclusions:
# - XGB gives the best results: all errors are lowest and the R2 score is highest
# - increasing n_estimators does not necessarily improve performance
# - the same goes for max_depth

**DT**: MAE: 4.239880507435238          

MSE: 52.326559732921524

RMSE: 7.233709956372423

R2:0.921430051545187

**RF**: MAE: 2.992737238095239

MSE 32.04735741998237

RMSE 5.661038546060464

R2: 0.9518799012690147

**XGB**: MAE: 2.6551659122873037

MSE 26.106730632792686

RMSE 5.109474594593135

R2: 0.9607999361966141

In [None]:
# Error metrics of the XGBoost model on the training set, for comparison with
# the validation metrics (a large gap would indicate over-fitting).
# The prediction and the MSE are computed once and reused instead of being
# recomputed for every printed metric.
y_train_pred = gscv_xgb.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
print("MAE:",mean_absolute_error(y_train,y_train_pred))
print("MSE",train_mse)
print("RMSE",np.sqrt(train_mse)) #Root mean squared error
# R2 on the training set; the previously commented-out line referenced the
# validation target and an undefined name.
print("R2:",r2_score(y_train,y_train_pred))

In [None]:
# Compare the validation errors of the three models. The values are copied
# from the recorded results of earlier runs (order: MAE, RMSE, MSE).
# Passing the metric names as x values labels the ticks, so each point can be
# read without relying on the order given in the title.
metric_names = ['MAE', 'RMSE', 'MSE']
plt.pyplot.plot(metric_names, [4.239880507435238,7.233709956372423,52.326559732921524,],marker='d')
plt.pyplot.plot(metric_names, [2.992737238095239,5.661038546060464,32.04735741998237],marker='x')
plt.pyplot.plot(metric_names, [2.6551659122873037,5.109474594593135,26.106730632792686],marker='+')
plt.pyplot.legend(['DT','RF','XGB']) 
plt.pyplot.ylabel("Error on validation set")
plt.pyplot.title("Metrics:MAE, RMSE, MSE")

The **lower** the **better**

In [None]:
# Validation R2 of each model, copied from the recorded results of earlier runs.
model_names = ['DT', 'RF', 'XGB']
val_r2_scores = [0.921430051545187, 0.9518799012690147, 0.9607999361966141]
plt.pyplot.bar(model_names, val_r2_scores)

plt.pyplot.title("R2 Score")

The **higher** the **better**

# Working on Test Data

In [None]:
# Load the competition test set (features only) into `tedata`.
tedata = pd.read_csv("/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv")

Null Values are present

In [None]:
tedata.isnull().sum()  # number of null values in each test column

In [None]:
tedata[:5]  # first 5 rows of the test set
# Author's observation: again, some pickups occur after the drop-off!

In [None]:
# Drop the timestamp columns so the test set has the same columns the pipeline
# was fitted on. errors='ignore' makes the cell safe to re-run: without it, a
# second execution raises KeyError because the columns are already gone.
tedata = tedata.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], errors='ignore')

In [None]:
# The 5 largest trip_distance values in the test set.
tedata['trip_distance'].nlargest()

1 Value exceeds 17k

In [None]:
# The 5 smallest trip_distance values in the test set.
tedata['trip_distance'].nsmallest()

0 unit trip distances are present

**A bit of Cleaning on Test data**

In [None]:
# Mark zero-distance trips in the test set as missing so the pipeline's imputer
# fills them, mirroring the cleaning applied to the training set.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on tedata['trip_distance']: an in-place call on a
# column selection mutates an intermediate object, which pandas warns about and
# which does not update tedata at all under Copy-on-Write.
tedata['trip_distance'] = tedata['trip_distance'].replace(0, np.nan)

In [None]:
# Mark extremely long test-set trips as missing (NaN).
# NOTE(review): the cut-off here (17624) differs from the one used for the
# training set (9673); `median` is computed but no later cell reads it.
median = tedata.loc[tedata['trip_distance'] < 17624 , 'trip_distance'].median()
tedata.loc[tedata.trip_distance >= 17624, 'trip_distance'] = np.nan

In [None]:
# Re-check both tails of trip_distance after cleaning (5 smallest, 5 largest).
tedata['trip_distance'].nsmallest(), tedata['trip_distance'].nlargest()

In [None]:
# Apply the pipeline fitted on the training split to the test set.
# NOTE: this rebinds tedata to the transformed output, so the cell must be run
# exactly once per kernel session.
tedata = pipe.transform(tedata) #Transforming the testdata through pipeline 

# Output Array

In [None]:
# Predicted total_amount for every test row, from the best XGBoost model.
# The commented lines are the equivalent calls for the disabled DT / RF searches.
#output = gscv_dt.predict(tedata) 
#output = gscv_rf.predict(tedata)
output = gscv_xgb.predict(tedata)

In [None]:
# Display the prediction array.
output

In [None]:
output.shape  # number of predictions; should equal the number of test rows

# Final Submission File

In [None]:
# Build the submission table: a 1-based running ID next to each prediction,
# then write it to submission.csv without the DataFrame index.
submission = pd.DataFrame({
    "ID": range(1, len(output) + 1),
    "total_amount": output,
})
submission.to_csv('submission.csv', index=False)


In [None]:
# Kernel-density estimate of the predicted total_amount values, for a visual
# comparison with the density of the training target.
submission['total_amount'].plot.density(color='blue')
#plt.pyplot.xlim(-600,600)
plt.pyplot.show()