In [154]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [155]:
# load your dataset into a pandas DataFrame
df = pd.read_excel('C:\\Users\\deepa\\Downloads\\t.xlsx')
# define the columns containing string values
string_cols = ['Airline', 'Date_of_Journey', 'Source','Destination','Route','Dep_Time','Arrival_Time','Duration','Total_Stops','Additional_Info']

# create a LabelEncoder object for each string column
label_encoders = {}
for col in string_cols:
    le = LabelEncoder()
    le.fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le

# save the label encoders for future use
import pickle
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [156]:
# separate the price column from the rest of the data
X = df.drop('Price', axis=1)
y = df['Price']

In [157]:
X

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,3,24,0,5,18,211,233,240,4,8
1,1,6,3,0,84,31,906,336,1,8
2,4,43,2,1,118,70,413,106,1,8
3,3,10,3,0,91,164,1324,311,0,8
4,3,0,0,5,29,149,1237,303,0,8
...,...,...,...,...,...,...,...,...,...,...
10678,0,41,3,0,64,183,1259,236,4,8
10679,1,29,3,0,64,193,1305,237,4,8
10680,4,29,0,2,18,58,824,280,4,8
10681,10,0,0,5,18,92,938,238,4,8


In [158]:
y

0         3897
1         7662
2        13882
3         6218
4        13302
         ...  
10678     4107
10679     4145
10680     7229
10681    12648
10682    11753
Name: Price, Length: 10683, dtype: int64

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [160]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [161]:
rf.fit(X_train,y_train)

RandomForestRegressor(random_state=42)

In [162]:
#Scaling to perform logistic regression
# create a StandardScaler object 
scaler = StandardScaler()

# fit the scaler to the data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# transform the test data using the scaler
X_test_scaled = scaler.transform(X_test)

In [163]:
lr = LogisticRegression(random_state=42, max_iter=5000,C = 1.0, penalty = 'l2',solver = 'liblinear')

In [164]:
lr.fit(X_train_scaled,y_train)

LogisticRegression(max_iter=5000, random_state=42, solver='liblinear')

In [165]:
y_pred_lr=lr.predict(X_test_scaled)

In [166]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print('Mean squared error:', mse_lr)
print("RMSE:", rmse_lr)
print("MAE:", mae_lr)
print("R2 score:", r2_lr)

Mean squared error: 15938275.350491343
RMSE: 3992.276963149143
MAE: 2413.87084698175
R2 score: 0.246914528753283


In [167]:
svm = SVC(kernel='rbf', random_state=42, C = 1.0, gamma = 'scale', probability = True)

In [168]:
svm.fit(X_train,y_train)

SVC(probability=True, random_state=42)

In [169]:
y_pred_rf = rf.predict(X_test)

In [170]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print('Mean squared error:', mse_rf)
print("RMSE:", rmse_rf)
print("MAE:", mae_rf)
print("R2 score:", r2_rf)

Mean squared error: 2415641.287413655
RMSE: 1554.2333439395948
MAE: 786.2844852578529
R2 score: 0.8858606519657815


In [171]:
y_pred_svm = svm.predict(X_test)

In [172]:
mse_svm = mean_squared_error(y_test, y_pred_svm)
rmse_svm = np.sqrt(mse_svm)
mae_svm = mean_absolute_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)
print('Mean squared error:', mse_svm)
print("RMSE:", rmse_svm)
print("MAE:", mae_svm)
print("R2 score:", r2_svm)

Mean squared error: 21850889.50163781
RMSE: 4674.49350214949
MAE: 3470.441272812354
R2 score: -0.032457217335849275


An R2 score of -0.032 indicates that your model is not performing well and is worse than a model that always predicts the mean value of the target variable. A negative R2 score means that the model's predictions are even worse than the mean value of the target variable.

You may need to investigate the data, the features, and the model to see what might be causing this poor performance and make necessary adjustments.

In [173]:
# Define the hyperparameters and their possible values
param_grid = {
    'n_estimators': [10, 50, 100, 500],
    'max_depth': [2, 4, 8, None],
    'max_features': ['auto', 'sqrt']
}

# Create a Random Forest Classifier object
rfr = RandomForestRegressor()


# Use Grid Search to find the best hyperparameters
grid_search = GridSearchCV(rfr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)


# print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# evaluate the best model on the test data
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test score:", test_score)

Best parameters: {'max_depth': None, 'max_features': 'auto', 'n_estimators': 50}
Best score: 0.8528499207292464
Test score: 0.8893080059262624


In [174]:
#Let's stick with these parameters for our training model and import our test dataset now.
df2 = pd.read_excel('C:\\Users\\deepa\\Downloads\\t.xlsx')
string_cols = ['Airline', 'Date_of_Journey', 'Source','Destination','Route','Dep_Time','Arrival_Time','Duration','Total_Stops','Additional_Info']

# create a LabelEncoder object for each string column
label_encoders = {}
for col in string_cols:
    le = LabelEncoder()
    le.fit(df2[col])
    df2[col] = le.transform(df2[col])
    label_encoders[col] = le

# save the label encoders for future use
import pickle
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [175]:
X_pred=df2

In [176]:
rf = RandomForestRegressor(n_estimators=500, random_state=42)

In [177]:
rf.fit(X,y)

RandomForestRegressor(n_estimators=500, random_state=42)

In [178]:
y_pred_rf=rf.predict(X_pred)

ValueError: X has 11 features, but RandomForestRegressor is expecting 10 features as input.

In [None]:
y_pred_rf

In [None]:
lr.fit(X,y)

In [None]:
y_pred_lr=lr.predict(X_pred)

In [None]:
y_pred_lr

In [None]:
svm.fit(X,y)

In [None]:
y_pred_svm=svm.predict(X_pred)

In [None]:
y_pred_svm