## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

## Importing the dataset over here

In [2]:
# Load the orders table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="orders.csv")

In [4]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,date,vendor_id,chain_id,city_id,spec,successful_orders,fail_orders
0,2019-07-02,40065,7501,23,Суши,54.0,1.0
1,2019-08-27,48058,33226,23,Шаурма,0.0,0.0
2,2019-09-25,35430,26220,25,Пицца,2.0,0.0
3,2019-09-21,56553,38601,23,Шашлыки,1.0,0.0
4,2019-09-21,43919,30984,25,Суши,6.0,0.0


## Checking for duplicate observations

In [5]:
# Count rows that are exact repeats of an earlier row.
data.duplicated(keep="first").sum()

0

## Taking care of missing values if present over here

In [6]:
# Number of missing values per column.
data.isna().sum()

date                   0
vendor_id              0
chain_id               0
city_id                0
spec                 385
successful_orders      0
fail_orders            0
dtype: int64

In [7]:
# Names of the columns that contain at least one missing value.
# The previous "> 1" threshold silently skipped a column with exactly one
# NaN; `.any()` reports every column that has any missing entry.
missing_values = [feature for feature in data.columns if data[feature].isnull().any()]
for feature in missing_values:
    print(feature)

spec


In [8]:
# Show only the columns that contain missing values.
data.loc[:, missing_values]

Unnamed: 0,spec
0,Суши
1,Шаурма
2,Пицца
3,Шашлыки
4,Суши
...,...
96113,Суши
96114,Пицца
96115,Вьетнамская
96116,Суши


In [9]:
# Remove the "spec" column, the only column with missing values.
# Rebinding `data` to the returned frame (instead of inplace=True) together
# with errors="ignore" makes this cell safe to re-run: a second execution is
# a no-op rather than a KeyError on the already-removed column.
data = data.drop(columns="spec", errors="ignore")

In [10]:
# Re-check the per-column missing-value counts.
data.isna().sum()

date                 0
vendor_id            0
chain_id             0
city_id              0
successful_orders    0
fail_orders          0
dtype: int64

## Filtering all the numerical features

In [11]:
# Every column whose dtype is not the generic object dtype is treated as numeric.
numerical_features = [column for column, dtype in data.dtypes.items() if dtype != "O"]
for column in numerical_features:
    print(column)

vendor_id
chain_id
city_id
successful_orders
fail_orders


In [12]:
# Show the numeric columns only.
data.loc[:, numerical_features]

Unnamed: 0,vendor_id,chain_id,city_id,successful_orders,fail_orders
0,40065,7501,23,54.0,1.0
1,48058,33226,23,0.0,0.0
2,35430,26220,25,2.0,0.0
3,56553,38601,23,1.0,0.0
4,43919,30984,25,6.0,0.0
...,...,...,...,...,...
96113,44315,31154,24,2.0,0.0
96114,17528,11182,25,11.0,1.0
96115,45351,31679,23,1.0,0.0
96116,64209,42525,25,7.0,0.0


## Filtering all the categorical features over here

In [13]:
# Every column stored with the generic object dtype is treated as categorical.
cat_features = [column for column, dtype in data.dtypes.items() if dtype == "O"]
for column in cat_features:
    print(column)

date


In [14]:
# Show the categorical columns only.
data.loc[:, cat_features]

Unnamed: 0,date
0,2019-07-02
1,2019-08-27
2,2019-09-25
3,2019-09-21
4,2019-09-21
...,...
96113,2019-08-18
96114,2019-06-15
96115,2019-08-19
96116,2019-09-26


## Encoding the categorical features into numerical features over here

In [15]:
# Replace each categorical value with an integer code.
# NOTE(review): codes are assigned in order of first appearance in the frame,
# so for "date" the integers do not follow chronological order.
for column in cat_features:
    categories = data[column].unique()
    codes = dict(zip(categories, range(len(categories))))
    data[column] = data[column].map(codes)

In [16]:
data

Unnamed: 0,date,vendor_id,chain_id,city_id,successful_orders,fail_orders
0,0,40065,7501,23,54.0,1.0
1,1,48058,33226,23,0.0,0.0
2,2,35430,26220,25,2.0,0.0
3,3,56553,38601,23,1.0,0.0
4,3,43919,30984,25,6.0,0.0
...,...,...,...,...,...,...
96113,61,44315,31154,24,2.0,0.0
96114,9,17528,11182,25,11.0,1.0
96115,70,45351,31679,23,1.0,0.0
96116,65,64209,42525,25,7.0,0.0


## Creating the features and labels over here

In [17]:
# Move the target to the last position under the name "Successful_Orders":
# pop() removes the original column and the assignment appends it at the end.
data['Successful_Orders'] = data.pop("successful_orders")

In [18]:
# Features are every column except the last; the label is the last column.
feature_frame = data.iloc[:, :-1]
label_series = data.iloc[:, -1]
X = feature_frame.to_numpy()
y = label_series.to_numpy()

## Splitting the dataset into training and testing sets so generalization can be measured on unseen data

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Training and comparing several regression models on the training dataset

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
# r2_score is needed by the scoring loop below; importing it here (rather than
# only in a later cell) lets this cell run on a fresh kernel without NameError.
from sklearn.metrics import r2_score


# Candidate regressors, keyed by the display name used in the report below.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=0),
    'Random Forest': RandomForestRegressor(n_estimators=10, random_state=0),
    'Gradient Boosting': GradientBoostingRegressor(random_state=0)
}

# Fit every candidate on the training split and score it on the test split.
# (The former `if name == 'SVR'` branch was unreachable and identical to the
# else-branch, so the loop body is now a single code path.)
r2_scores = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_scores[name] = r2_score(y_test, y_pred)

# Pick the candidate with the highest R² on the test split.
best_model_name = max(r2_scores, key=r2_scores.get)
best_r2_score = r2_scores[best_model_name]

# Print the R² scores and the best model
print("R² scores for each model:")
for name, score in r2_scores.items():
    print(f"{name}: {score:.4f}")

print(f"\nBest model: {best_model_name} with R² score of {best_r2_score:.4f}")

# `regressor` is the model evaluated in the following cells. The random forest
# fitted above has the same hyper-parameters, seed and training data as the
# one that used to be retrained here, so the fitted instance is reused.
regressor = models['Random Forest']

R² scores for each model:
Linear Regression: 0.1586
Decision Tree: 0.7223
Random Forest: 0.8069
Gradient Boosting: 0.6451

Best model: Random Forest with R² score of 0.8069


## Evaluating the performance of the model on the testing dataset over here

In [23]:
# Predict on the held-out split with the fitted `regressor`.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
# Two columns: predictions on the left, true values on the right.
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

[[ 0.3  2. ]
 [ 3.4  3. ]
 [ 1.6  1. ]
 ...
 [ 2.2  2. ]
 [14.5 17. ]
 [36.7 27. ]]


In [24]:
from sklearn.metrics import r2_score

# R² of the predictions against the true test labels.
r2_score(y_true=y_test, y_pred=y_pred)

0.8069146896982543