In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from ImbalancedLearningRegression import smote
from sklearn.metrics import r2_score, mean_squared_error

In [35]:
import pprint

In [36]:
bikes_test = pd.read_excel('bike_test.xlsx')
bikes_train = pd.read_excel('bike_train.xlsx')

In [37]:
bikes_test.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,12000,2012-05-20,2,1,5,4,0,0,0,1,0.52,0.5,0.68,0.0896
1,12001,2012-05-20,2,1,5,5,0,0,0,1,0.5,0.4848,0.72,0.1045
2,12002,2012-05-20,2,1,5,6,0,0,0,1,0.5,0.4848,0.63,0.1343
3,12003,2012-05-20,2,1,5,7,0,0,0,1,0.52,0.5,0.68,0.194
4,12004,2012-05-20,2,1,5,8,0,0,0,1,0.56,0.5303,0.56,0.1642


In [38]:
bikes_train.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [39]:
#in case i need to work with the dataframe so i don't touch the original
bikes_test_copy = bikes_test.copy()
bikes_train_copy = bikes_train.copy()

In [40]:
bikes_test_copy.shape, bikes_train_copy.shape

((5380, 14), (11999, 17))

In [41]:
bikes_train_copy.columns.unique()

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [42]:
bikes_test_copy.columns.unique()

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed'],
      dtype='object')

In [43]:
bikes_train_copy.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [44]:
bikes_train_copy.info()

#we can see we don't have any null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11999 entries, 0 to 11998
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     11999 non-null  int64         
 1   dteday      11999 non-null  datetime64[ns]
 2   season      11999 non-null  int64         
 3   yr          11999 non-null  int64         
 4   mnth        11999 non-null  int64         
 5   hr          11999 non-null  int64         
 6   holiday     11999 non-null  int64         
 7   weekday     11999 non-null  int64         
 8   workingday  11999 non-null  int64         
 9   weathersit  11999 non-null  int64         
 10  temp        11999 non-null  float64       
 11  atemp       11999 non-null  float64       
 12  hum         11999 non-null  float64       
 13  windspeed   11999 non-null  float64       
 14  casual      11999 non-null  int64         
 15  registered  11999 non-null  int64         
 16  cnt         11999 non-

In [45]:
bikes_train_copy.nunique()
#we can see 4 seasons, 12 months, 7 weekdays
#nothing out of the oridinary

instant       11999
dteday          506
season            4
yr                2
mnth             12
hr               24
holiday           2
weekday           7
workingday        2
weathersit        4
temp             48
atemp            65
hum              88
windspeed        30
casual          287
registered      623
cnt             713
dtype: int64

In [46]:
bikes_train_copy.isna().any()
#like we said before no missing values

instant       False
dteday        False
season        False
yr            False
mnth          False
hr            False
holiday       False
weekday       False
workingday    False
weathersit    False
temp          False
atemp         False
hum           False
windspeed     False
casual        False
registered    False
cnt           False
dtype: bool

In [47]:
#a piece of code to pair up correlation i found
#it transforms every value to abolute value, meaning negative to positive
#and then uses the unstack method to sort of transpose rows and columns
#it just works
corr = bikes_train_copy.corr()
corr_unstack = corr.abs().unstack()
corr_unstack.sort_values(ascending = False)[16:27:2]

atemp       temp      0.991785
registered  cnt       0.968540
season      mnth      0.865268
instant     yr        0.777284
cnt         casual    0.702414
registered  casual    0.503185
dtype: float64

Here we can see that atemp are temp and closely correlated so we can drop atemp since it's irrelevant
and it will only create noise

The instant column is just the ID number the daily report gets so that one has to be dropped

We can also drop instant because it's just an identifier

We can drop yr because it's irrelevant to people hiring rental bikes

Finally we can also drop the dteday column because they are the same every year and has nothing to do
with the issue, the only relevant data related to date is the day of the week,
if it is a working day or not and if it's a holiday

In [48]:
#the following columns have a set number of values so they can be turned into category types
#we can just copy the names in the results of the cell above
columns_to_category = ["dteday", "season", "yr", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit"]
#the following fields after weathersit, like temp and hr, have too many variables
#so they can't be categories
for column in columns_to_category:
    bikes_train_copy[column] = bikes_train_copy[column].astype('category')

In [49]:
for column in columns_to_category:
    bikes_test_copy[column] = bikes_test_copy[column].astype('category')

In [50]:
bikes_train_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11999 entries, 0 to 11998
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   instant     11999 non-null  int64   
 1   dteday      11999 non-null  category
 2   season      11999 non-null  category
 3   yr          11999 non-null  category
 4   mnth        11999 non-null  category
 5   hr          11999 non-null  category
 6   holiday     11999 non-null  category
 7   weekday     11999 non-null  category
 8   workingday  11999 non-null  category
 9   weathersit  11999 non-null  category
 10  temp        11999 non-null  float64 
 11  atemp       11999 non-null  float64 
 12  hum         11999 non-null  float64 
 13  windspeed   11999 non-null  float64 
 14  casual      11999 non-null  int64   
 15  registered  11999 non-null  int64   
 16  cnt         11999 non-null  int64   
dtypes: category(9), float64(4), int64(4)
memory usage: 889.6 KB


In [51]:
#it would be nice if the columns were easily recognizable
#it's a good idea to rename them when neccesary
bikes_train_copy.rename(columns= {'dteday':'date', 'yr':'year', 'mnth':'month', 'hr': 'hour', 'weathersit': 'weather', 'temp': 'temperature_C', 'atemp': 'temperature_wind-chill_factor_C',  'hum':'humidity', 'cnt':'count'}, inplace= True)
bikes_train_copy.head()

Unnamed: 0,instant,date,season,year,month,hour,holiday,weekday,workingday,weather,temperature_C,temperature_wind-chill_factor_C,humidity,windspeed,casual,registered,count
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [52]:
bikes_test_copy.rename(columns= {'dteday':'date', 'yr':'year', 'mnth':'month', 'hr': 'hour', 'weathersit': 'weather', 'temp': 'temperature_C', 'atemp': 'temperature_wind-chill_factor_C',  'hum':'humidity', 'cnt':'count'}, inplace= True)
bikes_test_copy.head()

Unnamed: 0,instant,date,season,year,month,hour,holiday,weekday,workingday,weather,temperature_C,temperature_wind-chill_factor_C,humidity,windspeed
0,12000,2012-05-20,2,1,5,4,0,0,0,1,0.52,0.5,0.68,0.0896
1,12001,2012-05-20,2,1,5,5,0,0,0,1,0.5,0.4848,0.72,0.1045
2,12002,2012-05-20,2,1,5,6,0,0,0,1,0.5,0.4848,0.63,0.1343
3,12003,2012-05-20,2,1,5,7,0,0,0,1,0.52,0.5,0.68,0.194
4,12004,2012-05-20,2,1,5,8,0,0,0,1,0.56,0.5303,0.56,0.1642


In [53]:
#here we are dropping the columns mentioned above
bikes_train_copy = bikes_train_copy.drop(columns=['instant', 'temperature_wind-chill_factor_C', 'date', 'year'],axis=1)

In [54]:
bikes_test_copy = bikes_test_copy.drop(columns=['instant', 'temperature_wind-chill_factor_C', 'date', 'year'],axis=1)

In [55]:
bikes_train_copy.head()

Unnamed: 0,season,month,hour,holiday,weekday,workingday,weather,temperature_C,humidity,windspeed,casual,registered,count
0,1,1,0,0,6,0,1,0.24,0.81,0.0,3,13,16
1,1,1,1,0,6,0,1,0.22,0.8,0.0,8,32,40
2,1,1,2,0,6,0,1,0.22,0.8,0.0,5,27,32
3,1,1,3,0,6,0,1,0.24,0.75,0.0,3,10,13
4,1,1,4,0,6,0,1,0.24,0.75,0.0,0,1,1


In [56]:
bikes_test_copy.head()

Unnamed: 0,season,month,hour,holiday,weekday,workingday,weather,temperature_C,humidity,windspeed
0,2,5,4,0,0,0,1,0.52,0.68,0.0896
1,2,5,5,0,0,0,1,0.5,0.72,0.1045
2,2,5,6,0,0,0,1,0.5,0.63,0.1343
3,2,5,7,0,0,0,1,0.52,0.68,0.194
4,2,5,8,0,0,0,1,0.56,0.56,0.1642


In [57]:
data_bikes_copy = bikes_train_copy.copy()

In [58]:
#X is going to be the dataset minus the 3 columns with rentals, casual, registered and count
#y is going to be those casual and registered, not count since it's just the sum of the previous two
X = data_bikes_copy.drop(columns=['casual', 'registered', 'count'])
y = data_bikes_copy[['casual', 'registered']]
X.shape, y.shape

((11999, 10), (11999, 2))

In [59]:
#here we just split the sets like always
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

trying random forest

In [60]:
#we are just declaring the model
random_forest_model = RandomForestRegressor(random_state=42)

In [61]:
#we just use fit and predict like in any model
random_forest_model.fit(X_train, y_train)
random_forest_model_prediction = random_forest_model.predict(X_test)

In [62]:
#we get each score and multiply the r2 score so it's a number between 0 and 100
print('R2 Score: ', r2_score(y_test, random_forest_model_prediction)*100)
print('RMSE: ', np.sqrt(mean_squared_error(y_test, random_forest_model_prediction)))

R2 Score:  86.7901673924033
RMSE:  33.413251247040826


In [63]:
random_forest_model_prediction_train = random_forest_model.predict(X_train)

In [64]:
print('R2 Score: ', r2_score(y_train, random_forest_model_prediction_train)*100)
print('RMSE: ', np.sqrt(mean_squared_error(y_train, random_forest_model_prediction_train)))

R2 Score:  98.12943890432346
RMSE:  12.870783646421817


In [67]:
X.shape, bikes_test_copy.shape

((11999, 10), (5380, 10))

In [68]:
random_forest_model_prediction_proper = random_forest_model.predict(bikes_test_copy)

In [69]:
random_forest_model_prediction_proper

array([[ 3.73,  8.44],
       [ 4.38,  8.09],
       [ 6.08, 12.53],
       ...,
       [ 5.69, 71.65],
       [ 4.03, 80.3 ],
       [ 2.97, 30.63]])

In [79]:
random_forest_model_prediction_proper_col_1 = [n[0] for n in random_forest_model_prediction_proper]
random_forest_model_prediction_proper_col_2 = [n[1] for n in random_forest_model_prediction_proper]
random_forest_model_prediction_proper_col_3 = [(n[0]+n[1]) for n in random_forest_model_prediction_proper]
random_forest_model_prediction_proper_col_3


[12.17,
 12.469999999999999,
 18.61,
 38.980000000000004,
 129.57,
 225.09000000000003,
 352.90999999999997,
 579.0799999999999,
 588.91,
 625.74,
 545.28,
 538.98,
 504.54999999999995,
 380.52,
 369.07,
 305.71,
 256.74,
 163.33,
 130.31,
 48.34,
 22.98,
 15.23,
 9.940000000000001,
 7.319999999999999,
 5.19,
 16.349999999999998,
 55.78,
 97.35000000000001,
 400.31,
 141.63,
 83.46000000000001,
 131.98,
 184.73000000000002,
 130.09,
 124.5,
 186.19,
 308.93,
 588.56,
 564.6800000000001,
 397.43,
 270.04,
 210.32,
 127.44999999999999,
 66.86,
 22.16,
 15.01,
 7.369999999999999,
 3.7,
 4.12,
 11.57,
 64.14,
 142.49,
 534.34,
 287.12,
 150.1,
 161.56,
 188.54000000000002,
 191.09,
 195.92,
 224.99,
 324.90000000000003,
 659.04,
 645.44,
 413.73,
 329.08000000000004,
 223.04000000000002,
 153.87,
 82.4,
 40.88,
 17.869999999999997,
 11.870000000000001,
 4.39,
 3.6500000000000004,
 14.2,
 121.78,
 337.45000000000005,
 614.1700000000001,
 273.71000000000004,
 161.33,
 170.64,
 211.74,
 199.7

In [75]:
prototype = pd.DataFrame()

In [80]:
prototype['casual'] = random_forest_model_prediction_proper_col_1
prototype['registered'] = random_forest_model_prediction_proper_col_2
prototype['count'] = random_forest_model_prediction_proper_col_3
prototype.head()

Unnamed: 0,casual,registered,count
0,3.73,8.44,12.17
1,4.38,8.09,12.47
2,6.08,12.53,18.61
3,11.91,27.07,38.98
4,33.44,96.13,129.57


In [81]:
# the same method we use for any other regressor
gradient_boost_model = MultiOutputRegressor(GradientBoostingRegressor(random_state=2))

gradient_boost_model.fit(X_train, y_train)
gradient_boost_model_prediction = gradient_boost_model.predict(X_test)

In [82]:
print('R2 Score: ', r2_score(y_test, gradient_boost_model_prediction)*100)
print('RMSE: ', np.sqrt(mean_squared_error(y_test, gradient_boost_model_prediction)))

R2 Score:  80.26496263267187
RMSE:  40.90748365588644


In [83]:
gradient_boost_model_prediction_train = gradient_boost_model.predict(X_train)

In [84]:
print('R2 Score: ', r2_score(y_train, gradient_boost_model_prediction_train)*100)
print('RMSE: ', np.sqrt(mean_squared_error(y_train, gradient_boost_model_prediction_train)))

R2 Score:  81.51832469982017
RMSE:  40.57883787416542


In [85]:
prototype_count = pd.DataFrame()
prototype_count['pred'] = prototype['count']
prototype_count.to_csv('KevinG91.csv')