# Using Naive Bayes to predict Count of total rental bikes

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np

In [2]:
# turns category data to numeric
def categoryToNum(dataSet):
    # import data
    d = dataSet
    # get column names of non numeric columns
    stringCol = d.select_dtypes(include='dtype').columns.values
    #setting dtype to category and converting category to numeric values
    for colName in stringCol:
        d[colName] = d[colName].astype('category')
        d[colName] = d[colName].cat.codes
    # run correlations and sort according to best
    dataCorr = abs(d.corr(method = 'pearson')['cnt'])
    dataCorr = dataCorr.drop(['cnt'])
    sortCorr = (dataCorr[~np.isnan(dataCorr)]).sort_values(ascending = False)
    return d, sortCorr

In [3]:
# select x factors to predict y with at least 0.80 accuracy
def selectXFeats(n):
    #n = number of features
    newData = pd.DataFrame()
    newData['cnt'] = d['cnt']
    for feat in range(n):
        newData[sortCorr.index[feat]] = d[sortCorr.index[feat]]
    return newData

In [11]:
# this one starts the thing
def dataSet(data, testProportion):
    x = data.drop(['cnt'], axis = 1)
    y = data['cnt']
    nbGaus(x, y, testProportion)

In [5]:
# function to get yTest (prediction) values
def nbGaus(x, y, testProportion = 0.8):
    XTrain, XTest, yTrain, yTest = train_test_split(x, y, test_size=testProportion)
    gausNB = GaussianNB()
    yPred = gausNB.fit(XTrain, yTrain).predict(XTrain)
    accuracyRate = accuracy(yPred, yTrain)
    if accuracyRate > 0.8:
        print('Train: \nAccuracy Rate:', accuracyRate *100, '% \nFactors used:', list(x.columns))
    else:
        #add next feat to xData
        data = selectXFeats(xdata.shape[1])
        dataSet(data, 0.8)

In [6]:
# function to get accuracy
def accuracy(yPred, yTrain):
    #accuracies = ((predictions == (df['passing'].values)).sum(axis=0)/len(df['passing']))
    #accuracy = (fit.predict(Xs) == y).sum()/y.shape[0]
    accuracyRate = ((yPred == yTrain).sum(axis=0)/len(yTrain))
    return accuracyRate

# Daily Data

In [7]:
dailyData = pd.read_csv('C:/Users/Jenny/Desktop/UOP-MSDS/2020Spring/ANLT_210_SoftwareMethods/Bikeshare/Data/daily.csv')

In [8]:
dailyData.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [14]:
dailyData.drop(['casual', 'registered'], axis = 1).head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,2,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,3,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,5,4,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600


In [15]:
dailyData2 = dailyData.drop(['casual', 'registered'], axis = 1)

In [18]:
# getting categorical data to numeric
d, sortCorr = categoryToNum(dailyData2)
# starting data with one X feature
xdata = selectXFeats(1)

In [22]:
# runs everything
dataSet(xdata, 0.8)

Train: 
Accuracy Rate: 98.63013698630137 % 
Factors used: ['atemp']


# Hourly Data

In [23]:
hourlyData = pd.read_csv('C:/Users/Jenny/Desktop/UOP-MSDS/2020Spring/ANLT_210_SoftwareMethods/Bikeshare/Data/hourly.csv')

In [24]:
hourlyData2 = hourlyData.drop(['casual', 'registered'], axis = 1)

In [25]:
# getting categorical data to numeric
d2, sortCorr2 = categoryToNum(hourlyData2)
# starting data with one X feature
xdata2 = selectXFeats(1)

In [26]:
# runs everything
dataSet(xdata2, 0.8)

Train: 
Accuracy Rate: 98.63013698630137 % 
Factors used: ['atemp']


## What Naive Bayes tells Us
After runing Naive Bayes on both Daily and Hourly data set, 'atemp' shows that it predicts counts for total bike rentals with an accuracy rate of over 95%.

<font color = 'red'> Need to predict count of total bike rentals </font>