## Logistic regression

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [2]:
dataset2 = pd.read_csv("SeoulBikeData.csv",encoding='latin1') 
dataset2.head(2) # view the data

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [3]:
dataset2['Rented Bike Count'] = dataset2['Rented Bike Count'].astype(float)
dataset2['Hour'] = dataset2['Hour'].astype(float)
dataset2['Humidity(%)'] = dataset2['Humidity(%)'].astype(float)
dataset2['Visibility (10m)'] = dataset2['Visibility (10m)'].astype(float)

In [4]:
dataset2.dtypes

Date                          object
Rented Bike Count            float64
Hour                         float64
Temperature(°C)              float64
Humidity(%)                  float64
Wind speed (m/s)             float64
Visibility (10m)             float64
Dew point temperature(°C)    float64
Solar Radiation (MJ/m2)      float64
Rainfall(mm)                 float64
Snowfall (cm)                float64
Seasons                       object
Holiday                       object
Functioning Day               object
dtype: object

In [5]:
# remove the date
X_L= dataset2.iloc[:, 1:].values
print(X_L)

[[254.0 0.0 -5.2 ... 'Winter' 'No Holiday' 'Yes']
 [204.0 1.0 -5.5 ... 'Winter' 'No Holiday' 'Yes']
 [173.0 2.0 -6.0 ... 'Winter' 'No Holiday' 'Yes']
 ...
 [694.0 21.0 2.6 ... 'Autumn' 'No Holiday' 'Yes']
 [712.0 22.0 2.1 ... 'Autumn' 'No Holiday' 'Yes']
 [584.0 23.0 1.9 ... 'Autumn' 'No Holiday' 'Yes']]


In [6]:
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Encoding Functioning Day
Functioning = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])], remainder='passthrough')
X_L = np.array(Functioning.fit_transform(X_L))
print(X_L)

[[0.0 1.0 254.0 ... 0.0 'Winter' 'No Holiday']
 [0.0 1.0 204.0 ... 0.0 'Winter' 'No Holiday']
 [0.0 1.0 173.0 ... 0.0 'Winter' 'No Holiday']
 ...
 [0.0 1.0 694.0 ... 0.0 'Autumn' 'No Holiday']
 [0.0 1.0 712.0 ... 0.0 'Autumn' 'No Holiday']
 [0.0 1.0 584.0 ... 0.0 'Autumn' 'No Holiday']]


In [7]:
# Encoding seasons 
seasons = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-2])], remainder='passthrough')
X_L = np.array(seasons.fit_transform(X_L))
print(X_L)

[[0.0 0.0 0.0 ... 0.0 0.0 'No Holiday']
 [0.0 0.0 0.0 ... 0.0 0.0 'No Holiday']
 [0.0 0.0 0.0 ... 0.0 0.0 'No Holiday']
 ...
 [1.0 0.0 0.0 ... 0.0 0.0 'No Holiday']
 [1.0 0.0 0.0 ... 0.0 0.0 'No Holiday']
 [1.0 0.0 0.0 ... 0.0 0.0 'No Holiday']]


In [8]:
# Encoding Holiday (No Holiday=0, Holiday = 1)
for i in range(len(X_L)):
  if X_L[i,-1] =="No Holiday":
     X_L[i,-1] = 0.0
  else:
    X_L[i,-1] = 1.0

print(X_L[1])
"""
import collections, numpy
chek_H= X_L[:,-1]
collections.Counter(chek_H)
"""

[0.0 0.0 0.0 1.0 0.0 1.0 204.0 1.0 -5.5 38.0 0.8 2000.0 -17.6 0.0 0.0 0.0
 0.0]


'\nimport collections, numpy\nchek_H= X_L[:,-1]\ncollections.Counter(chek_H)\n'

In [9]:
# All number into float for further 'exp' caculation 
X_L = X_L[:, :-1]
Y_L = X_L[:, -1]

In [10]:
#Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train_L, X_test_L, Y_train_L, Y_test_L = train_test_split(X_L, Y_L, test_size = 0.3, random_state = 1)
print(Y_train_L.shape, X_train_L.shape, Y_test_L.shape, X_test_L.shape)

(6132,) (6132, 16) (2628,) (2628, 16)


In [11]:
# just make the regression result more correctly, no need. 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_L = sc.fit_transform(X_train_L)
X_test_L = sc.transform(X_test_L)

In [12]:
# Add one
X_train_L = np.concatenate((np.ones((len(X_train_L), 1)), X_train_L), axis=1)
X_test_L = np.concatenate((np.ones((len(X_test_L), 1)), X_test_L), axis=1)

In [13]:
# CHECKING
print(X_train_L[:2,:])
print(X_test_L[:2,:])
print(Y_train_L[0:2])
print(Y_test_L[0:2])
print(Y_train_L.shape, X_train_L.shape, Y_test_L.shape, X_test_L.shape)

[[ 1.         -0.57483949 -0.57458841  1.72156931 -0.57910781 -0.18691364
   0.18691364  0.2443009   0.79116445  1.03414826  1.40666502 -1.07006969
  -1.13166338  1.43013904 -0.08175224 -0.13048284 -0.17682281]
 [ 1.         -0.57483949 -0.57458841  1.72156931 -0.57910781 -0.18691364
   0.18691364 -0.05024993 -0.21523499  1.0424484   0.81681177 -0.97453415
   0.92046118  1.25532504  0.46161113 -0.13048284 -0.17682281]]
[[ 1.         -0.57483949  1.7403762  -0.58086537 -0.57910781 -0.18691364
   0.18691364 -0.0132383  -0.50277769  0.44483842  0.22695851 -0.87899861
   0.48496208  0.52566662  0.50785482 -0.13048284 -0.17682281]
 [ 1.          1.73961604 -0.57458841 -0.58086537 -0.57910781 -0.18691364
   0.18691364  0.74549994 -0.35900634  0.88474577  0.66934845 -1.16560523
   0.0791561   1.06530983  0.45005021 -0.13048284 -0.17682281]]
[0.0 0.0]
[0.0 0.0]
(6132,) (6132, 17) (2628,) (2628, 17)


In [14]:
# change to float for exp caculation
X_train_L = X_train_L.astype(float)
X_test_L = X_test_L.astype(float)
Y_train_L = Y_train_L.astype(float)
Y_test_L = Y_test_L.astype(float)

## Y_train_L , X_train_L, Y_test_L, X_test_L

In [15]:
def sigmoid(x): #Hypothesis Values = y_hat
    return 1.0 / (1 + np.exp(-x))

def cost_funtion_L(X,Y,B):
    ln = X.dot(B)
    m = len(X)
    h = sigmoid(ln)
    cost = (np.log(h)@Y+np.log(1-h)@(1-Y))/(-m)
    return cost

In [16]:
def L_gradient_descent(X, Y, B, alpha, iterations):
    cost_history = [0] * iterations
    ln_history = [0] * iterations
    h_history = [0] * iterations
    error_history = [0]*iterations
    m = len(Y)

    for iteration in range(iterations):
        #print(iteration)
        #Hypothesis Values = y_hat
        ln = X.dot(B)
        ln_history[iteration] = ln
        h = sigmoid(ln)
        h_history[iteration] = h
        # Difference b/w Hypothesis and Actual Y
        error = h-Y
        error_history[iteration] = error
        # Gradient Calculation
        # gradient = X.T.dot(error) / m
        # Changing Values of B using Gradient
        B = B-alpha*X.T.dot(error)/m
        # New Cost Value
        cost = cost_funtion_L(X, Y, B)
        cost_history[iteration] = cost

    return B, cost_history, h_history, ln_history, error_history

In [17]:
B = np.zeros(X_train_L.shape[1])
iterations = 1000
newB_L, cost_history_L,h_history, ln_history,error_history = L_gradient_descent(X_train_L,Y_train_L,B,0.001,iterations)
#newB
newB_L = np.around(newB_L, decimals=2)
newB_L

array([-0.37, -0.01, -0.03, -0.02,  0.07, -0.01,  0.01, -0.04, -0.  ,
       -0.06,  0.04, -0.  , -0.04, -0.04, -0.02, -0.  ,  0.38])

In [46]:
X_test_pred = X_test_L.dot(newB_L)
X_test_pred = X_test_pred.tolist()
qwe = len(X_test_pred)
probability = []
for i in range(qwe):
    if X_test_pred[i] > 0.5:
        probability.append(1)
    else:
        probability.append(0)

print("1 is {} and 0 is {}".format(probability.count(1),probability.count(0)))

probability = np.asarray(probability)

print( probability, probability.shape,Y_test_L.shape)


1 is 90 and 0 is 2538
[0 0 0 ... 1 0 0] (2628,) (2628,)


In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score
probability = probability.astype(int)
Y_test_L = Y_test_L.astype(int)
cm = confusion_matrix(Y_test_L, probability)
print(cm)
accuracy_score(Y_test_L, probability)

[[2538   30    0    0    0    0    0    0    0]
 [   0   17    0    0    0    0    0    0    0]
 [   0   24    0    0    0    0    0    0    0]
 [   0   11    0    0    0    0    0    0    0]
 [   0    4    0    0    0    0    0    0    0]
 [   0    1    0    0    0    0    0    0    0]
 [   0    1    0    0    0    0    0    0    0]
 [   0    1    0    0    0    0    0    0    0]
 [   0    1    0    0    0    0    0    0    0]]


0.9722222222222222

## train set

In [None]:
cost_history_L[-1]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fig,ax = plt.subplots(figsize=(12,8))

ax.set_ylabel('J(Theta)')
ax.set_xlabel('Iterations')
_=ax.plot(range(iterations),cost_history_L,'b.')


In [None]:
fig,ax = plt.subplots(figsize=(12,8))

ax.set_ylabel('h(x)')
ax.set_xlabel('x')
_=ax.plot(ln_history,h_history,'r.')