In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from matplotlib import pyplot
from sklearn.feature_selection import chi2
from sklearn.metrics import mean_absolute_error as mae
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
import math

In [2]:
tripdata = pd.read_csv("/content/final_bike_sharing.csv")

In [3]:
tripdata.head(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,humidity,snow,snowdepth,windspeed,visibility,solarradiation,cloudcover,conditions,description,seasons
0,932,2018-01-01 02:06:17.541,2018-01-01 02:21:50.027,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,47.8,0.0,0.1,18.5,16.0,106.7,0.3,Clear,Clear conditions throughout the day.,winter
1,550,2018-01-01 12:06:18.039,2018-01-01 12:15:28.443,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,47.8,0.0,0.1,18.5,16.0,106.7,0.3,Clear,Clear conditions throughout the day.,winter
2,510,2018-01-01 12:06:56.978,2018-01-01 12:15:27.810,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,47.8,0.0,0.1,18.5,16.0,106.7,0.3,Clear,Clear conditions throughout the day.,winter


In [4]:
tripdata.columns

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender', 'dist', 'birthyear', 'date', 'month', 'day',
       'hour', 'min', 'year', 'years_old', 'age_group', 'holiday', 'datetime',
       'tempmax', 'tempmin', 'temp', 'feelslike', 'precip', 'dew', 'humidity',
       'snow', 'snowdepth', 'windspeed', 'visibility', 'solarradiation',
       'cloudcover', 'conditions', 'description', 'seasons'],
      dtype='object')

In [5]:
tripdata=tripdata.drop(['starttime','stoptime'], axis=1)
tripdata.columns

Index(['tripduration', 'start station id', 'start station name',
       'start station latitude', 'start station longitude', 'end station id',
       'end station name', 'end station latitude', 'end station longitude',
       'bikeid', 'usertype', 'birth year', 'gender', 'dist', 'birthyear',
       'date', 'month', 'day', 'hour', 'min', 'year', 'years_old', 'age_group',
       'holiday', 'datetime', 'tempmax', 'tempmin', 'temp', 'feelslike',
       'precip', 'dew', 'humidity', 'snow', 'snowdepth', 'windspeed',
       'visibility', 'solarradiation', 'cloudcover', 'conditions',
       'description', 'seasons'],
      dtype='object')

In [6]:
X = tripdata.drop('tripduration', axis=1)
y = tripdata['tripduration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# summarize
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)


Train (26323, 40) (26323,)
Test (6581, 40) (6581,)


In [8]:
feature_names=['start station id', 'start station name',
       'start station latitude', 'start station longitude', 'end station id',
       'end station name', 'end station latitude', 'end station longitude',
       'bikeid', 'usertype', 'birth year', 'gender', 'dist', 'birthyear',
       'date', 'month', 'day', 'hour', 'min', 'year', 'years_old', 'age_group',
       'holiday', 'datetime', 'tempmax', 'tempmin', 'temp', 'feelslike',
       'precip', 'dew', 'humidity', 'snow', 'snowdepth', 'windspeed',
       'visibility', 'solarradiation', 'cloudcover', 'conditions',
       'description', 'seasons']

In [11]:
ohe = OneHotEncoder(handle_unknown = 'ignore')
ohe.fit(X_train)
X_train_encoded = ohe.transform(X_train)
X_test_encoded = ohe.transform(X_test)


In [12]:
le = LabelEncoder()     
le.fit_transform(y_train) 

y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.fit_transform(y_test)

In [13]:
# fitting a RF model
model = RandomForestRegressor()
model.fit(X_train_encoded, y_train_encoded)

# getting forecasts for the test set
preds = model.predict(X_test_encoded)

# computing MAE error
print(mae(y_test_encoded, preds))


84.27708706883452


In [14]:
# getting importance scores from previous model
importance_scores = pd.Series(dict(zip(X_train.columns, model.feature_importances_)))

# getting top 10 features
top_20_features = importance_scores.sort_values(ascending=False)[:20]
top_20_features_nm = top_20_features.index

In [15]:
top_20_features_nm

Index(['start station longitude', 'min', 'solarradiation', 'year',
       'start station id', 'tempmin', 'feelslike', 'start station latitude',
       'gender', 'years_old', 'usertype', 'start station name',
       'end station longitude', 'holiday', 'end station id', 'birth year',
       'month', 'dist', 'snowdepth', 'datetime'],
      dtype='object')

## Univate amputaion of 5% NA on training data

In [None]:
## get the sample size such as 5%, 10%,20% etc
def get_sample_size(df, perc=5):
    return round(perc/100 * df.shape[0])

#get_sample_size(training_data)    

In [None]:
def get_an_index(index):
    li = []
    for i in index:
        li.append(i)
    return li
        
##get_an_index(training_data.index)

In [None]:
random.seed(100)
def replace_nan_values(df,col_name, perc):
    index = get_an_index(df.index)
    sample_size = get_sample_size(df, perc)
    selected_index = random.sample(index, sample_size)
    for i in selected_index:
        df.at[i,col_name]= np.NaN
    

In [None]:
replace_nan_values(training_data,"tripduration",perc=5)

In [None]:
training_data.isnull().sum()

tripduration               14156
start station id               0
start station latitude         0
start station longitude        0
end station id                 0
end station latitude           0
end station longitude          0
bikeid                         0
usertype                       0
birth year                     0
gender                         0
dist                           0
year                           0
month                          0
day                            0
hour                           0
date                           0
holiday                        0
datetime                       0
tempmax                        0
tempmin                        0
temp                           0
dew                            0
humidity                       0
snow                           0
snowdepth                      0
windspeed                      0
visibility                     0
solarradiation                 0
cloudcover                     0
conditions

## 1. Remove amputed NA values

In [None]:
df = training_data.dropna()

In [None]:
df.isnull().sum()

tripduration               0
start station id           0
start station latitude     0
start station longitude    0
end station id             0
end station latitude       0
end station longitude      0
bikeid                     0
usertype                   0
birth year                 0
gender                     0
dist                       0
year                       0
month                      0
day                        0
hour                       0
date                       0
holiday                    0
datetime                   0
tempmax                    0
tempmin                    0
temp                       0
dew                        0
humidity                   0
snow                       0
snowdepth                  0
windspeed                  0
visibility                 0
solarradiation             0
cloudcover                 0
conditions                 0
description                0
seasons                    0
dtype: int64

## Applay LinearRegression

In [None]:
model = LinearRegression().fit(x, y) 

NameError: name 'LinearRegression' is not defined