<a href="https://colab.research.google.com/github/Homaoa/Regression-for-a-Bike-Sharing-Data-Set/blob/main/Cleaning_and_Preparing_the_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

## Importing the DataSet

In [2]:
# Path of the raw CSV file.
# NOTE(review): '/content' is presumably the Google Colab session directory (the notebook
# carries an "Open In Colab" badge) — this absolute path must be adjusted for a local run.
url = '/content/Bike DataSet.csv'
# Load the raw data; every later cell works on `df`.
df = pd.read_csv(url)

### Checking the DataSet

In [3]:
# First, take a look at the DataSet: the first five rows show the columns and value formats.
df.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
0,1,1,0,1,0,6,0,2.0,0.344167,0.363625,0.805833,0.160446,331.0,654.0
1,2,1,0,1,0,0,0,2.0,0.363478,0.353739,0.696087,0.248539,131.0,670.0
2,3,1,0,1,0,1,1,1.0,0.196364,0.189405,0.437273,0.248309,120.0,1229.0
3,4,1,0,1,0,2,1,1.0,0.2,0.212122,0.590435,0.160296,108.0,1454.0
4,5,1,0,1,0,3,1,1.0,0.226957,0.22927,0.436957,0.1869,82.0,1518.0


In [4]:
# Show the last five rows as well, to see how the data ends.
df.tail()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
726,727,1,1,12,0,4,1,2.0,0.254167,0.226642,0.652917,0.350133,247.0,1867.0
727,728,1,1,12,0,5,1,2.0,0.253333,0.255046,0.59,0.155471,644.0,2451.0
728,729,1,1,12,0,6,0,2.0,0.253333,0.2424,0.752917,0.124383,159.0,1182.0
729,730,1,1,12,0,0,0,1.0,0.255833,0.2317,0.483333,0.350754,364.0,1432.0
730,731,1,1,12,0,1,1,2.0,0.215833,0.223487,0.5775,0.154846,439.0,2290.0


In [5]:
# Summarise column names, non-null counts and dtypes to spot missing values and wrong types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   season      731 non-null    int64  
 2   yr          731 non-null    int64  
 3   mnth        731 non-null    int64  
 4   holiday     731 non-null    int64  
 5   weekday     731 non-null    int64  
 6   workingday  731 non-null    int64  
 7   weathersit  726 non-null    float64
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         722 non-null    float64
 11  windspeed   731 non-null    float64
 12  casual      724 non-null    float64
 13  registered  724 non-null    float64
dtypes: float64(7), int64(7)
memory usage: 80.1 KB


In [None]:
# The dataset has 14 attributes.
# The names of some attributes are not clear, so they will be renamed.
# Some attributes (weathersit, hum, casual, registered) have missing values that must be handled.
# Also, the data types of some attributes should be changed.

In [6]:
# Replace the terse original column names with descriptive ones.
# rename() returns a new DataFrame, so the result is assigned back to df.
df = df.rename(
    columns={
        'instant': 'Instant',
        'season': 'Season',
        'yr': 'Year',
        'mnth': 'Month',
        'holiday': 'Holiday',
        'weekday': 'WeekDay',
        'workingday': 'WorkingDay',
        'weathersit': 'WeatherSituation',
        'temp': 'Temperature',
        'atemp': 'FeelingTemperature',
        'hum': 'Humidity',
        'windspeed': 'WindSpeed',
        'casual': 'Casual',
        'registered': 'Registered',
    }
)

In [7]:
# Confirm that the new column names were applied.
df.head()

Unnamed: 0,Instant,Season,Year,Month,Holiday,WeekDay,WorkingDay,WeatherSituation,Temperature,FeelingTemperature,Humidity,WindSpeed,Casual,Registered
0,1,1,0,1,0,6,0,2.0,0.344167,0.363625,0.805833,0.160446,331.0,654.0
1,2,1,0,1,0,0,0,2.0,0.363478,0.353739,0.696087,0.248539,131.0,670.0
2,3,1,0,1,0,1,1,1.0,0.196364,0.189405,0.437273,0.248309,120.0,1229.0
3,4,1,0,1,0,2,1,1.0,0.2,0.212122,0.590435,0.160296,108.0,1454.0
4,5,1,0,1,0,3,1,1.0,0.226957,0.22927,0.436957,0.1869,82.0,1518.0


## Missing Values

In [None]:
# As we can see, some variables have missing values.
# WeatherSituation is a categorical variable that describes the weather, so I replace its NaN
# values with the most common category (the mode).

In [8]:
# Count the observations in each weather category (NaN values are excluded by default).
df['WeatherSituation'].value_counts()

1.0    460
2.0    245
3.0     21
Name: WeatherSituation, dtype: int64

In [None]:
# Weather situation 1 is the most common category, so I replace the NaN values of this variable with 1.

In [9]:
# Fill the missing weather categories with the most frequent category (the mode).
# The mode is computed from the data instead of being hard-coded, so the cell stays
# correct if the input file changes; with the current data it evaluates to 1.
# mode() returns its values sorted, so .iloc[0] picks the smallest one in case of a tie.
df['WeatherSituation'] = df['WeatherSituation'].fillna(df['WeatherSituation'].mode().iloc[0])

In [10]:
# Count the categories again to verify that the missing values were filled.
df['WeatherSituation'].value_counts()

1.0    465
2.0    245
3.0     21
Name: WeatherSituation, dtype: int64

In [None]:
# Humidity also has missing values. Since it is a continuous variable, I fill them with the mean
# of the observed (non-missing) values.

In [11]:
# Mean of the observed humidity values (mean() skips NaN by default).
Humidity_mean = df['Humidity'].mean()
print(Humidity_mean)

0.6286847119113573


In [None]:
# The values in the Humidity column have 6 decimal places, so I round the mean to 6 decimals
# to make it consistent with the rest of the values.

In [12]:
# Round the mean to 6 decimal places so the imputed values match the precision of the column.
Humidity_mean = round(Humidity_mean, 6)
print(Humidity_mean)
# Fill the missing humidity readings with the rounded mean.
df['Humidity'] = df['Humidity'].fillna(Humidity_mean)

0.628685


In [None]:
# Casual and Registered (the counts of casual and registered users) are the target variables. To handle
# their missing values, I remove the affected observations.

In [13]:
# Rows where Casual or Registered is missing are dropped.
# A single dropna() over both columns removes the same rows as two consecutive calls
# (the default how='any' drops a row if either value is NaN). Reassigning the result
# instead of using inplace=True avoids mutating the frame in place.
# The original row labels are kept (the index is not reset).
df = df.dropna(subset=['Casual', 'Registered'])

In [14]:
# Now I check the data again: every column should report the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 724 entries, 0 to 730
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Instant             724 non-null    int64  
 1   Season              724 non-null    int64  
 2   Year                724 non-null    int64  
 3   Month               724 non-null    int64  
 4   Holiday             724 non-null    int64  
 5   WeekDay             724 non-null    int64  
 6   WorkingDay          724 non-null    int64  
 7   WeatherSituation    724 non-null    float64
 8   Temperature         724 non-null    float64
 9   FeelingTemperature  724 non-null    float64
 10  Humidity            724 non-null    float64
 11  WindSpeed           724 non-null    float64
 12  Casual              724 non-null    float64
 13  Registered          724 non-null    float64
dtypes: float64(7), int64(7)
memory usage: 84.8 KB


In [None]:
# The missing values are handled. Now I can fix the data types.

## Correcting the data types

In [None]:
# Season, Year, Month, WeekDay, WeatherSituation are actually categorical variables. 
# Holiday and WorkingDay are binary variables. Either holiday or not, either working day or not.
# Year is also binary, it takes only 0 and 1. The DataSet is for two years and the first year is shown by 0
# in the DataSet and the second year by 1.
# All of these variables are currently stored as numeric types, which has to be fixed.

In [18]:
# Store the categorical and binary columns as 'object' so they are no longer treated as numeric.
# One astype() call with a column -> dtype mapping converts all of them at once.
df = df.astype(
    dict.fromkeys(
        ['Season', 'Year', 'Month', 'WeekDay', 'WeatherSituation', 'Holiday', 'WorkingDay'],
        'object',
    )
)

In [25]:
# The rental counts (Casual and Registered) are whole numbers, so store them as integers, not floats.
df = df.astype({'Casual': 'int', 'Registered': 'int'})

In [None]:
# A new variable should be defined as the total count of rental bikes, including both casual and registered.
# This variable will be the target (dependent) variable that we want the model to predict.

In [26]:
# Total rentals for each row: casual rentals plus registered rentals.
df["TotalBikes"] = df['Casual'] + df['Registered']

In [27]:
# Check that the new TotalBikes column was added.
df.head()

Unnamed: 0,Instant,Season,Year,Month,Holiday,WeekDay,WorkingDay,WeatherSituation,Temperature,FeelingTemperature,Humidity,WindSpeed,Casual,Registered,TotalBikes
0,1,1,0,1,0,6,0,2.0,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,1,0,1,0,0,0,2.0,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,1,0,1,0,1,1,1.0,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,1,0,1,0,2,1,1.0,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,1,0,1,0,3,1,1.0,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [29]:
# Final check of the dtypes and non-null counts before saving.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 724 entries, 0 to 730
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Instant             724 non-null    int64  
 1   Season              724 non-null    object 
 2   Year                724 non-null    object 
 3   Month               724 non-null    object 
 4   Holiday             724 non-null    object 
 5   WeekDay             724 non-null    object 
 6   WorkingDay          724 non-null    object 
 7   WeatherSituation    724 non-null    object 
 8   Temperature         724 non-null    float64
 9   FeelingTemperature  724 non-null    float64
 10  Humidity            724 non-null    float64
 11  WindSpeed           724 non-null    float64
 12  Casual              724 non-null    int64  
 13  Registered          724 non-null    int64  
 14  TotalBikes          724 non-null    int64  
dtypes: float64(4), int64(4), object(7)
memory usage: 90.5+ KB

In [None]:
# Now the data is clean and ready for further analysis. So I save it as a new csv file.

In [31]:
# Write the cleaned data to a new CSV file (relative path, so it lands in the working directory).
# NOTE(review): to_csv() writes the DataFrame index by default, which shows up as an extra
# unnamed column when the file is read back — pass index=False if that column is unwanted.
df.to_csv('Bike DataSet Cleaned.csv')