In [1]:


import os
import pandas as pd
import numpy as np
import glob
import requests
from datetime import datetime, timedelta
from time import strptime

#from parkwhere import extract_all_features
#from parkwhereviz import plot_bar, plot_heatmap
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
# Load CSV file as Pandas DataFrame
col_Names=["Date", "occupancy"]
df = pd.read_csv("FrieburgamBahnhof.csv", names=col_Names)

In [3]:
#3. Feature engineering
# Convert `date_time` column into a datetime column
df['date_time'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')

# See data types of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550203 entries, 0 to 550202
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       550203 non-null  object        
 1   occupancy  550203 non-null  int64         
 2   date_time  550203 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 12.6+ MB


In [4]:
df.head()

Unnamed: 0,Date,occupancy,date_time
0,2016-06-21T08:05:46,39,2016-06-21 08:05:46
1,2016-06-21T08:10:44,34,2016-06-21 08:10:44
2,2016-06-21T08:15:09,30,2016-06-21 08:15:09
3,2016-06-21T08:20:02,29,2016-06-21 08:20:02
4,2016-06-21T08:25:04,29,2016-06-21 08:25:04


In [5]:
df.drop("Date",axis=1,inplace=True)

In [6]:
# Create new features from `date_time` column
df['year'] = df['date_time'].dt.year
df['month'] = df['date_time'].dt.month
df['day'] = df['date_time'].dt.day
df['day_of_week'] = df['date_time'].dt.weekday
df['hour'] = df['date_time'].dt.hour
df['minute'] = df['date_time'].dt.minute
df['seconds'] = df['date_time'].dt.second

In [7]:
df.head()

Unnamed: 0,occupancy,date_time,year,month,day,day_of_week,hour,minute,seconds
0,39,2016-06-21 08:05:46,2016,6,21,1,8,5,46
1,34,2016-06-21 08:10:44,2016,6,21,1,8,10,44
2,30,2016-06-21 08:15:09,2016,6,21,1,8,15,9
3,29,2016-06-21 08:20:02,2016,6,21,1,8,20,2
4,29,2016-06-21 08:25:04,2016,6,21,1,8,25,4


In [8]:
df['date'] = df['date_time'].dt.date
# df['time'] = df['date_time'].dt.strftime('%H:%M:%S')
# df['hour_min'] = round(df['hour'] + (df['minute'] / 60), 1)


In [9]:
df.head()

Unnamed: 0,occupancy,date_time,year,month,day,day_of_week,hour,minute,seconds,date
0,39,2016-06-21 08:05:46,2016,6,21,1,8,5,46,2016-06-21
1,34,2016-06-21 08:10:44,2016,6,21,1,8,10,44,2016-06-21
2,30,2016-06-21 08:15:09,2016,6,21,1,8,15,9,2016-06-21
3,29,2016-06-21 08:20:02,2016,6,21,1,8,20,2,2016-06-21
4,29,2016-06-21 08:25:04,2016,6,21,1,8,25,4,2016-06-21


In [10]:
year = list(range(2016,2022,1))
for y in year:   
    print("year",y,len(df[df.date_time.astype(str).str.contains(str(y))]['month'].unique()))

year 2016 7
year 2017 12
year 2018 12
year 2019 12
year 2020 12
year 2021 11


In [11]:
# Convert `month` to categorical
# df['month'].replace({1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
#                      7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}, inplace=True)
# df['month'] = df['month'].astype('category') 
# df['month'].cat.set_categories(new_categories=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
#                                                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], ordered=True, inplace=True)

In [12]:
df_2016 = df.query('year==2016')
df_2017 = df.query('year==2017')
df_2018 = df.query('year==2018')
df_2019 = df.query('year==2019')
df_2020 = df.query('year==2020')
df_2021 = df.query('year==2021')



In [13]:
df_2016_g = df_2016.groupby('month')
df_2017_g = df_2017.groupby('month')
df_2018_g = df_2018.groupby('month')
df_2019_g = df_2019.groupby('month')
df_2020_g = df_2020.groupby('month')
df_2021_g = df_2021.groupby('month')

In [14]:
import warnings
warnings.filterwarnings("ignore")
d1 = dict(df_2016_g.sum()['occupancy'])
d2 = dict(df_2017_g.sum()['occupancy'])
d3 = dict(df_2018_g.sum()['occupancy'])
d4 = dict(df_2019_g.sum()['occupancy'])
d5 = dict(df_2020_g.sum()['occupancy'])
d6 = dict(df_2021_g.sum()['occupancy'])


In [15]:
sum(df_2016.query("month=='Oct'")['occupancy'])

0

In [16]:
df_2016.shape

(48816, 10)

In [17]:
year_list = list(range(2016,2022,1))
d_year={}
for yr in year_list:
    temp_df  = df[df['year']==yr]
    d_year[yr] = sum(temp_df['occupancy'])
    print(temp_df.shape)

     
    


(48816, 10)
(104921, 10)
(100379, 10)
(104814, 10)
(97391, 10)
(93882, 10)


In [18]:
d_year

{2016: 2794016,
 2017: 6280017,
 2018: 6817636,
 2019: 9611945,
 2020: 14399178,
 2021: 14464619}

In [19]:
sum(d_year.values())/10000000

5.4367411

In [20]:
# import plotly.express as px
# import datetime
# import chart_studio.plotly as py
# import chart_studio
# import plotly.graph_objects as go
# # Add my username
# username = 'Alisha Parveen' 

# # Add my api key
# api_key = 'j6vY7DJ6EPjEiEjxrIsj'  

# # Set up connection to chart studio
# chart_studio.tools.set_credentials_file(username = username, api_key = api_key)
# def ShowPie(values,keys,title,monthinteger):
#     fig = px.pie(values=values, names=keys, title=title)
# #     print(monthinteger)


    
#     if(monthinteger<13):
#         month = datetime.date(int(title[5:]), monthinteger, 1).strftime('%B')
#         #     print(month)
#         print(f"Maximum Parking in {month}")
#     else:
#         print(f"Maximum Parking in {monthinteger}")
        
    
#     fig.show()
# #     fig.savfig("Machine_learning_nlp_Data_Science_Deep_learning_data_science_python.png")
# ShowPie(d1.values(),d1.keys(),"Year 2016",max(d1, key=d1.get))

In [21]:
# ShowPie(d_year.values(),d_year.keys(),"Year 2021",max(d_year, key=d_year.get))

In [22]:
# import matplotlib.pyplot as plt

# fig = px.bar( x=d_year.keys(), y=d_year.values())
# fig.update_layout(
#     title="Occupancy per Year",
#     xaxis_title="Occupancy",
#     yaxis_title="Year",
#     legend_title="Occupancy per Year",
#     font=dict(
#         family="Courier New, monospace",
#         size=18,
# #         color="RebeccaPurple"
#     )
# )
# fig.show()

In [23]:
# Convert `day_of_week` to categorical
# df['day_of_week'].replace({0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}, inplace=True)
# df['day_of_week'] = df['day_of_week'].astype('category') 
# df['day_of_week'].cat.set_categories(new_categories=['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'], 
#                                      ordered=True, inplace=True)

In [24]:
df.head(20)

Unnamed: 0,occupancy,date_time,year,month,day,day_of_week,hour,minute,seconds,date
0,39,2016-06-21 08:05:46,2016,6,21,1,8,5,46,2016-06-21
1,34,2016-06-21 08:10:44,2016,6,21,1,8,10,44,2016-06-21
2,30,2016-06-21 08:15:09,2016,6,21,1,8,15,9,2016-06-21
3,29,2016-06-21 08:20:02,2016,6,21,1,8,20,2,2016-06-21
4,29,2016-06-21 08:25:04,2016,6,21,1,8,25,4,2016-06-21
5,32,2016-06-21 08:30:10,2016,6,21,1,8,30,10,2016-06-21
6,31,2016-06-21 08:35:53,2016,6,21,1,8,35,53,2016-06-21
7,27,2016-06-21 08:40:53,2016,6,21,1,8,40,53,2016-06-21
8,25,2016-06-21 08:45:02,2016,6,21,1,8,45,2,2016-06-21
9,20,2016-06-21 08:50:50,2016,6,21,1,8,50,50,2016-06-21


#Deadling with Occupancy 

In [25]:
df['occupancy'] = df['occupancy'].apply(lambda x:int(x>=100))

In [26]:
# 1------------>greater than 100
# 0------------>less than 100


In [27]:
print(f"less than {len(df[df['occupancy']==0]['occupancy'])}")
print(f"less than {len(df[df['occupancy']==1]['occupancy'])}")

less than 288409
less than 261794


In [28]:
# Convert `hour` to categorical
# df['hour'] = df['hour'].astype('category') 
# df['hour'].cat.set_categories(new_categories=list(range(24)), ordered=True, inplace=True)

In [29]:
df.head(5)

Unnamed: 0,occupancy,date_time,year,month,day,day_of_week,hour,minute,seconds,date
0,0,2016-06-21 08:05:46,2016,6,21,1,8,5,46,2016-06-21
1,0,2016-06-21 08:10:44,2016,6,21,1,8,10,44,2016-06-21
2,0,2016-06-21 08:15:09,2016,6,21,1,8,15,9,2016-06-21
3,0,2016-06-21 08:20:02,2016,6,21,1,8,20,2,2016-06-21
4,0,2016-06-21 08:25:04,2016,6,21,1,8,25,4,2016-06-21


In [30]:
df.columns
# datetime , date ,time,day_of_week,h

Index(['occupancy', 'date_time', 'year', 'month', 'day', 'day_of_week', 'hour',
       'minute', 'seconds', 'date'],
      dtype='object')

In [31]:
from datetime import datetime, timedelta


In [32]:
def parse_date(date_string):
    """Parse a date string in YYYY-MM-DD format as a datetime object."""
    
    return datetime.strptime(date_string, '%Y-%m-%d').date()

In [33]:
import pandas as pd
from datetime import date
import holidays
from workalendar.europe import Germany
holiday_list = []
for holiday in holidays.Germany(years=[2021, 2020, 2019,2018,2017,2016]).items():
    holiday_list.append(holiday)

holidays_df = pd.DataFrame(holiday_list, columns=["date", "holiday"])

In [34]:
len(holidays_df)

55

In [35]:
holidays_df

Unnamed: 0,date,holiday
0,2016-01-01,Neujahr
1,2016-03-25,Karfreitag
2,2016-03-28,Ostermontag
3,2016-05-01,Erster Mai
4,2016-05-05,Christi Himmelfahrt
5,2016-05-16,Pfingstmontag
6,2016-10-03,Tag der Deutschen Einheit
7,2016-12-25,Erster Weihnachtstag
8,2016-12-26,Zweiter Weihnachtstag
9,2017-01-01,Neujahr


In [36]:
#df['ph'] = np.where(df['date'].isin(holidays_df['date']), 'ph', '')
#df['ph'].replace('', 'nil', inplace=True)

In [37]:
df2 = pd.merge(df, holidays_df, how='left', on='date')

In [38]:
df2['holiday'].str.contains('Neujahr').sum()

1106

In [39]:
df2

Unnamed: 0,occupancy,date_time,year,month,day,day_of_week,hour,minute,seconds,date,holiday
0,0,2016-06-21 08:05:46,2016,6,21,1,8,5,46,2016-06-21,
1,0,2016-06-21 08:10:44,2016,6,21,1,8,10,44,2016-06-21,
2,0,2016-06-21 08:15:09,2016,6,21,1,8,15,9,2016-06-21,
3,0,2016-06-21 08:20:02,2016,6,21,1,8,20,2,2016-06-21,
4,0,2016-06-21 08:25:04,2016,6,21,1,8,25,4,2016-06-21,
...,...,...,...,...,...,...,...,...,...,...,...
550198,1,2021-08-05 19:15:07,2021,8,5,3,19,15,7,2021-08-05,
550199,1,2021-08-05 19:30:07,2021,8,5,3,19,30,7,2021-08-05,
550200,1,2021-08-05 19:45:01,2021,8,5,3,19,45,1,2021-08-05,
550201,1,2021-08-05 19:55:08,2021,8,5,3,19,55,8,2021-08-05,


In [40]:
#Select and rearrange columns
df2 = df2[['year',  'month', 'day', 'day_of_week', 'hour', 
         'minute', 'seconds', 'holiday', 'occupancy']] 

# Show first 5 rows
df2.head()

Unnamed: 0,year,month,day,day_of_week,hour,minute,seconds,holiday,occupancy
0,2016,6,21,1,8,5,46,,0
1,2016,6,21,1,8,10,44,,0
2,2016,6,21,1,8,15,9,,0
3,2016,6,21,1,8,20,2,,0
4,2016,6,21,1,8,25,4,,0


In [41]:
df2['holiday'].str.contains('Neujahr').sum()

1106

In [42]:
df2['holiday'] = df2['holiday'].fillna('no')

In [43]:
df2['holiday'] = df2['holiday'].apply(lambda x:int(x!="no"))

# dropping holiday and day_of_week

In [44]:
# df2.drop(["holiday","day_of_week"],axis=1,inplace=True)

In [45]:
df2

Unnamed: 0,year,month,day,day_of_week,hour,minute,seconds,holiday,occupancy
0,2016,6,21,1,8,5,46,0,0
1,2016,6,21,1,8,10,44,0,0
2,2016,6,21,1,8,15,9,0,0
3,2016,6,21,1,8,20,2,0,0
4,2016,6,21,1,8,25,4,0,0
...,...,...,...,...,...,...,...,...,...
550198,2021,8,5,3,19,15,7,0,1
550199,2021,8,5,3,19,30,7,0,1
550200,2021,8,5,3,19,45,1,0,1
550201,2021,8,5,3,19,55,8,0,1


In [46]:
df2.shape

(550203, 9)

In [47]:
y=df2.iloc[:,6].values
x = df2.drop('occupancy',axis=1).values





In [48]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x = std.fit_transform(x)

In [49]:
x.shape

(550203, 8)

In [50]:
x

array([[-1.69814385, -0.26044284,  0.59329428, ..., -1.30660532,
         2.16318831, -0.15683154],
       [-1.69814385, -0.26044284,  0.59329428, ..., -1.01691719,
         2.02018591, -0.15683154],
       [-1.69814385, -0.26044284,  0.59329428, ..., -0.72722906,
        -0.48235615, -0.15683154],
       ...,
       [ 1.46028695,  0.33449377, -1.22419294, ...,  1.01089969,
        -1.05436576, -0.15683154],
       [ 1.46028695,  0.33449377, -1.22419294, ...,  1.59027594,
        -0.55385735, -0.15683154],
       [ 1.46028695,  0.33449377, -1.22419294, ..., -1.30660532,
        -1.05436576, -0.15683154]])

In [51]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state = 42, test_size = 0.2)

In [52]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import seaborn as sns
# svc = SVC()


In [53]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
dt  =DecisionTreeClassifier()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()

# model = 
lr = LogisticRegression()
models = [lr,rf,knn,dt]
accuracy = []
for model in models:
    model.fit(x_train, y_train)
    # rfc.score(x_train, y_train)
    y_predict = model.predict(x_test)
    accuracy.append(accuracy_score(y_predict,y_test))
#     print("*"*20,model,"*"*20)
#     print(f'+++++++++++++ {"="*20} Classification Report  {model} {"="*20}  +++++++++++++++++++')
#     print(classification_report(y_test,y_predict))

#     print(f'+++++++++++++ {"="*20} Confusion Metrix of {model} {"="*20}  +++++++++++++++++++')

#     cm = confusion_matrix(y_test, y_predict) 

#     # # Transform to df for easier plotting
#     cm_df = pd.DataFrame(cm,
#                          index = ['No','Yes',], 
#                          columns = ['No','Yes'])

#     plt.figure(figsize=(5.5,4))
#     sns.heatmap(cm_df, annot=True)
#     plt.title(model)
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     plt.show()


  
 



KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
model_ev = pd.DataFrame({'Model': ['Logistic Regression', 'Random Forest',
                    'K-Nearest Neighbour','Decision Tree'], 'Accuracy': accuracy})
model_ev


In [None]:
 
plt.figure(figsize=(12,5))
import plotly.express as px
 
fig = px.bar( model_ev, x=model_ev['Model'], y=model_ev['Accuracy'],color = 'Accuracy')

plt.savefig("output_no1w.jpg")
fig.show()

In [None]:
feature_scores = pd.Series(rf.feature_importances_, index=df2.columns[0:8]).sort_values(ascending=False)

feature_scores

# feature importance

In [None]:

f, ax = plt.subplots(figsize=(30, 24))
ax = sns.barplot(x=feature_scores, y=feature_scores.index, data=df)
ax.set_title("Visualize feature scores of the features")
ax.set_yticklabels(feature_scores.index)
ax.set_xlabel("Feature importance score")
ax.set_ylabel("Features")
plt.show()

In [None]:
# year	month	day	day_of_week	hour	minute	seconds	holiday

In [None]:
import plotly.express as px
fig, axs = plt.subplots(2,2, figsize=(15, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
axs = axs.ravel()
# for i in range(10):


    
def Probabilty_Date_Range(date,dow,starting_time,ending_time,holiday):
    minutes_range = list(range(0,60,15))
    output = []
    for n,minute in enumerate(minutes_range):
        
        output = []
        input = [date[0],date[1],date[2],dow,starting_time,minute,0,holiday]
#         print(input)

        output = rf.predict_proba([input]) 
#         print()
#         print(output[0][0])
#         print(output[0][1]) 
        data = [output[0][0], output[0][1]]
        label = ['No', 'Yes']
        x =n
#         plt.subplot(2, 2,x )
        axs[x].pie(data, labels=label, autopct='%1.1f%%', shadow=True, startangle=90)
        if n==0:
            l=["*"*15,starting_time,":0",minute+1,"to",starting_time,":",minute+15,"*"*15]
        elif n==3:
            l=["*"*15,starting_time,":",minute+1,"to",starting_time+1,":","00","*"*15]
            
        else:
            l=["*"*15,starting_time,":",minute+1,"to",starting_time,":",minute+15,"*"*15]
#         axs[n].contourf(np.random.rand(10,10),5,cmap=plt.cm.Oranges)
        axs[x].set_title(str("  ".join(map(str, l))))
#         plt.title('parking')
#         plt.axis('equal')
#         plt.show()
        
#         break
# day of week is 2 Tuesday 

Probabilty_Date_Range([2022,2,1],2,12,1,0)

# Saving Model

In [None]:
!pip install joblib
import joblib
filename = 'finalized_model.sav'
joblib.dump(rf, filename)

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Reading Model

# 

In [None]:
loaded_model = joblib.load(filename)
date = [2022,2,1]
dow = 3
starting =8
ending = 9
minute=15
holiday = 1
seconds=0
date.extend([dow,starting,minute,seconds,holiday])
result = loaded_model.predict_proba([date])

print(result[0][0])
print(result[0][1])