In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

In [2]:
file = pd.read_csv("C:\\Users\\mjjyo\\OneDrive\\Desktop\\london.csv")

In [3]:
file

Unnamed: 0,rental_id,duration,bike_id,end_rental_date_time,end_station_id,end_station_name,start_rental_date_time,start_station_id,start_station_name
0,61343322,60.0,12871.0,2016-12-28 00:01:00,660.0,"West Kensington Station, West Kensington",2016-12-28 00:00:00,633,"Vereker Road North, West Kensington"
1,61343321,300.0,2837.0,2016-12-28 00:05:00,763.0,"Mile End Park Leisure Centre, Mile End",2016-12-28 00:00:00,531,"Twig Folly Bridge, Mile End"
2,61343323,360.0,1269.0,2016-12-28 00:06:00,99.0,"Old Quebec Street, Marylebone",2016-12-28 00:00:00,116,"Little Argyll Street, West End"
3,61343325,1140.0,4208.0,2016-12-28 00:20:00,468.0,"Cantrell Road, Bow",2016-12-28 00:01:00,443,"Philpot Street, Whitechapel"
4,61343324,,1406.0,,,,2016-12-28 00:01:00,319,"Baldwin Street, St. Luke's"
...,...,...,...,...,...,...,...,...,...
38215555,101367955,240.0,15436.0,2020-09-01 23:53:00,652.0,"Evesham Street, Avondale",2020-09-01 23:49:00,606,"Addison Road, Holland Park"
38215556,101367958,60.0,10896.0,2020-09-01 23:51:00,488.0,"Reardon Street, Wapping",2020-09-01 23:50:00,458,"Wapping Lane, Wapping"
38215557,101367970,480.0,18116.0,2020-09-01 23:59:00,442.0,"Walmer Road, Avondale",2020-09-01 23:51:00,442,"Walmer Road, Avondale"
38215558,101367999,300.0,16371.0,2020-09-01 23:59:00,511.0,"Sutton Street, Shadwell",2020-09-01 23:54:00,202,"Leman Street, Aldgate"


In [4]:
# Convert date columns to datetime format
file['start_rental_date_time'] = pd.to_datetime(file['start_rental_date_time'])
file['end_rental_date_time'] = pd.to_datetime(file['end_rental_date_time'])


In [5]:
# Group dataset by date and station
rides_per_day = file.groupby([file['end_rental_date_time'].dt.date, 'start_station_name']).size().reset_index(name='rides_count')

In [6]:
rides_per_day

Unnamed: 0,end_rental_date_time,start_station_name,rides_count
0,2016-12-28,"Abbey Orchard Street, Westminster",19
1,2016-12-28,"Abbotsbury Road, Holland Park",9
2,2016-12-28,"Aberdeen Place, St. John's Wood",8
3,2016-12-28,"Abingdon Green, Westminster",39
4,2016-12-28,"Abingdon Villas, Kensington",10
...,...,...,...
1044680,2020-09-01,"Wren Street, Holborn",33
1044681,2020-09-01,"Wright's Lane, Kensington",66
1044682,2020-09-01,"Wynne Road, Stockwell",12
1044683,2020-09-01,"York Hall, Bethnal Green",39


In [7]:
# Calculate average rides per day for each station
average_rides = rides_per_day.groupby('start_station_name')['rides_count'].mean().reset_index(name='average_rides')

In [8]:
average_rides

Unnamed: 0,start_station_name,average_rides
0,"Abbey Orchard Street, Westminster",42.817774
1,"Abbotsbury Road, Holland Park",13.040816
2,"Aberdeen Place, St. John's Wood",20.044676
3,"Aberfeldy Street, Poplar",7.011381
4,"Abingdon Green, Westminster",53.360421
...,...,...
834,"Wright's Lane, Kensington",50.958302
835,"Wynne Road, Stockwell",13.674893
836,"York Hall, Bethnal Green",43.179315
837,"York Way, Camden",1.000000


In [11]:
# Merge average rides with the daily ride counts dataset
rides_per_day = pd.merge(rides_per_day, average_rides, on='start_station_name')

In [13]:
# Calculate the difference between actual ride counts and average rides per day
rides_per_day['rides_diff'] = rides_per_day['rides_count'] - rides_per_day['average_rides']

In [14]:
rides_per_day

Unnamed: 0,end_rental_date_time,start_station_name,rides_count,average_rides,rides_diff
0,2016-12-28,"Abbey Orchard Street, Westminster",19,42.817774,-23.817774
1,2016-12-29,"Abbey Orchard Street, Westminster",34,42.817774,-8.817774
2,2016-12-30,"Abbey Orchard Street, Westminster",21,42.817774,-21.817774
3,2016-12-31,"Abbey Orchard Street, Westminster",27,42.817774,-15.817774
4,2017-01-01,"Abbey Orchard Street, Westminster",38,42.817774,-4.817774
...,...,...,...,...,...
1044680,2020-08-28,"Exhibition Road Museums 2, South Kensington",28,34.142857,-6.142857
1044681,2020-08-29,"Exhibition Road Museums 2, South Kensington",20,34.142857,-14.142857
1044682,2020-08-30,"Exhibition Road Museums 2, South Kensington",24,34.142857,-10.142857
1044683,2020-08-31,"Exhibition Road Museums 2, South Kensington",27,34.142857,-7.142857


In [15]:
# Select relevant features for anomaly detection
features = ['rides_diff']

In [17]:
# Train the anomaly detection model (Isolation Forest)
model = IsolationForest(contamination=0.01)  # Adjust the contamination parameter as needed
model.fit(rides_per_day[features])



IsolationForest(contamination=0.01)

In [18]:
# Predict anomalies
predictions = model.predict(rides_per_day[features])

In [19]:
# Add anomaly predictions to the rides_per_day dataframe
rides_per_day['anomaly_label'] = predictions

In [20]:
rides_per_day

Unnamed: 0,end_rental_date_time,start_station_name,rides_count,average_rides,rides_diff,anomaly_label
0,2016-12-28,"Abbey Orchard Street, Westminster",19,42.817774,-23.817774,1
1,2016-12-29,"Abbey Orchard Street, Westminster",34,42.817774,-8.817774,1
2,2016-12-30,"Abbey Orchard Street, Westminster",21,42.817774,-21.817774,1
3,2016-12-31,"Abbey Orchard Street, Westminster",27,42.817774,-15.817774,1
4,2017-01-01,"Abbey Orchard Street, Westminster",38,42.817774,-4.817774,1
...,...,...,...,...,...,...
1044680,2020-08-28,"Exhibition Road Museums 2, South Kensington",28,34.142857,-6.142857,1
1044681,2020-08-29,"Exhibition Road Museums 2, South Kensington",20,34.142857,-14.142857,1
1044682,2020-08-30,"Exhibition Road Museums 2, South Kensington",24,34.142857,-10.142857,1
1044683,2020-08-31,"Exhibition Road Museums 2, South Kensington",27,34.142857,-7.142857,1


In [29]:
# View the anomalies for the specific station
station_name = 'Abbey Orchard Street, Westminster'  # Replace with the desired station name
station_anomalies = rides_per_day[(rides_per_day['start_station_name'] == station_name) & (rides_per_day['anomaly_label'] == -1)]
station_anomalies

Unnamed: 0,end_rental_date_time,start_station_name,rides_count,average_rides,rides_diff,anomaly_label
727,2018-12-25,"Abbey Orchard Street, Westminster",192,42.817774,149.182226,-1


In [33]:
# Filter and return the DataFrame of anomalies
positive_anomalies = rides_per_day[(rides_per_day['rides_diff'] > 0) & (rides_per_day['anomaly_label'] == -1)]
positive_anomalies

Unnamed: 0,end_rental_date_time,start_station_name,rides_count,average_rides,rides_diff,anomaly_label
727,2018-12-25,"Abbey Orchard Street, Westminster",192,42.817774,149.182226,-1
4721,2018-12-25,"Abingdon Green, Westminster",227,53.360421,173.639579,-1
5082,2019-12-25,"Abingdon Green, Westminster",198,53.360421,144.639579,-1
5239,2020-05-30,"Abingdon Green, Westminster",166,53.360421,112.639579,-1
5240,2020-05-31,"Abingdon Green, Westminster",227,53.360421,173.639579,-1
...,...,...,...,...,...,...
1044429,2020-05-31,"Queensway, Kensington Gardens",207,92.434343,114.565657,-1
1044442,2020-06-13,"Queensway, Kensington Gardens",520,92.434343,427.565657,-1
1044443,2020-06-14,"Queensway, Kensington Gardens",531,92.434343,438.565657,-1
1044452,2020-06-23,"Queensway, Kensington Gardens",224,92.434343,131.565657,-1


In [34]:
# Filter and return the DataFrame of anomalies with negative rides_diff and anomaly label = -1
negative_anomalies = rides_per_day[(rides_per_day['rides_diff'] < 0) & (rides_per_day['anomaly_label'] == -1)]
negative_anomalies

Unnamed: 0,end_rental_date_time,start_station_name,rides_count,average_rides,rides_diff,anomaly_label
17382,2017-01-01,"Albert Gate, Hyde Park",38,159.143830,-121.143830,-1
17383,2017-01-02,"Albert Gate, Hyde Park",39,159.143830,-120.143830,-1
17384,2017-01-03,"Albert Gate, Hyde Park",53,159.143830,-106.143830,-1
17387,2017-01-06,"Albert Gate, Hyde Park",59,159.143830,-100.143830,-1
17391,2017-01-10,"Albert Gate, Hyde Park",60,159.143830,-99.143830,-1
...,...,...,...,...,...,...
1043129,2020-04-21,"Westminster Pier, Westminster",13,107.452888,-94.452888,-1
1043136,2020-04-28,"Westminster Pier, Westminster",11,107.452888,-96.452888,-1
1043137,2020-04-29,"Westminster Pier, Westminster",19,107.452888,-88.452888,-1
1044367,2020-03-30,"Queensway, Kensington Gardens",3,92.434343,-89.434343,-1
