## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read base dataset

In [None]:
# Load the raw 2018 trip records; all later cells work on this frame.
data = pd.read_csv('Trips_2018.csv')

### Basic information

In [None]:
# Overview of the raw frame: column names, dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

### Convert data types

In [None]:
# Parse the trip timestamps and store the listed label/id columns as pandas categoricals.
date_columns = ["starttime", "stoptime"]
categorical_columns = ["gender", "usertype", "start_station_id", "end_station_id"]

for column in date_columns:
    data[column] = pd.to_datetime(data[column])

# astype('category') on a frame converts column by column, so each column
# keeps its own set of categories.
data[categorical_columns] = data[categorical_columns].astype('category')

### Drop unnecessary column

In [None]:
# 'Unnamed: 0' is presumably the row index written out when the CSV was exported
# (TODO confirm); it is not used anywhere below.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# (or a CSV saved without that column) raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Confirm the converted dtypes and that the 'Unnamed: 0' column is gone.
data.info()

In [None]:
# Preview the first five rows.
data.head()

In [None]:
# Order trips chronologically by start time and renumber the rows 0..n-1.
data = data.sort_values('starttime', ignore_index=True)

### Add date information

In [None]:
# Derive the hour of day and the calendar date for both ends of every trip:
# starttime -> pickup_*, stoptime -> dropoff_*.
for prefix, timestamp_column in (("pickup", "starttime"), ("dropoff", "stoptime")):
    timestamps = data[timestamp_column].dt
    data[f"{prefix}_hour"] = timestamps.hour
    data[f"{prefix}_day"] = timestamps.date

In [None]:
# Preview the first rows, now including the derived hour/day columns.
display(data.head())

### Extracting all distinct stations and cleaning

In [None]:
# Build one table of distinct stations from both ends of every trip.
def _distinct_stations(trips, prefix):
    """Return the unique (id, latitude, longitude) rows for one side of the trips.

    ``prefix`` is 'start' or 'end'; the ``<prefix>_station_*`` columns are renamed
    to the shared ``station_*`` names so both sides can be stacked.
    """
    renames = {
        f'{prefix}_station_{field}': f'station_{field}'
        for field in ('id', 'latitude', 'longitude')
    }
    return trips[list(renames)].drop_duplicates().rename(columns=renames)


stations_start = _distinct_stations(data, 'start')
stations_end = _distinct_stations(data, 'end')

# A station seen on both sides appears twice after stacking, hence the
# second de-duplication.
stations = (
    pd.concat([stations_start, stations_end])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Id lookups used by categorize_station.
start_stations_set = set(stations_start['station_id'])
end_stations_set = set(stations_end['station_id'])

def categorize_station(station_id):
    """Label a station by where it shows up in the trip data.

    Parameters
    ----------
    station_id : scalar
        Station identifier from the ``stations`` table; may be missing (NaN/None).

    Returns
    -------
    str
        'both'          - the id is in ``start_stations_set`` and ``end_stations_set``,
        'start'         - the id is only in ``start_stations_set``,
        'end'           - the id is only in ``end_stations_set``,
        'No station id' - the id is missing or is in neither set.
    """
    # Handle missing ids explicitly: whether NaN is "found" in a Python set
    # depends on object identity, so set membership is not a reliable NaN test.
    if pd.isna(station_id):
        return 'No station id'

    in_start = station_id in start_stations_set
    in_end = station_id in end_stations_set

    if in_start and in_end:
        return 'both'
    if in_start:
        return 'start'
    if in_end:
        return 'end'
    return 'No station id'  # id appears in neither set

# Tag every distinct station with the category returned by categorize_station.
station_categories = stations['station_id'].apply(categorize_station)
stations['category'] = station_categories

stations.info()

In [None]:
# Summary statistics of the station table; check the coordinate ranges for implausible values.
stations.describe()

In [None]:
# Stations at latitude 41 or above are set aside as outliers; their ids are
# reused later to filter the trips.
is_outlier = stations['station_latitude'] >= 41
outlier_stations = stations.loc[is_outlier]
outlier_stations.info()
outlier_stations.head()

In [None]:
# Apply the same latitude cut-off (< 41) to the combined, start and end station tables.
stations, stations_start, stations_end = (
    frame[frame['station_latitude'] < 41]
    for frame in (stations, stations_start, stations_end)
)
stations.describe()


In [None]:
# Map of all remaining stations, coloured by category.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=stations,
    x='station_longitude',
    y='station_latitude',
    hue='category',
    ax=ax,  # draw on the axes created above instead of relying on the current axes
)
plt.show()

The orange category seems suspicious: these points are categorized as 'No station id' because they do not have a station_id, and they form a very regular pattern.

In [None]:
# Plot only the entries labelled 'No station id' to inspect their layout.
no_id_stations = stations[stations['category'] == 'No station id']

fig, ax = plt.subplots(figsize=(5, 4))
sns.scatterplot(
    data=no_id_stations,
    x='station_longitude',
    y='station_latitude',
    hue='category',
    ax=ax,
)
plt.show()

In [None]:
# Keep only the stations that were assigned a real category.
has_station_id = stations['category'] != 'No station id'
stations = stations[has_station_id]

In [None]:
# Same map after the 'No station id' entries were removed.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=stations,
    x='station_longitude',
    y='station_latitude',
    hue='category',
    ax=ax,
)
plt.show()

In [None]:
# Row count and dtypes of the cleaned station table.
stations.info()

Let's use this new knowledge to exclude the trips that start or end at the outlier station, as well as those at the stations that have no station id and looked like "fake" data.

In [None]:
# Trips without a start or end station id cannot be tied to a station, so drop them.
data = data.dropna(subset=['start_station_id', 'end_station_id'])

# Remove every trip that starts or ends at one of the latitude-outlier stations.
# A single vectorized isin() pass replaces filtering (and copying) the whole
# trip frame once per outlier id.
outlier_ids = outlier_stations['station_id'].tolist()
touches_outlier = (
    data['start_station_id'].isin(outlier_ids)
    | data['end_station_id'].isin(outlier_ids)
)
data = data[~touches_outlier]

In [None]:
# Inspect the trip table after the rows with missing or outlier stations were removed.
data.info()

### Saving CSVs 

In [None]:
# Write the cleaned trips and stations next to the notebook, without the row index.
for frame, csv_path in (
    (data, 'processed_trips_2018.csv'),
    (stations, 'processed_stations_2018.csv'),
):
    frame.to_csv(csv_path, index=False)