# Imports and Functions

In [5]:
import pandas as pd

# Trick Data

In [6]:
# Read the full Citi Bike extract, keep a 100,000-row sample of it, and then
# drop the full frame so it no longer occupies kernel memory.
df_cb = pd.read_feather("data/citibike_data_201907-201909.feather")
df_data = df_cb.sample(10**5, random_state=0)  # fixed seed -> same sample on every run

# Capture the shape before deleting the frame: a bare `df_cb.shape` in the
# middle of a cell is evaluated but never displayed, because only the last
# expression of a cell is echoed.
df_cb_shape = df_cb.shape
del df_cb
df_cb_shape

(6970188, 15)

In [53]:
# Read data/citibike_trick_data.feather and swap its index for a fresh
# 0..n-1 range, so the integer labels used by the .loc slices further down
# line up with row positions.
df_data = pd.read_feather("data/citibike_trick_data.feather").reset_index(drop=True)

In [13]:
# Display the column labels of the working frame.
df_data.columns

Index(['tripduration', 'starttime', 'stoptime', 'start_station_id',
       'start_station_name', 'start_station_latitude',
       'start_station_longitude', 'end_station_id', 'end_station_name',
       'end_station_latitude', 'end_station_longitude', 'bikeid', 'usertype',
       'birth_year', 'gender'],
      dtype='object')

## 1. Negative trip duration times

In [54]:
# Force the trip duration negative for every row label up to and including 50
# (.loc label slices include the end label).
negative_rows = slice(None, 50)
df_data.loc[negative_rows, "tripduration"] = (
    df_data.loc[negative_rows, "tripduration"].abs().mul(-1)
)

## 2. Trip duration that does not match the difference between starttime and stoptime

In [55]:
# Shuffle the durations among rows 50-137 (both endpoints included) so they no
# longer correspond to each row's own start/stop times.
shuffled_durations = df_data.loc[50:137, "tripduration"].sample(frac=1, random_state=1)
# .values strips the index; assigning the Series itself would be re-aligned by
# label and undo the shuffle.
df_data.loc[50:137, "tripduration"] = shuffled_durations.values

## 3. Switching start and stop times

In [59]:
# Exchange the start and stop times of rows 137-259 (both endpoints included).
# Both columns are copied out as plain lists first, so the two writes below do
# not depend on each other and assignment is positional rather than by label.
swapped_rows = slice(137, 259)
starttimes = df_data.loc[swapped_rows, "starttime"].tolist()
stoptimes = df_data.loc[swapped_rows, "stoptime"].tolist()
df_data.loc[swapped_rows, "stoptime"] = starttimes
df_data.loc[swapped_rows, "starttime"] = stoptimes

## 4. Start time is the same as the stop time

In [63]:
# Copy each start time over the stop time for rows 259-300 (both endpoints
# included), so those trips start and stop at the same instant.
zero_length_rows = slice(259, 300)
df_data.loc[zero_length_rows, "stoptime"] = df_data.loc[zero_length_rows, "starttime"]

## 5. Start and stop times are out of the range of the rest of the data

In [69]:
# For rows 300-500 (both endpoints included), rewrite the "2019-08" part of
# the stop and start timestamps to "2019-01". Rows whose timestamps do not
# contain "2019-08" are left unchanged.
for time_col in ("stoptime", "starttime"):
    df_data.loc[300:500, time_col] = (
        df_data.loc[300:500, time_col].str.replace("2019-08", "2019-01")
    )

## 6. Invalid Stop times

In [88]:
# Corrupt the stop times of rows 501-621 (both endpoints included): where the
# text after the date starts with "1<digit>", that leading 1 becomes 2. Hours
# 10-13 therefore turn into the still-valid 20-23, while 14-19 turn into the
# impossible 24-29.
# regex=True must be passed explicitly: since pandas 2.0 Series.str.replace
# defaults to regex=False, which treats the pattern as a literal string and
# would silently replace nothing.
df_data.loc[501:621, "stoptime"] = df_data.loc[501:621, "stoptime"].str.replace(
    r"(2019-0\d-\d\d) 1(\d)", r"\1 2\2", regex=True
)

## 7. Move some starting and ending locations into the Hudson

In [97]:
# Point the start station of rows 621-641 and the end station of rows 641-664
# (both endpoints included in each slice) at a made-up "Hudson River" station.
hudson_station = {
    "id": 20000,
    "name": "Hudson River",
    "latitude": 40.800612,
    "longitude": -73.980261,
}
for side, rows in (("start", slice(621, 641)), ("end", slice(641, 664))):
    # Writes <side>_station_id, _name, _latitude, _longitude, in that order.
    for field, value in hudson_station.items():
        df_data.loc[rows, f"{side}_station_{field}"] = value

## 8. Change some of the station names to various versions of null

In [102]:
# Blank out station names with two different null-like placeholders: the empty
# string and the literal text "None". Each slice includes both endpoints.
null_like_names = (
    (slice(665, 775), "start_station_name", ""),
    (slice(775, 850), "end_station_name", ""),
    (slice(851, 920), "start_station_name", "None"),
    (slice(921, 998), "end_station_name", "None"),
)
for rows, name_col, placeholder in null_like_names:
    df_data.loc[rows, name_col] = placeholder

## 9. Make the same bike appear in multiple places at the same time

In [133]:
# Copy row 999's start time and bike id onto rows 1000-1050 (both endpoints
# included), so one bike starts many trips at the same moment.
for copied_col in ("starttime", "bikeid"):
    df_data.loc[1000:1050, copied_col] = df_data.loc[999, copied_col]

## 10. Give one person a different gender category

In [139]:
# Give a single row (label 1051) the gender code 12.
odd_gender_row, odd_gender_code = 1051, 12
df_data.loc[odd_gender_row, "gender"] = odd_gender_code

## 11. Change some of the bike IDs to names

In [140]:
# Replace the bike id with a name on rows 1052-1060 and 1061-1080 (both
# endpoints included in each slice).
# Cast the column to object first. Assuming bikeid is a numeric column (TODO
# confirm), writing strings into it relies on silent upcasting, which pandas
# deprecated in 2.1 (PDEP-6, FutureWarning). The explicit cast yields the same
# mixed int/str column without depending on that behaviour.
df_data["bikeid"] = df_data["bikeid"].astype(object)
df_data.loc[1052:1060, "bikeid"] = "grover"
df_data.loc[1061:1080, "bikeid"] = "l0l0"

# Re-Shuffle the data

In [142]:
# Shuffle every row with a fixed seed and renumber the index, so the
# modified rows are no longer grouped in contiguous label ranges.
df_data_shuff = (
    df_data
    .sample(frac=1, random_state=2)
    .reset_index(drop=True)
)

In [145]:
# Write the shuffled frame to CSV. With to_csv's default index=True the row
# index is written too, as the first, unnamed column.
df_data_shuff.to_csv(path_or_buf="../downloads/citibike_data.csv")

In [11]:
# Save df_data, with a fresh 0..n-1 index, to the same feather file that the
# earlier `pd.read_feather("data/citibike_trick_data.feather")` cell loads.
# NOTE(review): this cell's execution count (In [11]) is lower than those of the
# modification cells, so it looks like it was originally run on the unmodified
# sample. Run in notebook order it would overwrite that base file with the
# already-modified frame -- confirm the intended position before using Run All.
# NOTE(review): by this point bikeid holds both numbers and strings; verify that
# to_feather accepts such a mixed column.
df_data.reset_index(drop=True).to_feather("data/citibike_trick_data.feather")