# Sequential Pattern Mining
## PrefixSpan Algorithm
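PrefixSpan mines frequent sequential patterns with a depth-first search (**DFS**): for each frequent prefix it projects the sequence database (SDB) onto that prefix and then **`recursively`** mines the resulting projected database.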

In [1]:
import pandas as pd
import numpy as np
import sys
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date,timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import psycopg2
import pandas.io.sql as psql
from sqlalchemy import create_engine
import sshtunnel 
import psycopg2.extras
import csv
import re

### 1. Reading and Processing the Dataset

In [2]:
def get_data_regularity():
    """Fetch every row of ``regularity_trip_timegroups_results`` as a DataFrame.

    Opens an SSH tunnel, connects to the PostgreSQL database through the
    tunnel's local port and runs a single ``select *`` ordered by
    route_short_name, stop_id, direction_id and from_date_time.

    Returns:
        A ``pandas.DataFrame`` whose column names are taken from the cursor
        description, or ``None`` when any step fails (the exception is
        printed, not re-raised).
    """
    # SQL statement for regularity_trip_timegroups_results
    sql = """select *
from regularity_trip_timegroups_results
order by route_short_name, stop_id, direction_id, from_date_time"""

    # NOTE(review): host, key path and database password are hard-coded
    # below; consider moving them to configuration or environment variables.
    try:
        sshtunnel.SSH_TIMEOUT = 10.0
        with sshtunnel.SSHTunnelForwarder(
                ('3.83.88.154', 22),
                ssh_username='ec2-user',
                # CHANGE PATH TO rds-example.cer
                ssh_pkey='/Users/lauraforerocamacho/Downloads/rds-example.cer',
                remote_bind_address=('stib-mivb.cjo3cnpt9pci.us-east-1.rds.amazonaws.com', 5432)
        ) as tunnel:
            conn = psycopg2.connect(
                host='127.0.0.1',
                port=tunnel.local_bind_port,
                database="stibmivbdb",
                user="postgres",
                password="stib-mivb-db1")
            try:
                # The cursor context manager closes the cursor even if the
                # query or the fetch raises.
                with conn.cursor() as cur:
                    cur.execute(sql)
                    field_names = [column[0] for column in cur.description]
                    return pd.DataFrame(cur.fetchall(), columns=field_names)
            finally:
                # Close the connection while the tunnel is still open so it
                # is not leaked on either the success or the failure path.
                conn.close()
    except Exception as e:
        # Best-effort behaviour kept from the original: report and return None.
        print(e)
        return None

In [3]:
def get_defined_intervals():
    """Fetch every row of ``defined_intervals`` as a DataFrame.

    Opens an SSH tunnel, connects to the PostgreSQL database through the
    tunnel's local port and runs a single ``select *`` ordered by from_time.

    Returns:
        A ``pandas.DataFrame`` whose column names are taken from the cursor
        description, or ``None`` when any step fails (the exception is
        printed, not re-raised).
    """
    # SQL statement for defined_intervals
    sql = """select *
from defined_intervals
order by from_time"""

    # NOTE(review): host, key path and database password are hard-coded
    # below; consider moving them to configuration or environment variables.
    try:
        sshtunnel.SSH_TIMEOUT = 10.0
        with sshtunnel.SSHTunnelForwarder(
                ('3.83.88.154', 22),
                ssh_username='ec2-user',
                # CHANGE PATH TO rds-example.cer
                ssh_pkey='/Users/lauraforerocamacho/Downloads/rds-example.cer',
                remote_bind_address=('stib-mivb.cjo3cnpt9pci.us-east-1.rds.amazonaws.com', 5432)
        ) as tunnel:
            conn = psycopg2.connect(
                host='127.0.0.1',
                port=tunnel.local_bind_port,
                database="stibmivbdb",
                user="postgres",
                password="stib-mivb-db1")
            try:
                # The cursor context manager closes the cursor even if the
                # query or the fetch raises.
                with conn.cursor() as cur:
                    cur.execute(sql)
                    field_names = [column[0] for column in cur.description]
                    return pd.DataFrame(cur.fetchall(), columns=field_names)
            finally:
                # Close the connection while the tunnel is still open so it
                # is not leaked on either the success or the failure path.
                conn.close()
    except Exception as e:
        # Best-effort behaviour kept from the original: report and return None.
        print(e)
        return None

In [4]:
# Load the regularity results and discard every row with a missing value.
data = get_data_regularity().dropna()

# Load the interval definitions and give them an empty "data" column.
intervals = get_defined_intervals()
intervals["data"] = None

# Cast ewt to int and stop_id to str in one step.
data = data.astype({"ewt": int, "stop_id": str})

# Parse both boundaries of each row's validity range as datetimes.
for boundary in ("from_date_time", "to_date_time"):
    data[boundary] = pd.to_datetime(data[boundary])

In [5]:
# Preview the first rows of the cleaned regularity data (notebook display only).
data.head()

Unnamed: 0,route_id,route_short_name,stop_id,direction_id,from_date_time,to_date_time,from_time,to_time,from_date,to_date,...,thursday,friday,saturday,sunday,route_type,waiting_time_real,waiting_time_scheduled,ewt,ewt_minutes,ewt_category
5,2,1,8011,1,2021-09-01 05:48:57,2021-09-17 20:24:07,05:48:57,20:24:07,2021-09-01,2021-09-17,...,1,1,0,0,1,232.871609,39.160125,193,3.228525,0-4MIN
6,2,1,8011,1,2021-09-01 20:34:07,2021-09-17 23:48:53,20:34:07,23:48:53,2021-09-01,2021-09-17,...,1,1,0,0,1,410.917755,145.023148,265,4.431577,4-8MIN
7,2,1,8011,1,2021-09-04 06:44:23,2021-09-18 23:48:53,06:44:23,23:48:53,2021-09-04,2021-09-18,...,0,0,1,0,1,259.528527,57.318244,202,3.370171,0-4MIN
8,2,1,8011,1,2021-09-12 08:42:04,2021-09-12 23:48:53,08:42:04,23:48:53,2021-09-12,2021-09-12,...,0,0,1,0,1,286.690589,54.203186,232,3.87479,0-4MIN
9,2,1,8011,1,2021-09-19 06:44:23,2021-09-19 23:48:53,06:44:23,23:48:53,2021-09-19,2021-09-19,...,0,0,1,0,1,308.359592,43.434227,264,4.415423,4-8MIN


In [6]:
# Expand `data` so that every row is repeated once per defined interval it
# covers. Each emitted row is tagged with the interval start ("inter") and the
# day of the month of that start ("inter_day").
data["inter"] = None
data["inter_day"] = None

# Column order of the rebuilt frame, captured after the two columns are added.
data_final_col = data.columns

# Loop-invariant parts of the filter: computed once here instead of once per
# interval, since `data` is not modified inside the loop.
row_from_time = data["from_date_time"].dt.time
row_to_time = data["to_date_time"].dt.time
row_from_date = data["from_date_time"].dt.date
row_to_date = data["to_date_time"].dt.date
has_positive_ewt = data["ewt"] > 0

data_final = []
for c_interval in intervals.to_dict(orient="records"):
    interval_start = c_interval["from_time"]
    interval_end = c_interval["to_time"]

    # A row matches when its time-of-day window and its date range both
    # enclose the interval, and its ewt is strictly positive.
    find_elements_range = data[
        (row_from_time <= interval_start.time())
        & (interval_end.time() <= row_to_time)
        & (row_from_date <= interval_start.date())
        & (interval_end.date() <= row_to_date)
        & has_positive_ewt
    ]

    # to_dict(orient="records") yields a fresh dict per row, so each record
    # can be tagged in place without affecting `data`.
    for record in find_elements_range.to_dict(orient="records"):
        record["inter_day"] = interval_start.date().day
        record["inter"] = interval_start
        data_final.append(record)

data = pd.DataFrame(data_final, columns=data_final_col)

In [None]:
# Keep only the rows whose ewt is strictly positive.
data = data.loc[data["ewt"] > 0]

In [None]:
# Reduce the frame to the columns that are exported, in this order.
selected_columns = ["route_id", "stop_id", "inter", "inter_day", "ewt"]
data = data[selected_columns]

In [None]:
# Write the prepared data to data_final.csv in the current working directory.
# No `index` argument is passed, so pandas' default index handling applies.
data.to_csv("./data_final.csv")