### Create Coefficients for each Segment

In [29]:
import pandas as pd
from sklearn import linear_model

In [30]:
# Connect to database

from sqlalchemy import create_engine

URI="localhost"
PORT="5433"
DB = "jetaDb"
USER = "postgres"
PASSWORD = "00001234"
  
engine = create_engine("postgresql://{}:{}@{}:{}/{}".format(USER, PASSWORD, URI, PORT, DB), echo=True)

In [31]:
# Making the dataframe that will hold the coefficients for each segment

Coefficients = pd.DataFrame(columns=['segment', 'intercept', 'arrivaltime', 'rain', 
                                    'fri', 'mon', 'sat', 'sun', 'thu', 'tue'])

In [32]:
# Lists to hold the data that will be added to our coefficients dataframe
segment = []
intercept = []
arrivaltime = []
rain = []
holiday = []
fri = []
mon = []
sat = []
sun = []
thu = []
tue = []
wed = []

In [33]:
import os

# Create list of all files in segment directory
segment_files = os.listdir("/home/isaac/Data/SegmentedSamples2_actual_final/")

# Remove .csv from file names
segment_files = [file[:-4] for file in segment_files]

In [34]:
# Column names for the files

colnames = ['segments', 'arrivaltime', 'dayofweek', 'rain', 'holiday', 
            'citycenter', 'distance', 'kalman_time', 'traveltime']

In [35]:
# Running a loop through segments and learning simple linear regression model through each segment

count = 0
hundreds = 0

for file in segment_files:
    fname = "/home/isaac/Data/SegmentedSamples2_actual_final/{}.csv".format(file)
    
    if os.path.isfile(fname):
        # Read the file in a dataframe
        df = pd.read_csv(fname, names = colnames,  delimiter=',')
        
        df.drop(['segments', 'citycenter', 'distance', 'kalman_time'], axis=1, inplace=True)
        
        # Converting the datatypes
        df['dayofweek'] = df['dayofweek'].astype('category')
        
        # Rain had missing values
        df['rain'].fillna(df['rain'].mean(), inplace=True)
        
        # Split the dataset into independent and dependent features
        df_X = df[['arrivaltime', 'dayofweek', 'rain', 'holiday']]
        df_y = df['traveltime']
        
        # Dummify day of week
        weekday_dummies = pd.get_dummies(df_X.dayofweek, prefix='dayofweek')
        
        # Check that segment has each day of the week
        if weekday_dummies.shape[1] != 7:
            #print(file, "has", weekday_dummies.shape[1], "weekdays. Ignoring segment...")
            continue
        
        df_X_dummied = pd.concat([df_X,weekday_dummies],axis=1)
        df_X_dummied = df_X_dummied.drop('dayofweek',axis=1)
        df_X_dummied = df_X_dummied.drop('dayofweek_Wednesday',axis=1) # To avoid dummy variable trap
        
        lm = linear_model.LinearRegression()
        lm.fit(df_X_dummied, df_y)
        
        # Writing the coefficients in the lists
        segment.append(file)
        intercept.append(lm.intercept_)
        arrivaltime.append(lm.coef_[0])
        rain.append(lm.coef_[1])
        holiday.append(lm.coef_[2])
        fri.append(lm.coef_[3])
        mon.append(lm.coef_[4])
        sat.append(lm.coef_[5])
        sun.append(lm.coef_[6])
        thu.append(lm.coef_[7])
        tue.append(lm.coef_[8])
    
    else:
        print("File: {}, not found".format(fname))
        continue
    
    count += 1  
    if count >= 100:
        hundreds += 100
        print(hundreds, end=", ")
        count = 0        
 

100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 

In [36]:
# Convert lists into dataframes
Coefficients['segment'] = segment
Coefficients['intercept'] = intercept
Coefficients['arrivaltime'] = arrivaltime
Coefficients['rain'] = rain
Coefficients['holiday'] = holiday
Coefficients['fri'] = fri
Coefficients['mon'] = mon
Coefficients['sat'] = sat
Coefficients['sun'] = sun
Coefficients['thu'] = thu
Coefficients['tue'] = tue

In [37]:
Coefficients.head(5)

Unnamed: 0,segment,intercept,arrivaltime,rain,fri,mon,sat,sun,thu,tue,holiday
0,1601_1602,50.87603,6.4e-05,-0.430727,1.710725,0.924812,-2.421589,-4.185323,0.108956,0.144921,-0.707968
1,4002_5112,39.283956,2.6e-05,-0.792347,0.942639,-0.404349,-0.79538,7.805118,1.232046,-0.727879,-0.305514
2,1055_2868,80.289912,-0.000356,2.518226,-1.338574,-3.060299,-6.565507,-4.92449,-0.107228,0.074781,-4.131012
3,941_942,36.412443,2.6e-05,0.384811,-0.134059,0.09174,0.247383,0.573497,-0.490828,-0.082765,-0.736437
4,7318_4980,78.228207,-6.7e-05,6.444353,-7.643289,-2.968056,-24.647044,-22.616229,-2.027877,-1.275068,-5.622712


In [38]:
# Write coefficients to database table

Coefficients.rename(columns={'Segment': 'segment'}, inplace=True)

Coefficients.to_sql('main_coefficients', engine, if_exists='append', index=False)

2018-08-14 13:13:43,465 INFO sqlalchemy.engine.base.Engine select version()
2018-08-14 13:13:43,468 INFO sqlalchemy.engine.base.Engine {}
2018-08-14 13:13:43,478 INFO sqlalchemy.engine.base.Engine select current_schema()
2018-08-14 13:13:43,480 INFO sqlalchemy.engine.base.Engine {}
2018-08-14 13:13:43,490 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2018-08-14 13:13:43,493 INFO sqlalchemy.engine.base.Engine {}
2018-08-14 13:13:43,499 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2018-08-14 13:13:43,501 INFO sqlalchemy.engine.base.Engine {}
2018-08-14 13:13:43,508 INFO sqlalchemy.engine.base.Engine show standard_conforming_strings
2018-08-14 13:13:43,509 INFO sqlalchemy.engine.base.Engine {}
2018-08-14 13:13:43,522 INFO sqlalchemy.engine.base.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
20