### Create Coefficients for each Segment

In [34]:
import pandas as pd
from sklearn import linear_model

In [35]:
# Connect to database

from sqlalchemy import create_engine

URI="localhost"
PORT="5433"
DB = "jetaDb"
USER = "postgres"
PASSWORD = "00001234"
  
engine = create_engine("postgresql://{}:{}@{}:{}/{}".format(USER, PASSWORD, URI, PORT, DB), echo=True)

In [36]:
# Making the dataframe that will hold the coefficients for each segment

Coefficients = pd.DataFrame(columns=['segment', 'intercept', 'arrivaltime', 'rain', 'holiday', 
                                    'mon', 'tue', 'thu', 'fri', 'sat', 'sun',])

In [37]:
# Lists to hold the data that will be added to our coefficients dataframe
segment = []
intercept = []
arrivaltime = []
rain = []
holiday = []
fri = []
mon = []
sat = []
sun = []
thu = []
tue = []
wed = []

In [38]:
# Specify directory containing segment files

segments_dir = "/home/isaac/Data/Segments_15th_Aug/"

In [39]:
import os

# Create list of all files in segment directory
segment_files = os.listdir(segments_dir)

# Remove .csv from file names
segment_files = [file[:-4] for file in segment_files]

In [40]:
# Column names for the files

colnames = ['segments', 'arrivaltime', 'dayofweek', 'rain', 'holiday', 'traveltime']

In [41]:
# Running a loop through segments and learning simple linear regression model through each segment

count = 0
hundreds = 0

for file in segment_files:
    filename = segments_dir + "{}.csv".format(file)
    
    if os.path.isfile(filename):
        # Read the file in a dataframe
        df = pd.read_csv(filename, names = colnames,  delimiter=',')
        
        # Converting the datatypes
        df['dayofweek'] = df['dayofweek'].astype('category')
        
        # Rain had missing values
        df['rain'].fillna(df['rain'].mean(), inplace=True)
        
        # Split the dataset into independent and dependent features
        df_X = df[['arrivaltime', 'dayofweek', 'rain', 'holiday']]
        df_y = df['traveltime']
        
        # Dummify day of week
        weekday_dummies = pd.get_dummies(df_X.dayofweek, prefix='dayofweek')
        
        # Check that segment has each day of the week
        if weekday_dummies.shape[1] != 7:
            #print(file, "has", weekday_dummies.shape[1], "weekdays. Ignoring segment...")
            continue
        
        df_X_dummied = pd.concat([df_X, weekday_dummies], axis = 1)
        df_X_dummied = df_X_dummied.drop('dayofweek', axis = 1)
        df_X_dummied = df_X_dummied.drop('dayofweek_Wednesday', axis = 1) # To avoid dummy variable trap
        
        lm = linear_model.LinearRegression()
        try:
            lm.fit(df_X_dummied, df_y)
        except:
            print("Couldn't fit model for: ", filename)
            continue
        
        # Writing the coefficients in the lists
        segment.append(file)
        intercept.append(lm.intercept_)
        arrivaltime.append(lm.coef_[0])
        rain.append(lm.coef_[1])
        holiday.append(lm.coef_[2])
        fri.append(lm.coef_[3])
        mon.append(lm.coef_[4])
        sat.append(lm.coef_[5])
        sun.append(lm.coef_[6])
        thu.append(lm.coef_[7])
        tue.append(lm.coef_[8])
    
    else:
        print("File: {}, not found".format(filename))
        continue
    
    count += 1  
    if count >= 100:
        hundreds += 100
        print(hundreds, end=", ")
        count = 0        
 

100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 7800, 7900, 8000, 

In [42]:
# Convert lists into dataframes
Coefficients['segment'] = segment
Coefficients['intercept'] = intercept
Coefficients['arrivaltime'] = arrivaltime
Coefficients['rain'] = rain
Coefficients['holiday'] = holiday
Coefficients['fri'] = fri
Coefficients['mon'] = mon
Coefficients['sat'] = sat
Coefficients['sun'] = sun
Coefficients['thu'] = thu
Coefficients['tue'] = tue

In [43]:
Coefficients.head(5)

Unnamed: 0,segment,intercept,arrivaltime,rain,holiday,mon,tue,thu,fri,sat,sun
0,7183_4880,78.560115,-0.000284,-0.313564,-2.600906,-1.324411,-0.083571,-0.971274,-0.384542,-4.189973,-9.666895
1,3155_3156,30.662306,-0.000139,0.482527,-1.690592,0.145461,1.655619,0.90218,1.201202,-1.948165,-1.670823
2,1845_1847,132.75458,-0.000385,53.237907,-13.758668,2.244599,-6.940758,1.265589,12.345858,4.940886,17.393461
3,1601_1602,50.855529,6.8e-05,-0.814369,0.035784,0.793669,-0.8243,-0.052062,0.361737,-2.717113,-4.477299
4,4002_5112,38.290251,2e-05,-1.122892,0.35359,0.796137,1.781789,2.034758,3.582442,-0.084879,5.07358


In [44]:
Coefficients.loc[Coefficients['segment'] == '4323_4324']

Unnamed: 0,segment,intercept,arrivaltime,rain,holiday,mon,tue,thu,fri,sat,sun
21,4323_4324,287.873264,-0.000168,0.379277,-12.152546,0.300354,-1.715763,2.941781,2.436404,-16.178807,-12.382918


In [46]:
# Write coefficients to database table

Coefficients.to_sql('main_coefficients', engine, if_exists='append', index=False)

2018-08-15 12:49:22,640 INFO sqlalchemy.engine.base.Engine select version()
2018-08-15 12:49:22,643 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 12:49:22,656 INFO sqlalchemy.engine.base.Engine select current_schema()
2018-08-15 12:49:22,658 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 12:49:22,665 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2018-08-15 12:49:22,666 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 12:49:22,673 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2018-08-15 12:49:22,674 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 12:49:22,680 INFO sqlalchemy.engine.base.Engine show standard_conforming_strings
2018-08-15 12:49:22,681 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 12:49:22,688 INFO sqlalchemy.engine.base.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
20