### Create Coefficients for each Segment

In [1]:
import pandas as pd
from sklearn import linear_model

In [2]:
# Connect to the PostgreSQL database that receives the coefficient table.
#
# Every setting can be overridden with an environment variable. The fallbacks
# are the values this notebook has always used, so an unconfigured run connects
# exactly as before.

import os
from urllib.parse import quote

from sqlalchemy import create_engine

URI = os.environ.get("JETA_DB_HOST", "localhost")
PORT = os.environ.get("JETA_DB_PORT", "5433")
DB = os.environ.get("JETA_DB_NAME", "jetaDb")
USER = os.environ.get("JETA_DB_USER", "postgres")
# NOTE(review): a password literal should not live in a shared notebook. Set
# JETA_DB_PASSWORD everywhere this runs, then delete the fallback value.
PASSWORD = os.environ.get("JETA_DB_PASSWORD", "00001234")

# Percent-encode the credentials so that characters such as "@", ":" or "/"
# cannot be misread as URL delimiters.
engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        quote(USER, safe=""), quote(PASSWORD, safe=""), URI, PORT, DB
    ),
    echo=True,  # log the SQL statements issued through this engine
)


In [3]:
# Empty dataframe that fixes the column layout of the coefficient table:
# segment id, intercept, the three non-weekday features, then six weekday
# columns (there is no column for Wednesday).

Coefficients = pd.DataFrame(
    columns=[
        'segment',
        'intercept',
        'arrivaltime',
        'rain',
        'holiday',
        'mon',
        'tue',
        'thu',
        'fri',
        'sat',
        'sun',
    ]
)

In [4]:
# Accumulators for the values that end up in the coefficients dataframe.
# Each name is bound to its own, initially empty, list.
segment, intercept = [], []
arrivaltime, rain, holiday = [], [], []
# Weekday accumulators; `wed` is created here but nothing in the visible cells
# appends to it.
fri, mon, sat, sun, thu, tue, wed = ([] for _ in range(7))

In [5]:
import os

# Build the list of segment names: one per CSV file in the segment directory,
# with the ".csv" extension removed.
#
# Sorting makes the processing order repeatable, because os.listdir returns
# entries in arbitrary order.
segment_files = sorted(
    os.path.splitext(entry)[0]
    for entry in os.listdir("/home/isaac/Data/SegmentedSamples2_actual_final/")
    # Keep only real ".csv" entries: cutting the last four characters off every
    # entry would turn any other file or sub-directory into a bogus name.
    if entry.endswith(".csv")
)

In [None]:
# Names for the nine columns of a segment file, in file order.
# The list is handed to the CSV reader as the header.

colnames = [
    'segments',
    'arrivaltime',
    'dayofweek',
    'rain',
    'holiday',
    'citycenter',
    'distance',
    'kalman_time',
    'traveltime',
]

In [None]:
# Fit one linear regression per segment and record its coefficients.
#
# For every segment file, traveltime is regressed on arrivaltime, rain, holiday
# and six weekday dummy columns. Wednesday is left out as the reference level
# to avoid the dummy variable trap. A segment is skipped unless its data
# contains all seven weekdays.

# Weekday dummy column -> list that collects its coefficient. Looking the
# coefficients up by column name keeps each one attached to the right weekday
# regardless of the order in which the dummy columns are produced.
weekday_accumulators = {
    'dayofweek_Friday': fri,
    'dayofweek_Monday': mon,
    'dayofweek_Saturday': sat,
    'dayofweek_Sunday': sun,
    'dayofweek_Thursday': thu,
    'dayofweek_Tuesday': tue,
}
reference_weekday = 'dayofweek_Wednesday'
expected_dummies = set(weekday_accumulators) | {reference_weekday}

# Empty the accumulators first, so that re-running this cell does not append a
# second copy of every segment.
for accumulator in (segment, intercept, arrivaltime, rain, holiday,
                    fri, mon, sat, sun, thu, tue):
    accumulator.clear()

fitted = 0  # number of segments fitted so far, used for the progress output

for file in segment_files:
    fname = "/home/isaac/Data/SegmentedSamples2_actual_final/{}.csv".format(file)

    if not os.path.isfile(fname):
        print("File: {}, not found".format(fname))
        continue

    # Read the file in a dataframe
    df = pd.read_csv(fname, names=colnames, delimiter=',')

    # Rain had missing values: replace them with the mean rain of the segment.
    # The result is assigned back to the column; calling fillna(inplace=True)
    # on a column selection does not update df under pandas copy-on-write.
    df['rain'] = df['rain'].fillna(df['rain'].mean())

    # Dummify day of week
    weekday_dummies = pd.get_dummies(df['dayofweek'], prefix='dayofweek')

    # Check that segment has each day of the week
    if set(weekday_dummies.columns) != expected_dummies:
        continue

    # Independent features: the three plain columns plus six weekday dummies
    df_X_dummied = pd.concat(
        [
            df[['arrivaltime', 'rain', 'holiday']],
            weekday_dummies.drop(columns=reference_weekday),
        ],
        axis=1,
    )
    df_y = df['traveltime']

    lm = linear_model.LinearRegression()
    lm.fit(df_X_dummied, df_y)

    # Pair every coefficient with the name of the column it belongs to
    coefficient_of = dict(zip(df_X_dummied.columns, lm.coef_))

    # Writing the coefficients in the lists
    segment.append(file)
    intercept.append(lm.intercept_)
    arrivaltime.append(coefficient_of['arrivaltime'])
    rain.append(coefficient_of['rain'])
    holiday.append(coefficient_of['holiday'])
    for column, accumulator in weekday_accumulators.items():
        accumulator.append(coefficient_of[column])

    # Progress: print a running total after every 100 fitted segments
    fitted += 1
    if fitted % 100 == 0:
        print(fitted, end=", ")
 

100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 

In [None]:
# Convert the per-segment lists into the coefficients dataframe.
#
# The frame is built afresh from the lists instead of being filled column by
# column: assigning a list to a column of a frame that already holds rows
# raises when the lengths differ, which made this cell fail on a re-run.
# The keys are listed in the column order of the coefficient table.
Coefficients = pd.DataFrame(
    {
        'segment': segment,
        'intercept': intercept,
        'arrivaltime': arrivaltime,
        'rain': rain,
        'holiday': holiday,
        'mon': mon,
        'tue': tue,
        'thu': thu,
        'fri': fri,
        'sat': sat,
        'sun': sun,
    }
)

In [None]:
# Preview the first five rows of the coefficient table
Coefficients.head()

In [None]:
# Show the row(s) holding the coefficients of segment 4323_4324
Coefficients[Coefficients['segment'].eq('4323_4324')]

In [None]:
# Store the coefficient rows in the main_coefficients database table.

# Make sure the segment column carries the lower-case name 'segment'.
Coefficients.rename({'Segment': 'segment'}, axis='columns', inplace=True)

# NOTE(review): if_exists='append' adds rows to an existing table, so running
# this cell twice is expected to insert every row twice - confirm that is
# intended.
Coefficients.to_sql(
    name='main_coefficients',
    con=engine,
    index=False,
    if_exists='append',
)