# IMPORTS

In [1]:
%reload_ext autoreload
%autoreload 2
# GENERAL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd

# LOADING DATA
import requests
import os
import shutil
from io import BytesIO
import osmium
import fiona
import json


# GEOSPATIAL DATA
from shapely.geometry import Point, LineString, MultiPoint
from shapely.geometry import shape 


# PLOTTING DATA
from folium import Map, CircleMarker, Vega, Popup, Marker, PolyLine, Icon, Choropleth, LayerControl
from folium.plugins import MarkerCluster, HeatMap, BeautifyIcon
import shapely
import matplotlib
from ipywidgets import interact
import seaborn as sns

# STATS
import math
import stats

# ML
import scipy
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler



# Preprocessing 

## database n.1 : ROAD RISK ~ Shape file

The first database we are cleaning is a shape file containing segments of road in Porto and Lisbon. 
In the next steps we are preparing the dataset for the future analysis.

### Extracting Data

In [2]:
# link = 'https://wdl-data.fra1.digitaloceanspaces.com/pse/m_risk_prfile.zip'
# s = requests.get(link).content

In [3]:
def first_df(path):
    geo = gpd.read_file(path)
    df = pd.DataFrame(geo).drop(columns='Link_ID')
    df_ren = df.rename(columns={
                        'Daily_Aver':'Daily_Average_Traffic_Intensity',
                        'Average_Ve':'Average_Velocity_of_Vehicle_Traffic',
                        'Median_of_':'Median_of_velocity_of_Vehicle_Traffic',
                        'First_Quar': 'FirstQuartil_of_velocity_of_Vehicle_Traffic',
                        'Third_Quar': 'ThirdQuartil_of_velocity_of_Vehicle_Traffic'
                    })
    return df_ren
    
df = first_df('wdl_data/m_risk_prfile.geojson') 

- Now we have a dataset containing the same columns but renamed. 
- We also drop the 'Link_ID' column as contain the unique id of the streat, information already present in linkid column

To have a better understanding on our data and to avoid errors during our analysis we need to investigate it with general statistics

### Remove outliers: 

In [4]:
df.describe()

Unnamed: 0,linkid,Daily_Average_Traffic_Intensity,Average_Velocity_of_Vehicle_Traffic,Median_of_velocity_of_Vehicle_Traffic,FirstQuartil_of_velocity_of_Vehicle_Traffic,ThirdQuartil_of_velocity_of_Vehicle_Traffic,Func_Class,Speed_Cat
count,34678.0,34678.0,34678.0,34678.0,34678.0,34678.0,34678.0,34678.0
mean,895820600.0,3340.417942,56.816834,56.463409,43.822041,68.091844,2.684613,4.904781
std,235591000.0,2725.873982,51.98367,26.240876,24.442204,30.985191,0.538658,1.520568
min,80216820.0,14.435864,-401.703724,1.0,-392.5,1.0,1.0,2.0
25%,736483200.0,1903.398108,38.315321,38.25,26.0,48.0,2.0,4.0
50%,906737700.0,2644.529317,49.966126,50.0,38.875,60.333333,3.0,6.0
75%,1154997000.0,3897.886608,69.511585,71.0,56.0,85.0,3.0,6.0
max,1223731000.0,49309.806935,6357.022296,1326.25,143.0,2605.0,3.0,7.0


- Regarding the columns we know that they report values in km/h: many of the min and max we can observe thank to describe function don't make sense. 
- We need to operate on them as they are **outliers**

In [5]:
def rm_out(df):
    for i in df.columns.drop(['linkid', 'Daily_Average_Traffic_Intensity','geometry']):
        lb = 0
        ub = 180
#         print(lb, ub)
        df[i] = df[i].mask(df[i] < lb) 
        df[i] = df[i].mask(df[i] > ub) 
    return df

data = rm_out(df)

**If we run describe again we will see that the data regarding velocity has just feasible values.**

In [6]:
data.describe()

Unnamed: 0,linkid,Daily_Average_Traffic_Intensity,Average_Velocity_of_Vehicle_Traffic,Median_of_velocity_of_Vehicle_Traffic,FirstQuartil_of_velocity_of_Vehicle_Traffic,ThirdQuartil_of_velocity_of_Vehicle_Traffic,Func_Class,Speed_Cat
count,34678.0,34678.0,34633.0,34675.0,34677.0,34674.0,34678.0,34678.0
mean,895820600.0,3340.417942,56.112805,56.402803,43.834624,67.959767,2.684613,4.904781
std,235591000.0,2725.873982,24.346245,25.054113,24.329987,26.706718,0.538658,1.520568
min,80216820.0,14.435864,1.0,1.0,0.0,1.0,1.0,2.0
25%,736483200.0,1903.398108,38.317003,38.25,26.0,48.0,2.0,4.0
50%,906737700.0,2644.529317,49.961538,50.0,38.875,60.333333,3.0,6.0
75%,1154997000.0,3897.886608,69.447459,71.0,56.0,85.0,3.0,6.0
max,1223731000.0,49309.806935,179.691892,143.25,143.0,164.0,3.0,7.0


### Handling duplicates:

In [7]:
len(data) == len(data.drop_duplicates())

True

There are ***no duplicates*** in our dataset

### Handling missing values:

In [8]:
data.isnull().sum().sort_values(ascending=False) , f'Total of data points : {data.shape[0]}'

(Average_Velocity_of_Vehicle_Traffic            45
 ThirdQuartil_of_velocity_of_Vehicle_Traffic     4
 Median_of_velocity_of_Vehicle_Traffic           3
 FirstQuartil_of_velocity_of_Vehicle_Traffic     1
 linkid                                          0
 Daily_Average_Traffic_Intensity                 0
 Func_Class                                      0
 Speed_Cat                                       0
 geometry                                        0
 dtype: int64,
 'Total of data points : 34678')

As we can see the total highest number of missing values detected for column is 45 on a total number of rows of 34'678.
- The missing value for Average Velocity e the ones in Speed Difference Mean are the same (one column is created from the other one)
- The missing values of other column can be easily deleted

**As we are handling data regarding AVERAGE velocity we can easily substitute the missing values with the mean of the corresponding column**

In [9]:
def handling_missing(data):
    imputer = SimpleImputer()
    data['Average_Velocity_of_Vehicle_Traffic']=imputer.fit_transform(data[['Average_Velocity_of_Vehicle_Traffic']])
    return data.dropna()
data = handling_missing(data)

In [10]:
data.isnull().sum()

linkid                                         0
Daily_Average_Traffic_Intensity                0
Average_Velocity_of_Vehicle_Traffic            0
Median_of_velocity_of_Vehicle_Traffic          0
FirstQuartil_of_velocity_of_Vehicle_Traffic    0
ThirdQuartil_of_velocity_of_Vehicle_Traffic    0
Func_Class                                     0
Speed_Cat                                      0
geometry                                       0
dtype: int64

### Feature creation:

Now looking at our data we need to search for a target that in the next step we will use in our model.
Most common causes of Accidents:
- Over Speeding.
- Drunken Driving.
- Distractions to Driver.
- Red Light Jumping.
- Avoiding Safety Gears like Seat belts and Helmets.
- Non-adherence to lane driving and overtaking in a wrong manner.

The first cause is always the **over-speed** that can be connected with one of the above causes. 
For this reason we decide to investigate and use as target information regarding the velocity.

- Speed_Cat (described in the excel above)
- Average Velocity of Vehicle Traffic 
- Median of velocity of Vehicle Traffic

We will create a dictionary that, from the information contained in the excel can describe the type of street regarding the max velocity allowed in there. 


In [11]:
speed_explanation = pd.read_excel('wdl_dict/Dictionary_Risk_Profiles.xlsx', sheet_name='SpeedCat')
speed_explanation

Unnamed: 0,Speed Cat,Speed range in km/h
0,1,>130 km/h
1,2,101-130 km/h
2,3,91-100 km/h
3,4,71-90 km/h
4,5,51-70 km/h
5,6,31-50 km/h
6,7,11-30 km/h
7,8,<11 km/h


 from the table above we can create a dictionary.
1. count values for category
2. translate the speed range in actual number

In [12]:
data.Speed_Cat.value_counts()

6    20307
4     5273
2     5123
3     1843
5     1770
7      357
Name: Speed_Cat, dtype: int64

**NO need of mapping for label 1 and 8**

In [13]:
max_speed_dict = {2:130,3:100,4:90,5:70,6:50,7:30}

In [14]:
def target_creation(data):
    data['Max_speed'] = data['Speed_Cat'].map(max_speed_dict)
    data['Speed_Diff_Mean'] = data['Max_speed'] - data['Average_Velocity_of_Vehicle_Traffic']
    data['Speed_Diff_Median'] = data['Max_speed'] - data['Median_of_velocity_of_Vehicle_Traffic']
    return data
data = target_creation(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Max_speed'] = data['Speed_Cat'].map(max_speed_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Speed_Diff_Mean'] = data['Max_speed'] - data['Average_Velocity_of_Vehicle_Traffic']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Speed_Diff_Median'] = data['Max_speed'] - data['Med

In [15]:
data[['Speed_Cat','Max_speed', 'Speed_Diff_Mean','Speed_Diff_Median']].head(10)

Unnamed: 0,Speed_Cat,Max_speed,Speed_Diff_Mean,Speed_Diff_Median
0,6,50,4.791284,5.535714
1,4,90,10.436692,7.25
2,6,50,-15.955069,-17.333333
3,6,50,2.466089,6.0
4,6,50,-1.375291,-3.25
5,6,50,17.667821,18.333333
6,6,50,19.613591,22.9375
7,6,50,26.714286,39.0
8,6,50,6.476409,4.0
9,6,50,-11.897045,1.346154


**our first target will be the difference between the mean of velocity and the max speed**

### Scaling features:

We are now ready to scale our dataframe to have a distribution *around* the mean.

1. We need to separate numerical and categorical column
2. We are going to use the Min-Max Scaling method for the numerical ones: is the one that is commonly used distance based algorithms, as k-means that is one of the possible analysis we are taking in consideration.  
3. For the categorical ones we'll use the OneHotEncoding method (for each label in each category creates a different column)

We could also operate this step all together but is important for us to know which column belong to each of the different classes inside the categorical feature. 
**To do so we need to operate for each categorical separately**

In [16]:
def scaling_numerical(data):
    numerical = data.columns.drop(['geometry','linkid','Speed_Cat', 'Func_Class'])
    scaler = StandardScaler()
    data_scaled = data.copy()
    for column in numerical:
        scaler.fit(data_scaled[[column]])
        data_scaled[column]=scaler.transform(data_scaled[[column]]) 
    return data_scaled
data_scaled = scaling_numerical(data)

In [17]:
data.drop(columns=['geometry','linkid','Speed_Cat', 'Func_Class']).head(1)

Unnamed: 0,Daily_Average_Traffic_Intensity,Average_Velocity_of_Vehicle_Traffic,Median_of_velocity_of_Vehicle_Traffic,FirstQuartil_of_velocity_of_Vehicle_Traffic,ThirdQuartil_of_velocity_of_Vehicle_Traffic,Max_speed,Speed_Diff_Mean,Speed_Diff_Median
0,6224.778569,45.208716,44.464286,28.0,59.0,50,4.791284,5.535714


In [18]:
data_scaled.drop(columns=['geometry','linkid','Speed_Cat', 'Func_Class']).head(1)

Unnamed: 0,Daily_Average_Traffic_Intensity,Average_Velocity_of_Vehicle_Traffic,Median_of_velocity_of_Vehicle_Traffic,FirstQuartil_of_velocity_of_Vehicle_Traffic,ThirdQuartil_of_velocity_of_Vehicle_Traffic,Max_speed,Speed_Diff_Mean,Speed_Diff_Median
0,1.058039,-0.448154,-0.476554,-0.650809,-0.33552,-0.716756,-0.549408,-0.506457


**Working with the categorical features the first thing we need to do is to understand the distribution within the labels**

In [19]:
data.Func_Class.value_counts() , data.Speed_Cat.value_counts()

(3    25023
 2     8363
 1     1287
 Name: Func_Class, dtype: int64,
 6    20307
 4     5273
 2     5123
 3     1843
 5     1770
 7      357
 Name: Speed_Cat, dtype: int64)

- Functional Class has just 3 possible label for the street that we can understand better looking at the excel

In [20]:
func_explanation = pd.read_excel('wdl_dict/Dictionary_Risk_Profiles.xlsx', sheet_name='Func_Class')
for i,el in enumerate(func_explanation['Description']):
    print(f'Class n.{i+1} : {el} \n')

Class n.1 : These roads are meant for high volume, maximum speed traffic between and through major metropolitan areas. There are very few, if any, speed changes. Access to this road is usually controlled. 

Class n.2 : These roads are used to channel traffic to Main Roads (FRC1) for travel between and through cities in the shortest amount of time. There are very few, if any speed changes. 

Class n.3 : These roads interconnect First Class Roads (FRC2) and provide a high volume of traffic movement at a lower level of mobility than First Class Roads (FRC2). 



With this new and deeper understanding of the distribution and the meaning of the category (*NB: regarding speed_cat we can look back at the point **1.1.5 "Feature creation"** to get these informations)* we can now progress with our transformations.  

In [21]:
def scaling_categorical(data):
    ohe = OneHotEncoder(sparse = False)
    ohe.fit(data[['Func_Class']])
    func_encoded = ohe.transform(data[['Func_Class']])
    data["func_1"],data["func_2"],data['func_3'] = func_encoded.T
    ohe = OneHotEncoder(sparse = False)
    ohe.fit(data[['Speed_Cat']])
    speed_encoded = ohe.transform(data[['Speed_Cat']])
    data["speed_2"],data["speed_3"],data["speed_4"],\
    data["speed_5"], data["speed_6"], data["speed_7"]= speed_encoded.T
    return data 

In [22]:
data_scaled = scaling_categorical(data_scaled)

### Preprocessed Dataframe: 

In [23]:
data_scaled.columns

Index(['linkid', 'Daily_Average_Traffic_Intensity',
       'Average_Velocity_of_Vehicle_Traffic',
       'Median_of_velocity_of_Vehicle_Traffic',
       'FirstQuartil_of_velocity_of_Vehicle_Traffic',
       'ThirdQuartil_of_velocity_of_Vehicle_Traffic', 'Func_Class',
       'Speed_Cat', 'geometry', 'Max_speed', 'Speed_Diff_Mean',
       'Speed_Diff_Median', 'func_1', 'func_2', 'func_3', 'speed_2', 'speed_3',
       'speed_4', 'speed_5', 'speed_6', 'speed_7'],
      dtype='object')

## Feature engineering

We assume that ***over speeding*** is the main reason for road hazards. Over speeding behavior can be extracted by the deltas between the road's speed category and its actual average speed observations as processed in column ```Speed_Diff_Mean```. <br>

Over speeding can be harnessed among others by the roads environment [Source](https://www.tandfonline.com/doi/abs/10.1080/014416499295420). People chose their speeding behavior not only by speed limits but also by their assessment of the road's quality and the surrounding environment.<br>

Therefore we chose to gather more information about POIs, amenities and public buildings in the surrounding of the provided road segments. Those can be acquired through OSM sources.

### Scaling data set to Lisbon

#########################################################<br>
***_----------FILL WITH EXPLANATION SISTO------------_***<br>
#########################################################

In [24]:
# Transforming pandas df to geopandas df
geo_df = gpd.GeoDataFrame(data_scaled)
geo_df.geometry[0].type

'MultiLineString'

In [25]:
## Filtering only lisbon data inside the circle of 38.72526068747401, -9.142352617846093 with buffer '1'
circle_lisbon = Point(-9.142352617846093, 38.72526068747401).buffer(1)
geo_lis = geo_df[geo_df.geometry.within(circle_lisbon)]
# no immediate usage of this pd.DataFrame
df_lis = pd.DataFrame(geo_lis).drop(columns=['geometry', 'linkid'])

In [26]:
print(f'The new data set has {df_lis.shape[0]} rows as opposed to the original set with {geo_df.shape[0]} rows')

The new data set has 17688 rows as opposed to the original set with 34673 rows


### Loading OSM Maps

In [27]:
%%bash
wget https://download.bbbike.org/osm/extract/planet_-9.89,38.265_-8.309,39.136.osm.pbf \
    --quiet -O map_data/Lisbon.osm.pbf

In [28]:
!ogrinfo map_data/Lisbon.osm.pbf

INFO: Open of `map_data/Lisbon.osm.pbf'
      using driver `OSM' successful.
1: points (Point)
2: lines (Line String)
3: multilinestrings (Multi Line String)
4: multipolygons (Multi Polygon)
5: other_relations (Geometry Collection)


In [None]:
%%bash
ogr2ogr -f "GPKG" \
    map_data/lisbon_polygons.gpkg \
    map_data/Lisbon.osm.pbf \
    -nlt POLYGONS \
    -nln polygons

In [None]:
#Read data
layer_file = "map_data/lisbon_polygons.gpkg"
collection = list(fiona.open(layer_file,'r'))
df1 = pd.DataFrame(collection)

#Check Geometry
def isvalid(geom):
    try:
        shape(geom)
        return 1
    except:
        return 0

df1['isvalid'] = df1['geometry'].apply(lambda x: isvalid(x))
df1 = df1[df1['isvalid'] == 1]
collection = json.loads(df1.to_json(orient='records'))

#Convert to geodataframe
gdf_lis_poly = gpd.GeoDataFrame.from_features(collection)

In [None]:
gdf_lis_poly

In [None]:
poi_gdf = gdf_lis_poly.copy()

In [None]:
poi_gdf

### Loading POIs from pre-processed OSM file

due to a large data set the file has to be loaded as pd.DataFrame before it can be transformed into a gpd.GeoDataFrame

poi_df = pd.read_csv('map_data/gdf_lis_poly.csv')

poi_df['geometry'] = poi_df['geometry'].apply(shapely.wkt.loads)

poi_gdf = gpd.GeoDataFrame(poi_df, crs='OGC:CRS84')

poi_gdf = gpd.GeoDataFrame(poi_df, crs='OGC:CRS84')

In [None]:
print(f'The data set of POIs in the Lisbon region has {poi_gdf.shape[0]} individual points which can be merged with our data set.' )

In [None]:
poi_gdf.geometry.type.value_counts()

***Note***<br>
For now we will only be focussing on the geometrical points in the OSM data, not on polygons or line strings.

In [None]:
filtering down to shapely.geometry.Points
gdf_points = poi_gdf[poi_gdf['geometry'].type == 'Point'].reset_index()
gdf_points.columns

**Note**<br>
The points provided are categorized and stored in many columns. We will shrink this information to one column and fill it with all the important information about the point. <br>
Some points do not provide any information. Those ones will be dropped. 

In [None]:
# reducing geo_df columns, only leaving one valid column
def new_desc(geo):
    geo['desc_points'] = None
    # columns to be taken into consideration
    lst_cols = [  'amenity', 
                  'barrier', 
                  'building', 
                  'highway', 
                  'landuse', 
                  'man_made', 
                  'natural', 
                  'office']
    for c, row in geo.iterrows():
        concat_name = [f'feat_{i}_{row[i]}' for i in lst_cols if row[i] == row[i]]
        if len(concat_name) > 0:
            geo.at[c, 'desc_points'] = concat_name[0]
        else: 
            geo.at[c, 'desc_points'] = None
        print(f'done: {c}')
        
    
    geo = geo[['geometry', 'desc_points']]
    # drop empty descriptions
    geo = geo.dropna(subset=['desc_points'])
    geo= geo.reset_index(drop=True)
    
    return geo

In [None]:
# applying cleaning function to geo df
gdf_points_clean = new_desc(gdf_points)

In [None]:
gdf_points_clean.head(5)
# only two columns are left => geometry and name of point

### Re-transforming point's names into columns

To prepare the dataset of points for the merger with the general data set we need to re-transfer the unique feature names into columns. In total we have **96** feature columns.

In [None]:
# encoding all unique values
encoder = OneHotEncoder()
enc_df = encoder.fit_transform(gdf_points_clean[['desc_points']])

In [None]:
# reapplying column names
enc_gdf_points = gpd.GeoDataFrame(enc_df.toarray(), columns=encoder.categories_[0])
enc_gdf_points = enc_gdf_points.join(gdf_points_clean)

**Note**<br>
We need the ```desc_points``` column for later plotting.

In [None]:
enc_gdf_points.max()

### Merging Points with Road segments

In order to merge the points with the provided road segments we need to buffer the LineStrings of the roads and turn them into little Polygons in order to overlap with the POIs around the road. Later we will use the ```.intersect``` method for spatial joins to keep only the points which are in the vicinity of the road segments.

In [None]:
# create a gdf with buffered road segments
geo_lis_buf = geo_lis.copy()
# allowing certain buffer to road segments to "catch" the points. buffer=.0005 seems to be visually adequate.
geo_lis_buf['geometry'] = geo_lis_buf.geometry.buffer(.0005)

In [None]:
# joining both geo dfs
joint_gpd = gpd.sjoin(enc_gdf_points, geo_lis_buf, how="inner", op='intersects')

In [None]:
print(f'We have {joint_gpd.shape[0]} intersecting points with our road segments.')

**Note**<br>
Now, we want to regroup the GDF back to our initial granularity, the road segments with unique link_IDs.

In [None]:
# building the aggregation dictionary for the .groupby method
columns = joint_gpd.columns
agg = {i:'sum' for i in columns if 'feat' in i}
agg['geometry'] = lambda x: list(x)
agg['desc_points'] = lambda x: list(x)
# agg
# all POIs should be summed, but the geometries of the containing points are to be listed in one cell.

In [None]:
# regrouping by linkid
grouped_gpd = joint_gpd.groupby('linkid').agg(agg)
# renaming the 'geometry' column so that the gdf won't be confused later
grouped_gpd = grouped_gpd.rename(columns={'geometry':'points'})

**Note**<br>
Only road segments which contained one or more points will be left in the gdf

In [None]:
geo_df_lis = geo_lis.merge(grouped_gpd, left_on='linkid', right_index=True)
geo_df_lis['point_count'] = geo_df_lis['points'].apply(lambda x: len(x))

In [None]:
# have a glance at the merged df
pd.set_option('display.max_columns', None)
geo_df_lis.head(3)

In [None]:
geo_df_lis = geo_df_lis.sort_values(by='linkid')

### Testplots for the merged data

In [None]:
# set of points to make them readable for the Marker (folium)
coords = [[(point.coords.xy[1][0], point.coords.xy[0][0]) for point in x] for x in geo_df_lis.head(100).points]
names = [x for x in geo_df_lis.head(100).desc_points]
map_lis_buf = geo_df_lis.copy()
map_lis_buf['geometry'] = map_lis_buf.geometry.buffer(.0005)

In [None]:
# init map
m = Map([38.74288, -9.16624])

# unpacking list of lists containing points, mapping them to their names
## POINTS (only a subset of 100)
marker_cluster = MarkerCluster(name='Points')
for i, coordi in enumerate(coords[:100]):
    popups = [Popup(f'<p><b>Name:</b></p> <p>{a}</p>', max_width=150) for a in names[i]]
    markers = [Marker(coord, popup=popups[c]).add_to(marker_cluster) for c, coord in enumerate(coordi)]

print('built points')

## BUFFERED ROADS (enable via layer control)
# only the first 100
roads_buf = Choropleth(geo_data=map_lis_buf.head(100).geometry,
                          data=None, 
                          name="roads_buf", 
                          show=False)


## ROADS
roads = Choropleth(geo_data=geo_df_lis.head(100).geometry,
                          data=None, 
                          name="roads", 
                          show=True)

print('built roads')

marker_cluster.add_to(m)
roads_buf.add_to(m)
roads.add_to(m)



LayerControl().add_to(m)
m

**Note**<br>
The displayed points all seem to be within the boundaries of the buffered road segments 🛣.<br>
NOT ALL points are displayed. That would take up too much memory. 

## Modeling

### Scaling new features

In [None]:
def standard_scaling(df, columns):
    scaler = StandardScaler()
    df.loc[:,columns] = scaler.fit_transform(df.loc[:,columns])

### Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [None]:
# Prepare X and y
X = final_data.drop(columns='Speed_Diff_Mean')
y = final_data['Speed_Diff_Mean']


X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3, 
                                                    random_state = 6) # Holdout


baseline_model = DummyRegressor(strategy="mean") # Baseline
baseline_model.fit(X_train, y_train) # Calculate value for stratgy
baseline_model.score(X_test, y_test) # Score model based on consistently predicting the strategy

### Linear Regression

In [None]:
model = LinearRegression().fit(X_train, y_train) # instantiate and fit model 

model.score(X_test, y_test) # Score model

In [None]:
cv_results = cross_validate(model, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error']
                           )
pd.DataFrame(cv_results) # Cross validation output

In [None]:
cv_results['test_r2'].mean()# Cross validation results