# Models

Multiple models are implemented here. Utilization Rates depends on temporal granularity.

This notebook includes models, where charging stations have multiple utilization rates and a single utilization rate. 

### Type A. 
    Chargings stations can have multple utilization rates because of different times throughout the day. Time series models are good here.

### Type B. 
    Charging stations can have a single utilization rate, where this one rate is the single point of measurement to determine how good a location is. This will rely on different POI types.

## Implemented Models in this notebook

Type A Models
- Time series methods
    - Exponential Smoothing
    - SARIMA
- Regression methods
    - Decision Tree (Decision Tree with Hour)
    
Type B Models
- KNeighborsRegressor
- Decision Tree (Decision Tree without Hour)

# Modeling with Timeseries Methods

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from scipy.optimize import leastsq

from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.metrics import mean_squared_error
import statsmodels
#!pip install warnings
#warnings.filterwarnings('ignore')



NameError: ignored

In [3]:
statsmodels.__version__

'0.10.2'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [65]:

#mickeys path

full_data = pd.read_csv('/content/drive/MyDrive/Berkeley/sp_21/dataX/Team 19 - Power Forward/V3 Demo/4_19_full_data.csv')

## CASEY PATHS
#weekly = pd.read_csv('/content/drive/MyDrive/Team 19 - Power Forward/Data/utilization_rates_data/mongodb-files/week_hourly_utilization_rates.csv')

#cleaned_UR = pd.read_csv('/content/drive/MyDrive/Team 19 - Power Forward/Data/utilization_rates_data/mongodb-files/Sat_Apr_17_22-50-10_2021_raw.csv')



## JUSTIN PATHS:
#weekly = pd.read_csv('utilization_rates_data/week_hourly_utilization_rates.csv')

#cleaned_UR = pd.read_csv('utilization_rates_data/cleaned_gmaps_scraped_data-Sat_Apr_17_22-50-10_2021.csv')

### Setting up the time series dataframe

In [None]:
cleaned_UR['formatted_date_LA-time'] = pd.to_datetime(cleaned_UR['time'],unit='s').dt.tz_localize('UTC').dt.tz_convert('America/Los_Angeles')

cleaned_UR['floor_datetime'] = cleaned_UR['formatted_date_LA-time'].dt.floor('1h')

In [None]:
cleaned_UR.head()

Unnamed: 0.1,Unnamed: 0,_id,time,datetime,connection_type,available,power_kwh,address,formatted_date_LA-time,floor_datetime
0,0,606461777719ae019765201a,1617191277,Wed Mar 31 04:47:57 2021,CHAdeMO,4/4,50 kW,230 Bay Pl,2021-03-31 04:47:57-07:00,2021-03-31 04:00:00-07:00
1,1,606461777719ae019765201b,1617191277,Wed Mar 31 04:47:57 2021,CCS,4/4,50 kW,230 Bay Pl,2021-03-31 04:47:57-07:00,2021-03-31 04:00:00-07:00
2,2,606461777719ae019765201c,1617191277,Wed Mar 31 04:47:57 2021,J1772,1/1,6.2 kW,4400 Shellmound St,2021-03-31 04:47:57-07:00,2021-03-31 04:00:00-07:00
3,3,606461777719ae019765201d,1617191277,Wed Mar 31 04:47:57 2021,J1772,3/3,5 kW,4400 Shellmound St,2021-03-31 04:47:57-07:00,2021-03-31 04:00:00-07:00
4,4,606461787719ae019765201e,1617191277,Wed Mar 31 04:47:57 2021,CCS,11,150 kW,2700 Fifth St,2021-03-31 04:47:57-07:00,2021-03-31 04:00:00-07:00


In [None]:
timeseries_cols = ['connection_type', 'power_kwh', 'power_kwh_num', 
              'mapped_address', 'latitude', 'longitude', 
              'floor_datetime','time',
               'available_denominator']

timeseries = cleaned_UR.groupby(by=timeseries_cols).agg('mean').reset_index()
## ignore charging stations that have no denominator
timeseries = timeseries[timeseries['available_denominator'] != 0.0].reset_index(drop=True)

timeseries['hour_availability_rate'] = timeseries['available_numerator']/timeseries['available_denominator']

timeseries['hour_utilization_rate'] = 1-timeseries['hour_availability_rate']

timeseries['hour_total_output_kwh'] = timeseries['power_kwh_num']*timeseries['hour_utilization_rate']


KeyError: ignored

In [None]:
timeseries.columns

In [None]:
timeseries[timeseries['hour_utilization_rate'] %1 !=0]
unneeded_cols = ['Unnamed: 0', 
                 'day_of_week_num', 'date',
                 'hour', 'minutes','seconds']

timeseries = timeseries.drop(unneeded_cols, axis=1).set_index('floor_datetime')

timeseries = timeseries.sort_index()

In [None]:
timeseries.head()

In [None]:
plt.figure(figsize=(12,8))
plt.xlabel('Hour of date')
plt.ylabel('Utilization rate of the hour')
sns.lineplot(y=timeseries['hour_utilization_rate'], x=timeseries.index)


In [None]:
fig, [ax1, ax2] = plt.subplots(nrows=2, ncols=1, figsize=(14,10), gridspec_kw={'wspace':.5, 'hspace':.5})

sns.lineplot(y=timeseries['hour_utilization_rate'], x=timeseries.index, ci=100, label='100% (All of collected Data)', ax=ax1)
sns.lineplot(y=timeseries['hour_utilization_rate'], x=timeseries.index, ci=99.7, label='99.7% (3σ)', ax=ax1)
sns.lineplot(y=timeseries['hour_utilization_rate'], x=timeseries.index, ci=95, label='95% (2σ)', ax=ax1)
sns.lineplot(y=timeseries['hour_utilization_rate'], x=timeseries.index, ci=68, label='68% (1σ)', ax=ax1)
sns.lineplot(y=timeseries['hour_utilization_rate'], x=timeseries.index, ci=50, label='50% (Half of collected Data)', ax=ax1)
sns.lineplot(y=timeseries['hour_utilization_rate'], x=timeseries.index, label='Average', ax=ax1)


ax1.legend(bbox_to_anchor=(1.15, .75))
ax1.set_title('Hourly Utilization Rates Timeseries', size=18)
ax1.set_xlabel('Hour of date',size=14)
ax1.set_ylabel('Utilization rate of the hour', size=14)

sns.lineplot(y=timeseries['hour_total_output_kwh'], x=timeseries.index, ci=100, label='100% (All of collected Data)', ax=ax2)
sns.lineplot(y=timeseries['hour_total_output_kwh'], x=timeseries.index, ci=99.7, label='99.7% (3σ)', ax=ax2)
sns.lineplot(y=timeseries['hour_total_output_kwh'], x=timeseries.index, ci=95, label='95% (2σ)', ax=ax2)
sns.lineplot(y=timeseries['hour_total_output_kwh'], x=timeseries.index, ci=68, label='68% (1σ)', ax=ax2)
sns.lineplot(y=timeseries['hour_total_output_kwh'], x=timeseries.index, ci=50, label='50% (Half of collected Data)', ax=ax2)
sns.lineplot(y=timeseries['hour_total_output_kwh'], x=timeseries.index, label='Average', ax=ax2)

ax2.legend(bbox_to_anchor=(1.15, .75))
ax2.set_title('Hourly Utilization Rates Timeseries', size=18)
ax2.set_xlabel('Hour of date',size=14)
ax2.set_ylabel('Total Hourly Energy Output (in kWh)', size=14)



In [None]:
fig, [ax1, ax2] = plt.subplots(nrows=2, ncols=1, figsize=(14,10), gridspec_kw={'wspace':.5, 'hspace':.5})

sns.lineplot(data=weekly, x='weektime_series', y='weekly_utilization_rate', ci=100, label='100%', ax=ax1)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_utilization_rate', ci=99.7, label='99.7%', ax=ax1)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_utilization_rate', ci=95, label='95%', ax=ax1)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_utilization_rate', ci=68, label='68%', ax=ax1)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_utilization_rate', ci=50, label='50%', ax=ax1)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_utilization_rate', ci=1, label='Average', ax=ax1)

ax1.legend(bbox_to_anchor=(1.15, .75))
ax1.set_title('Weekly utilization rates', size=18)
ax1.set_xlabel('Day of the week (1=Monday, 7=Sunday)',size=14)
ax1.set_ylabel('Hourly Utilization Rate', size=14)


sns.lineplot(data=weekly, x='weektime_series', y='weekly_total_output_kwh', ci=100, label='100%', ax=ax2)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_total_output_kwh', ci=99.7, label='99.7%', ax=ax2)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_total_output_kwh', ci=95, label='95%', ax=ax2)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_total_output_kwh', ci=68, label='68%', ax=ax2)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_total_output_kwh', ci=50, label='50%', ax=ax2)
sns.lineplot(data=weekly, x='weektime_series', y='weekly_total_output_kwh', ci=1, label='Average', ax=ax2)

ax2.set_title('Hourly power output throughout the week', size=18)
ax2.set_xlabel('Day of the week (1=Monday, 7=Sunday)',size=14)
ax2.set_ylabel('Total Hourly Output(kWh)', size=14)
ax2.legend(bbox_to_anchor=(1.15, .75))


### Modeling as Timeseries

In [None]:
timeseries['utc'] = pd.to_datetime(timeseries['time'],unit='s').dt.floor('1h')
timeseries.head()


In [None]:
timeseries['mapped_address'].value_counts()

In [None]:
selected_station = '230 Bay Pl, Oakland, CA 94612'
station_timeseries = timeseries[timeseries['mapped_address'] == selected_station].set_index('utc').reset_index().groupby(by='utc').agg('mean')
station_timeseries.index = pd.DatetimeIndex(station_timeseries.index).to_period('H')

train = station_timeseries[station_timeseries.index <= '2021-04-10 00:00:00']
test  = station_timeseries[station_timeseries.index > '2021-04-10 00:00:00']
y     = 'hour_utilization_rate'

In [None]:
train[y].index[0]

In [None]:
station_timeseries

In [None]:
len(set(train.index)), len(set(test.index))

### Exponential Smoothing

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
exp_smooth_model = ExponentialSmoothing(train[y], 
                                        seasonal='add',
                                        seasonal_periods=7*24, freq='H'
                                       )

exp_smooth_fit = exp_smooth_model.fit(optimized=True, smoothing_level=1)

exp_smooth_pred = pd.Series(index=test.index, data=exp_smooth_fit.forecast(len(test.index)).values)



In [None]:
plt.plot(exp_smooth_pred.values, label= 'alpha, beta, gamma = recommended')
# exp_smooth_fit.fittedvalues.plot(style='--', color='green')
plt.plot(station_timeseries[y].values)
plt.legend()

In [None]:
plt.figure(figsize=(14,5))
exp_smooth_fit.fittedvalues.plot(x=station_timeseries.index, style='--', color='red', label='fitted Exponential Smoothing')
exp_smooth_pred.plot(x=station_timeseries.index, style='--', color='green', label='forecasted Exponential Smoothing')
sns.lineplot(y=station_timeseries[y].values, x=station_timeseries.index, label=y)

### SARIMA model


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

sarimax_model = SARIMAX(train[y], order=(1,1,1), seasonal_order=(1,1,1, 24))
sarimax_fit = sarimax_model.fit()

sarima_pred = pd.Series(index=test.index, data=sarimax_fit.forecast(len(test.index)).values)


In [None]:
plt.figure(figsize=(14,5))
sarimax_fit.fittedvalues.plot(x=station_timeseries.index, style='--', color='red', label='fitted SARIMAX')
sarima_pred.plot(x=station_timeseries.index, style='--', color='green', label='forecasted SARIMAX')
sns.lineplot(y=station_timeseries[y].values, x=station_timeseries.index, label=y)

### Time series Model Comparison Using MAPE

In [None]:
def get_mape(actuals, forecasts):
    mape = []
    for f, a in zip(forecasts, actuals):
        if a != 0:
            mape.append(abs(f-a)/a)
        else:
            mape.append(f)
    return np.mean(mape)

In [None]:
exp_smooth_mape = get_mape(test[y].values, exp_smooth_pred.values)
sarima_mape = get_mape(test[y].values, sarima_pred.values)

print('EXPONENTIAL SMOOTHING MAPE: ', exp_smooth_mape)
print('SARIMA MAPE: ', sarima_mape)

## Modeling with Regression Methods

### Setting up the regression data

In [None]:
address_mapper_df = pd.read_csv('data/address_mapper.csv')
address_mapper = {}
for o, c in zip(address_mapper_df['original_addy'], address_mapper_df['crossed']):
    address_mapper[o] = c
    
address_mapper_df.head()

FileNotFoundError: ignored

In [None]:
X_types_matrix = pd.read_csv('data/X_matrix_april_12.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
X_types_matrix['mapped_address'] = X_types_matrix['address'].replace(address_mapper)
X_types_matrix.head()

FileNotFoundError: ignored

In [None]:
timeseries.head()

NameError: ignored

In [None]:
full_data = pd.merge(left=timeseries.reset_index(), right=X_types_matrix, left_on='mapped_address', right_on='mapped_address', how='right')
full_data['hour'] = full_X.floor_datetime.dt.hour
full_data = full_data.set_index('floor_datetime')

NameError: ignored

In [None]:
print(len(full_data['mapped_address'].unique()), X_types_matrix.shape)
full_data

In [None]:
print(full_data.columns)
full_data.to_csv('data/full_weekly_timeseries_with_POIs.csv')
full_data#[['mapped_address', 'latitude_x', 'longitude_x','address', 'latitude_y', 'longitude_y',]]

In [None]:
poi_types = ['lodging', 'supermarket', 'pharmacy', 'park', 'restaurant',
       'clothing_store', 'store', 'school', 'gym', 'library',
       'local_government_office', 'doctor', 'stadium', 'museum', 'church',
       'synagogue']
time_cols = ['hour']
X_cols = time_cols + poi_types
y      = 'hour_utilization_rate'

full_train = full_data[full_data.index <= '2021-04-10 00:00:00']
full_test  = full_data[full_data.index > '2021-04-10 00:00:00']

X_train = full_train[X_cols]
y_train = full_train[y]

X_test  = full_test[X_cols]
y_test  = full_test[y]

print(len(X_train), len(y_train), len(X_test), len(y_test))

In [66]:
## Load the required modules
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, validation_curve, cross_val_score, GridSearchCV
from matplotlib import pyplot as plt
from timeit import default_timer as timer

## Load the required module
from sklearn.model_selection import cross_val_score, KFold

In [None]:
kf = KFold(4, shuffle=True, random_state=0)

### Decision Tree with Hour

In [None]:
dt_model = DecisionTreeRegressor(random_state=0, max_depth=5)
dt_fit = dt_model.fit(X_train, y_train)

In [None]:
dt_cv_scores = cross_val_score(dt_model,X_train,y_train,cv=kf)
print ("Cross-Validation Accuracies:", dt_cv_scores)
print ("Overall CV score is:", np.mean(dt_cv_scores))

In [None]:
dt_train_pred = dt_fit.predict(X_train)
dt_test_pred = dt_fit.predict(X_test)


In [None]:
pd.DataFrame({'dt_pred_test': dt_test_pred, 'dt_actual_test': y_test})

In [None]:
plt.figure(figsize=(14,5))

sns.lineplot(y=dt_train_pred, x=X_train.index, label='Predicted Utilization Rate Train Set')
sns.lineplot(y=y_train, x=X_train.index, label='Actual Utilization Rate Train Set')

sns.lineplot(y=dt_test_pred, x=X_test.index, label='Predicted Utilization Rate Test Set')
sns.lineplot(y=y_test, x=X_test.index, label='Actual Utilization Rate Test Set')



### Decision Tree without Hour


In [None]:
dt_model = DecisionTreeRegressor(random_state=0, max_depth=5)
dt_fit = dt_model.fit(X_train[poi_types], y_train)

In [None]:
dt_cv_scores = cross_val_score(dt_model,X_train,y_train,cv=kf)
print ("Cross-Validation Accuracies:", dt_cv_scores)
print ("Overall CV score is:", np.mean(dt_cv_scores))

In [None]:
dt_train_pred = dt_fit.predict(X_train[poi_types])
dt_test_pred = dt_fit.predict(X_test[poi_types])


In [None]:
plt.figure(figsize=(14,5))

sns.lineplot(y=dt_train_pred, x=X_train.index, label='Predicted Utilization Rate Train Set')
sns.lineplot(y=y_train, x=X_train.index, label='Actual Utilization Rate Train Set')

sns.lineplot(y=dt_test_pred, x=X_test.index, label='Predicted Utilization Rate Test Set')
sns.lineplot(y=y_test, x=X_test.index, label='Actual Utilization Rate Test Set')



## Pure Utilization Rate

Previously, each station was assigned multiple utilization rates, depending on time of date.

This section will assign a single utilization rate for a single station, instead of what was done previously.

### Aggregating the data

Some of the data is skewed because we have an uneven distribution of utilizaion rate samples across the stations.

In [None]:
full_data.head()

NameError: ignored

In [None]:

percentile_95_counts = full_data['mapped_address'].value_counts().mean() - 2*full_data['mapped_address'].value_counts().std()
full_data['mapped_address'].value_counts()
kept_address = []
for idx, row in pd.DataFrame(full_data['mapped_address'].value_counts()[:60]).iterrows():
    kept_address.append(idx)


In [None]:
stationed_full_data = full_data.groupby(['mapped_address', 'latitude_x', 'longitude_x', 'available_denominator', 'power_kwh_num', 'power_kwh', 'connection_type']).agg('mean').reset_index()

stationed_full_data = stationed_full_data[stationed_full_data['mapped_address'].isin(kept_address)].set_index('mapped_address')

In [None]:
x_train, x_test, y_train, y_test = train_test_split(stationed_full_data[poi_types], stationed_full_data[y], test_size=0.25, random_state=2021)

## KNeighbors with Hour

In [67]:
#mickeys work 



X_matrix = full_data[['lodging', 'supermarket', 'pharmacy', 'park',
       'restaurant', 'clothing_store', 'store', 'school', 'gym', 'library',
       'local_government_office', 'doctor', 'stadium', 'museum', 'church',
       'synagogue']]
y_values = full_data['special_y']



x_train, x_test, y_train, y_test = train_test_split(X_matrix,y_values,test_size=0.2,random_state=0)



print("Shape of the training images array:", X_train.shape)
print("Shape of the training labels array:", y_train.shape)
print("Shape of the validation images array:",X_val.shape)
print("Shape of the validation labels array:",y_val.shape)

Shape of the training images array: (67, 16)
Shape of the training labels array: (67,)
Shape of the validation images array: (17, 16)
Shape of the validation labels array: (17,)


In [73]:
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(x_train, y_train)


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='uniform')

In [74]:
neigh_train_pred = neigh.predict(x_train)
neigh_test_pred = neigh.predict(x_test)


In [75]:
pd.DataFrame({'actual train': y_train, 'predicted train': neigh_train_pred})

Unnamed: 0,actual train,predicted train
73,0.750416,2.678263
48,5.301408,3.089276
74,0.998299,1.841604
83,0.292453,0.479230
61,1.338677,1.535841
...,...,...
80,0.029240,2.354587
67,0.073409,0.200653
64,0.877143,3.089276
47,0.706048,0.385474


In [76]:
pd.DataFrame({'actual train': y_test, 'predicted train': neigh_test_pred})

Unnamed: 0,actual train,predicted train
30,0.505455,1.572109
40,7.433071,1.591145
43,0.237898,0.532352
50,0.842857,1.535841
22,0.292581,0.467705
54,2.537651,0.501118
2,0.966815,0.116725
56,2.574775,1.535841
26,3.929438,3.189191
8,0.503126,2.362323


In [77]:
print('Training MSE: ', mean_squared_error(neigh_train_pred, y_train))
print('Testing MSE: ', mean_squared_error(neigh_test_pred, y_test))


Training MSE:  4.518772652045411
Testing MSE:  23.739743674315424


In [87]:
berkeley_coord = pd.read_csv('/content/drive/MyDrive/Berkeley/sp_21/dataX/Team 19 - Power Forward/V3 Demo/100_berk_points/99intermediate.csv')

berkeley_coord

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0,lodging,supermarket,pharmacy,park,restaurant,clothing_store,store,school,gym,library,local_government_office,doctor,stadium,museum,church,synagogue,lat,lon
1,1,3,2,3,1,15,1,40,16,8,0,2,60,1,0,3,0,37.862827,-122.270287
2,2,3,2,3,2,14,1,36,16,8,0,2,60,1,0,2,0,37.862827,-122.27087744444444
3,3,3,2,3,2,11,1,34,15,8,0,2,60,1,0,3,0,37.862827,-122.27146788888888
4,4,2,2,3,2,6,1,28,15,8,0,2,60,1,0,3,0,37.862827,-122.27205833333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,1,1,0,4,12,4,43,7,5,2,1,33,0,0,10,0,37.852367,-122.27323922222222
97,97,1,1,0,4,14,4,42,7,5,0,1,34,0,0,8,0,37.852367,-122.27382966666666
98,98,1,1,0,3,12,4,41,6,3,0,2,36,0,0,8,0,37.852367,-122.27442011111111
99,99,1,1,0,1,11,4,36,6,2,0,2,30,0,0,8,0,37.852367,-122.27501055555555


In [88]:
berkeley_coord.columns = berkeley_coord.iloc[0]

In [89]:
testdf = berkeley_coord[['lodging', 'supermarket', 'pharmacy', 'park',
       'restaurant', 'clothing_store', 'store', 'school', 'gym', 'library',
       'local_government_office', 'doctor', 'stadium', 'museum', 'church',
       'synagogue']]
testdf = testdf.iloc[1:]




In [90]:
testdf

Unnamed: 0,lodging,supermarket,pharmacy,park,restaurant,clothing_store,store,school,gym,library,local_government_office,doctor,stadium,museum,church,synagogue
1,3,2,3,1,15,1,40,16,8,0,2,60,1,0,3,0
2,3,2,3,2,14,1,36,16,8,0,2,60,1,0,2,0
3,3,2,3,2,11,1,34,15,8,0,2,60,1,0,3,0
4,2,2,3,2,6,1,28,15,8,0,2,60,1,0,3,0
5,2,1,2,2,4,1,22,16,5,0,2,60,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,1,0,4,12,4,43,7,5,2,1,33,0,0,10,0
97,1,1,0,4,14,4,42,7,5,0,1,34,0,0,8,0
98,1,1,0,3,12,4,41,6,3,0,2,36,0,0,8,0
99,1,1,0,1,11,4,36,6,2,0,2,30,0,0,8,0


In [91]:
berkeley_pred = neigh.predict(testdf)
berkeley_pred

array([2.96457894, 1.59114528, 1.59114528, 4.55974774, 4.55974774,
       4.55974774, 3.99760192, 3.99760192, 3.99760192, 3.99760192,
       0.86174978, 1.59114528, 1.59114528, 1.59114528, 4.55974774,
       3.99760192, 3.99760192, 3.99760192, 3.99760192, 2.87208695,
       1.59114528, 1.59114528, 1.59114528, 4.55974774, 4.55974774,
       3.99760192, 3.99760192, 3.99760192, 2.87208695, 2.87208695,
       1.59114528, 1.59114528, 4.55974774, 4.55974774, 4.55974774,
       2.87208695, 2.87208695, 2.87208695, 2.74814528, 2.74814528,
       0.86174978, 0.86174978, 1.53584053, 1.14127017, 3.02918729,
       2.87208695, 2.87208695, 2.74814528, 2.87208695, 2.87208695,
       1.59114528, 1.59114528, 1.59114528, 3.04227597, 3.04227597,
       3.02918729, 2.87208695, 2.87208695, 2.87208695, 2.87208695,
       1.59114528, 1.59114528, 1.59114528, 4.55974774, 3.52880339,
       3.71539169, 3.71539169, 0.69295473, 0.05500538, 0.51665592,
       1.59114528, 1.59114528, 4.55974774, 3.04227597, 3.02918

In [94]:
berkeley_coord = berkeley_coord.iloc[1:]
berkeley_coord['predictions'] = berkeley_pred
berkeley_coord

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,0,lodging,supermarket,pharmacy,park,restaurant,clothing_store,store,school,gym,library,local_government_office,doctor,stadium,museum,church,synagogue,lat,lon,predictions
1,1,3,2,3,1,15,1,40,16,8,0,2,60,1,0,3,0,37.862827,-122.270287,2.964579
2,2,3,2,3,2,14,1,36,16,8,0,2,60,1,0,2,0,37.862827,-122.27087744444444,1.591145
3,3,3,2,3,2,11,1,34,15,8,0,2,60,1,0,3,0,37.862827,-122.27146788888888,1.591145
4,4,2,2,3,2,6,1,28,15,8,0,2,60,1,0,3,0,37.862827,-122.27205833333333,4.559748
5,5,2,1,2,2,4,1,22,16,5,0,2,60,1,0,3,0,37.862827,-122.27264877777777,4.559748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,1,1,0,4,12,4,43,7,5,2,1,33,0,0,10,0,37.852367,-122.27323922222222,1.535841
97,97,1,1,0,4,14,4,42,7,5,0,1,34,0,0,8,0,37.852367,-122.27382966666666,1.535841
98,98,1,1,0,3,12,4,41,6,3,0,2,36,0,0,8,0,37.852367,-122.27442011111111,1.535841
99,99,1,1,0,1,11,4,36,6,2,0,2,30,0,0,8,0,37.852367,-122.27501055555555,1.535841


In [95]:
#berkeley_coord.to_csv("/content/drive/MyDrive/Berkeley/sp_21/dataX/Team 19 - Power Forward/V3 Demo/'KNRegressor_pred.csv")