In [1]:
import numpy as np
import pandas as pd     
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns    
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv("latimes-county-totals.csv")

In [3]:
df.head()

Unnamed: 0,date,county,fips,confirmed_cases,deaths,new_confirmed_cases,new_deaths
0,2020-01-26,Alameda,1,0,0,,
1,2020-01-31,Alameda,1,0,0,0.0,0.0
2,2020-02-02,Alameda,1,0,0,0.0,0.0
3,2020-02-20,Alameda,1,0,0,0.0,0.0
4,2020-02-21,Alameda,1,0,0,0.0,0.0


In [4]:
df.describe()

Unnamed: 0,fips,confirmed_cases,deaths,new_confirmed_cases,new_deaths
count,3152.0,3152.0,3152.0,3094.0,3094.0
mean,59.317259,363.570749,12.736041,18.138009,0.739173
std,32.983935,1646.535743,73.90441,82.994716,4.295371
min,1.0,0.0,0.0,-2.0,-1.0
25%,29.0,3.0,0.0,0.0,0.0
50%,63.0,20.0,0.0,1.0,0.0
75%,85.0,155.0,3.0,9.0,0.0
max,115.0,26238.0,1260.0,1505.0,76.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3152 entries, 0 to 3151
Data columns (total 7 columns):
date                   3152 non-null object
county                 3152 non-null object
fips                   3152 non-null int64
confirmed_cases        3152 non-null int64
deaths                 3152 non-null int64
new_confirmed_cases    3094 non-null float64
new_deaths             3094 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 172.5+ KB


In [6]:
df2 = df.copy(deep=True)
df_la = df2[df2['county'] == 'Los Angeles']
df_la.head()

Unnamed: 0,date,county,fips,confirmed_cases,deaths,new_confirmed_cases,new_deaths
867,2020-01-26,Los Angeles,37,1,0,,
868,2020-01-27,Los Angeles,37,1,0,0.0,0.0
869,2020-01-28,Los Angeles,37,1,0,0.0,0.0
870,2020-01-29,Los Angeles,37,1,0,0.0,0.0
871,2020-01-30,Los Angeles,37,1,0,0.0,0.0


In [7]:
df_la = df_la.drop(labels=['fips'], axis=1)
print (df_la.info())
df_la.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 867 to 966
Data columns (total 6 columns):
date                   100 non-null object
county                 100 non-null object
confirmed_cases        100 non-null int64
deaths                 100 non-null int64
new_confirmed_cases    99 non-null float64
new_deaths             99 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 5.5+ KB
None


Unnamed: 0,confirmed_cases,deaths,new_confirmed_cases,new_deaths
count,100.0,100.0,99.0,99.0
mean,4806.74,200.35,265.020202,12.727273
std,7579.479313,356.384181,370.617686,19.588011
min,1.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0
50%,81.5,1.0,25.0,0.0
75%,8101.25,229.75,480.0,21.5
max,26238.0,1260.0,1505.0,76.0


In [8]:
#replace the only two NAs in the first day of new_confirmed_cases and new_deaths to 0
df_la = df_la.fillna(0)

In [9]:
days = list(range(1,len(df_la)+1))
df_la['culm_day'] = days

In [10]:
df_la.head()

Unnamed: 0,date,county,confirmed_cases,deaths,new_confirmed_cases,new_deaths,culm_day
867,2020-01-26,Los Angeles,1,0,0.0,0.0,1
868,2020-01-27,Los Angeles,1,0,0.0,0.0,2
869,2020-01-28,Los Angeles,1,0,0.0,0.0,3
870,2020-01-29,Los Angeles,1,0,0.0,0.0,4
871,2020-01-30,Los Angeles,1,0,0.0,0.0,5


In [11]:
df_la.columns

Index(['date', 'county', 'confirmed_cases', 'deaths', 'new_confirmed_cases',
       'new_deaths', 'culm_day'],
      dtype='object')

In [12]:
df_la = df_la.drop(labels=['date','county'], axis=1)
print (df_la.info())
df_la.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 867 to 966
Data columns (total 5 columns):
confirmed_cases        100 non-null int64
deaths                 100 non-null int64
new_confirmed_cases    100 non-null float64
new_deaths             100 non-null float64
culm_day               100 non-null int64
dtypes: float64(2), int64(3)
memory usage: 4.7 KB
None


Unnamed: 0,confirmed_cases,deaths,new_confirmed_cases,new_deaths,culm_day
count,100.0,100.0,100.0,100.0,100.0
mean,4806.74,200.35,262.37,12.6,50.5
std,7579.479313,356.384181,369.692273,19.530344,29.011492
min,1.0,0.0,0.0,0.0,1.0
25%,1.0,0.0,0.0,0.0,25.75
50%,81.5,1.0,20.0,0.0,50.5
75%,8101.25,229.75,474.5,20.75,75.25
max,26238.0,1260.0,1505.0,76.0,100.0


In [13]:
print(df_la)

     confirmed_cases  deaths  new_confirmed_cases  new_deaths  culm_day
867                1       0                  0.0         0.0         1
868                1       0                  0.0         0.0         2
869                1       0                  0.0         0.0         3
870                1       0                  0.0         0.0         4
871                1       0                  0.0         0.0         5
..               ...     ...                  ...         ...       ...
962            23233    1119                711.0        54.0        96
963            24262    1174               1029.0        55.0        97
964            24936    1212                674.0        38.0        98
965            25699    1231                763.0        19.0        99
966            26238    1260                539.0        29.0       100

[100 rows x 5 columns]


In [14]:
X = df_la.drop(labels=['deaths'], axis=1)
print (X)
y = df_la.loc[:,'deaths']
print (y)
print (y.unique())
X_col_names = X.columns.values
X_col_names

     confirmed_cases  new_confirmed_cases  new_deaths  culm_day
867                1                  0.0         0.0         1
868                1                  0.0         0.0         2
869                1                  0.0         0.0         3
870                1                  0.0         0.0         4
871                1                  0.0         0.0         5
..               ...                  ...         ...       ...
962            23233                711.0        54.0        96
963            24262               1029.0        55.0        97
964            24936                674.0        38.0        98
965            25699                763.0        19.0        99
966            26238                539.0        29.0       100

[100 rows x 4 columns]
867       0
868       0
869       0
870       0
871       0
       ... 
962    1119
963    1174
964    1212
965    1231
966    1260
Name: deaths, Length: 100, dtype: int64
[   0    1    2    4    5    8   10 

array(['confirmed_cases', 'new_confirmed_cases', 'new_deaths', 'culm_day'],
      dtype=object)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(X_train)
print(len(X_train),len(y_train))
print(len(X_test),len(y_test))

     confirmed_cases  new_confirmed_cases  new_deaths  culm_day
934             4071                543.0        14.0        68
966            26238                539.0        29.0       100
921              295                 64.0         0.0        55
962            23233                711.0        54.0        96
955            17567               1118.0        66.0        89
..               ...                  ...         ...       ...
942             8453                469.0        19.0        76
876                1                  0.0         0.0        10
939             6944                553.0        26.0        73
879                1                  0.0         0.0        13
904                1                  0.0         0.0        38

[70 rows x 4 columns]
70 70
30 30


In [16]:
print (X_test)
min_max_scaler = MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax 

     confirmed_cases  new_confirmed_cases  new_deaths  culm_day
947            10517                442.0        39.0        81
951            12349                298.0        24.0        85
900                1                  0.0         0.0        34
948            10895                378.0        52.0        82
960            21017                557.0        56.0        94
884                1                  0.0         0.0        18
903                1                  0.0         0.0        37
949            11421                526.0        44.0        83
936             5325                720.0        26.0        70
932             3037                532.0        10.0        66
959            20460                893.0        32.0        93
906               11                  4.0         0.0        40
923              411                 58.0         1.0        57
919              192                 45.0         0.0        53
918              147                 53.

array([[4.00808019e-01, 2.99864315e-01, 5.13157895e-01, 8.08080808e-01],
       [4.70633075e-01, 2.02170963e-01, 3.15789474e-01, 8.48484848e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.33333333e-01],
       [4.15215154e-01, 2.56445047e-01, 6.84210526e-01, 8.18181818e-01],
       [8.01006213e-01, 3.77883311e-01, 7.36842105e-01, 9.39393939e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.71717172e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.63636364e-01],
       [4.35263178e-01, 3.56852103e-01, 5.78947368e-01, 8.28282828e-01],
       [2.02919541e-01, 4.88466757e-01, 3.42105263e-01, 6.96969697e-01],
       [1.15714449e-01, 3.60922659e-01, 1.31578947e-01, 6.56565657e-01],
       [7.79776651e-01, 6.05834464e-01, 4.21052632e-01, 9.29292929e-01],
       [3.81141137e-04, 2.71370421e-03, 0.00000000e+00, 3.93939394e-01],
       [1.56267866e-02, 3.93487110e-02, 1.31578947e-02, 5.65656566e-01],
       [7.27979571e-03, 3.05291723e-02, 0.00000000e

## RBF
https://scikit-learn.org/stable/auto_examples/svm/plot_svm_regression.html

In [17]:
C_range = 10.0 ** np.arange(-2, 4)
gamma_range = [.01, .1, 1, 'auto', 10, 100]
print (gamma_range)
param_grid = dict(gamma=gamma_range, C=C_range)
param_grid

[0.01, 0.1, 1, 'auto', 10, 100]


{'gamma': [0.01, 0.1, 1, 'auto', 10, 100],
 'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}

In [18]:
grid = GridSearchCV(SVR(kernel='rbf', cache_size=1000), param_grid=param_grid)
grid.fit(X_train_minmax, y_train)
grid.best_estimator_



SVR(C=1000.0, cache_size=1000, coef0=0.0, degree=3, epsilon=0.1, gamma=1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [19]:
svr = SVR(C=1000, gamma=1, kernel='rbf', cache_size=1000,degree=3)
clf_test = svr.fit(X_train_minmax, y_train)
y_pred_minmax_test = svr.predict(X_test_minmax)
print(y_pred_minmax_test)

[ 3.87441522e+02  4.64741427e+02 -2.13109886e-01  4.48500842e+02
  1.00525371e+03 -8.53339118e-03 -1.14401058e-01  4.53147175e+02
  1.30419321e+02  6.12443373e+01  9.85594476e+02  1.47649374e-01
  6.97770824e+00  4.75829448e+00  4.06505953e+00 -2.26523623e-01
 -2.32220451e-01  9.23918366e-01  3.01964836e+02  5.43314959e-02
  1.37602996e-01  2.13476757e+02  1.21750288e+03  2.85792521e+01
 -4.10733014e-02 -1.58323444e-01  9.57245399e+02 -1.99386951e-01
  1.44743578e+00  4.58663137e-02]


In [20]:
#evaluate model using rmse and mae
#rbf mae with minmaxscale
mean_absolute_error(y_test, y_pred_minmax_test, multioutput='raw_values')

array([14.45408246])

In [21]:
#rbf root mean squared error with minmaxscale
mse = mean_squared_error(y_test,y_pred_minmax_test)
rmse= np.sqrt(mse)
rmse

34.29078353839602

In [22]:
svr_rbf = SVR(C=1000, gamma=1, kernel='rbf', cache_size=1000,degree=3)
svr_rbf.fit(X_train,y_train)
y_predict = svr_rbf.predict(X_test)
y_predict

array([215.78888263, 215.78888263, 136.42864444, 215.78888263,
       215.78888263,  73.81124862, 132.50115839, 215.78888263,
       215.78888263, 215.78888263, 215.78888263, 215.76337022,
       215.78888263, 215.78888263, 215.78888263, 208.49761824,
       147.31424713, 215.78887934, 215.78888263,  85.23713002,
        90.74152861, 215.78888263, 215.78888263, 215.78888263,
        73.68610674, 132.50112919, 215.78888263,  83.23967107,
       215.78888263, 215.78888263])

In [23]:
#rbf mae without mixmaxscale
mean_absolute_error(y_test, y_predict, multioutput='raw_values')

array([256.7170244])

In [24]:
#rbf rmse without minmascale
mse = mean_squared_error(y_test,y_predict)
rmse= np.sqrt(mse)
rmse

353.6082277331902

# Linear

In [25]:
C_range = 10.0 ** np.arange(-2, 4)
gamma_range = [.01, .1, 1, 'auto', 10, 100]
print (gamma_range)
param_grid = dict(gamma=gamma_range, C=C_range)
param_grid

[0.01, 0.1, 1, 'auto', 10, 100]


{'gamma': [0.01, 0.1, 1, 'auto', 10, 100],
 'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}

In [26]:
grid = GridSearchCV(SVR(kernel='linear', cache_size=1000), param_grid=param_grid)
grid.fit(X_train_minmax, y_train)
grid.best_estimator_



SVR(C=1000.0, cache_size=1000, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [27]:
svr = SVR(C=1000, gamma=0.01, kernel='linear', cache_size=1000,degree=3)
clf_test = svr.fit(X_train_minmax, y_train)
y_pred_minmax_test = svr.predict(X_test_minmax)
print(y_pred_minmax_test)

[ 4.94109987e+02  5.86806173e+02 -3.21471441e+00  5.26869580e+02
  1.01096633e+03 -5.62668026e-01 -3.71197311e+00  5.34180628e+02
  2.06388209e+02  1.00395575e+02  9.37857731e+02 -4.07274985e+00
  8.68591389e+00 -9.55380532e-01 -3.72378605e+00 -3.04896151e+00
 -2.88320861e+00 -4.18976015e+00  4.48899232e+02  5.97602268e-01
  1.92362546e+00  3.25421019e+02  1.18137039e+03  5.56010450e+01
 -8.94173824e-01 -3.54622021e+00  1.00376123e+03 -2.22019702e+00
 -4.10764256e+00 -4.28281907e+00]


In [28]:
#linear kernel mae with minmaxscale
mean_absolute_error(y_test, y_pred_minmax_test, multioutput='raw_values')

array([25.72191591])

In [29]:
#linear kernel rmse with minmaxscale
mse = mean_squared_error(y_test,y_pred_minmax_test)
rmse= np.sqrt(mse)
rmse

44.843479490139195

In [30]:
svr_linear = SVR(C=1000, gamma=0.01, kernel='linear', cache_size=1000,degree=3)

In [31]:
svr_linear.fit(X_train,y_train)

SVR(C=1000, cache_size=1000, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [32]:
y_predict = svr_linear.predict(X_test)
y_predict

array([ 1.17992323e+04,  2.94166531e+04, -3.40649249e+01,  4.35744241e+03,
        3.95370290e+04, -8.03432141e+00, -3.89456630e+01,  1.15841599e+04,
        5.83502368e+02,  3.39226905e+03,  5.33594816e+04, -1.41826182e+01,
        6.69475094e+02,  5.65840827e+02,  3.80563017e+02, -3.24380121e+01,
       -3.08110994e+01,  1.79449468e+01,  1.72055697e+04,  3.35406759e+00,
        1.63693693e+01,  8.58615437e+03,  6.65997294e+04,  1.95379470e+03,
       -1.12881468e+01, -3.73187503e+01,  3.98218131e+04, -2.43034486e+01,
        5.60260285e+01, -3.18219770e+01])

In [33]:
#linear kernel mae without minmaxscale
mean_absolute_error(y_test, y_predict, multioutput='raw_values')

array([9441.25377903])

In [34]:
#linear kernel rmse without minmaxscale
mse = mean_squared_error(y_test,y_predict)
#root mean square
rmse= np.sqrt(mse)
rmse

19560.879858259836

# Polynomial


In [40]:
C_range = 10.0 ** np.arange(-2, 4)
gamma_range = [.01, .1, 1, 'auto', 10, 100]
print (gamma_range)
param_grid = dict(gamma=gamma_range, C=C_range)
param_grid

[0.01, 0.1, 1, 'auto', 10, 100]


{'gamma': [0.01, 0.1, 1, 'auto', 10, 100],
 'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}

In [41]:
grid = GridSearchCV(SVR(kernel='poly', cache_size=1000), param_grid=param_grid)
grid.fit(X_train_minmax, y_train)
grid.best_estimator_



SVR(C=1000.0, cache_size=1000, coef0=0.0, degree=3, epsilon=0.1, gamma=1,
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [42]:
svr_poly = SVR(C=1000, gamma=1, kernel='poly', cache_size=1000,degree=3)
clf_test = svr_poly.fit(X_train_minmax, y_train)
y_pred_minmax_test = svr_poly.predict(X_test_minmax)
print(y_pred_minmax_test)

[ 3.79545612e+02  4.25653605e+02  9.22359871e-02  4.43380563e+02
  9.86697927e+02 -7.43365680e-02  1.49787922e-01  4.41368357e+02
  1.44578758e+02  6.07407525e+01  9.12042891e+02  2.39200456e-01
  5.22902684e+00  1.82111712e+00  1.25195797e+00  7.52211461e-02
  5.92371822e-02  4.66973360e-01  2.95234033e+02 -9.53460577e-02
 -1.00672256e-01  2.20236138e+02  1.19436799e+03  2.99807484e+01
 -6.38881993e-02  1.29487160e-01  1.15144030e+03  4.96579916e-03
  6.05681404e-01  1.82221622e-01]


In [43]:
#poly mae with minmaxscale
mean_absolute_error(y_test, y_pred_minmax_test, multioutput='raw_values')

array([17.33847406])

In [44]:
#poly kernel rmse with minmaxscale
mse = mean_squared_error(y_test,y_pred_minmax_test)
rmse= np.sqrt(mse)
rmse

39.40049458335907

# Sigmoid

In [35]:
C_range = 10.0 ** np.arange(-2, 4)
gamma_range = [.01, .1, 1, 'auto', 10, 100]
print (gamma_range)
param_grid = dict(gamma=gamma_range, C=C_range)
param_grid

[0.01, 0.1, 1, 'auto', 10, 100]


{'gamma': [0.01, 0.1, 1, 'auto', 10, 100],
 'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}

In [36]:
grid = GridSearchCV(SVR(kernel='sigmoid', cache_size=1000), param_grid=param_grid)
grid.fit(X_train_minmax, y_train)
grid.best_estimator_



SVR(C=1000.0, cache_size=1000, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='sigmoid', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [37]:
svr_sigmoid = SVR(C=1000, gamma=0.1, kernel='sigmoid', cache_size=1000,degree=3)
clf_test = svr_sigmoid.fit(X_train_minmax, y_train)
y_pred_minmax_test = svr_sigmoid.predict(X_test_minmax)
print(y_pred_minmax_test)

[ 3.97599118e+02  3.60336833e+02 -7.71852147e-01  4.41865926e+02
  6.62734721e+02 -2.27212785e-01 -8.78266203e-01  4.42453886e+02
  2.90471094e+02  1.64821136e+02  6.12411898e+02 -2.45560355e-01
  1.78971582e+01  8.37100093e+00  8.62130853e+00 -7.36734479e-01
 -7.01784943e-01 -6.63222545e-01  3.11703685e+02  2.59663970e-03
  2.62182027e-01  3.24566359e+02  6.86682094e+02  1.00739072e+02
 -2.93599035e-01 -8.42612786e-01  8.14776221e+02 -5.63563493e-01
 -2.31838403e-01 -1.47510729e-02]


In [38]:
#sigmoid kernel mae with minmaxscale
mean_absolute_error(y_test, y_pred_minmax_test, multioutput='raw_values')

array([76.64528414])

In [39]:
#sigmoid kernel rmse with minmaxscale
mse = mean_squared_error(y_test,y_pred_minmax_test)
rmse= np.sqrt(mse)
rmse

151.84385813681763