In [1]:
import numpy as np
import pandas as pd
import calendar
import matplotlib.pyplot as plt
import re

In [2]:
from segmentation import draw_plot, draw_segments
from segmentation import sliding_window, bottomupsegment, swab
from segmentation import sumsquared_error, regression, compute_error

In [3]:
universal_error = 2.75
swab_buffer_percent = 0.15

### COVID19

In [4]:
#Import Covid Dataset

ds_covid = pd.read_csv("Data/COVID/owid-covid-data.csv")
#If missing values: Interpolate: ds_covid = ds_covid.interpolate(method='nearest')
ds_covid = ds_covid.fillna(0)
ds_covid.date = pd.to_datetime(ds_covid.date)
ds_covid['month'] = pd.DatetimeIndex(ds_covid['date']).month
ds_covid['month'] = ds_covid['month'].apply(lambda x: calendar.month_name[x])
ds_covid['year'] = pd.DatetimeIndex(ds_covid['date']).year
ds_covid.set_index(['date'],inplace=True)

In [5]:
sw_covid = []
bu_covid = []
swab_covid = []


countries = ['USA', 'IND', 'BRA', 'RUS', 'GBR', 'FRA', 'ESP', 'ITA' , 'TUR',  'DEU']
for iso in countries:
    
    print("Processing Country: ", iso)
    
    country = ds_covid[ds_covid['iso_code']==iso][['new_cases','month', 'year']].reset_index().drop(columns=['date'])
    country_cases_raw = country['new_cases'].tolist()

    #Log-normalize data
    trans = np.ma.log(country_cases_raw)
    country_cases = trans.filled(0)
    
    sliding_window_data = sliding_window(country_cases, max_error = universal_error)
    print("Sliding Window Done")
    bottom_up_data = bottomupsegment(country_cases, regression, sumsquared_error, max_error=universal_error)
    print("Bottom Up Done")
    swab_data = swab(country_cases, buffer_percent=swab_buffer_percent, bottom_up_error = universal_error, best_line_error = universal_error)
    print("SWAB Done")
    sw_error = compute_error(country_cases, sliding_window_data)
    bu_error = compute_error(country_cases, bottom_up_data)
    swab_error = compute_error(country_cases, swab_data)
    print("Errors Computed")
    sw_covid.append(sw_error)
    bu_covid.append(bu_error)
    swab_covid.append(swab_error)

Processing Country:  USA
Sliding Window Done


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)


Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  IND
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  BRA
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  RUS
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  GBR
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  FRA
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  ESP
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  ITA
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  TUR
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  DEU
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed


In [21]:
print("Total SSE of Residuals")
print("SW: ", np.mean([x[0] for x in sw_covid]))
print("BU: ", np.mean([x[0] for x in bu_covid]))
print("SWAB: ", np.mean([x[0] for x in swab_covid]))

Total SSE of Residuals
SW:  300.6186009605213
BU:  27.079123255475583
SWAB:  27.16266057100004


In [22]:
print("r-squared fit")
print("SW: ", np.mean([x[2] for x in sw_covid]))
print("BU: ", np.mean([x[2] for x in bu_covid]))
print("SWAB: ", np.mean([x[2] for x in swab_covid]))

r-squared fit
SW:  0.12462149425871576
BU:  0.13992916183849033
SWAB:  0.14075009400182664


### DOTS

In [8]:
#Import DOTS Dataset
ds_dots = pd.read_csv("Data/DOTS/Exports.csv")

In [9]:
sw_dots = []
bu_dots = []
swab_dots = []

countries = ['United States', 'India', 'Brazil', 'USSR', 'United Kingdom', 'France', 'Spain', 'Italy' , 'Turkey', 'Germany']

for iso in countries:
    
    #Load and Process Data
    country = ds_dots.loc[ds_dots['Location'] == iso]
    time = country.columns.tolist()[1:]
    for row in country.iterrows():
        values = row[1]
    values = [str(x) for x in values]
    values = [float(re.sub(',', '', x)) for x in values[1:]]
    time = [re.sub('M', '-', x) for x in time]
    country = pd.DataFrame(list(zip(time, values)), columns = ['Date', 'Exports'])
    
    country = country.fillna(0)
    country.Date = pd.to_datetime(country.Date)
    country['month'] = pd.DatetimeIndex(country['Date']).month
    country['month'] = country['month'].apply(lambda x: calendar.month_name[x])
    country['year'] = pd.DatetimeIndex(country['Date']).year
    country.set_index(['Date'],inplace=True)
    
    country = country[['Exports','month', 'year']].reset_index().drop(columns=['Date'])
    country_exports_raw = country['Exports'].tolist()

    #Log-normalize data
    trans = np.ma.log(country_exports_raw)
    country_exports = trans.filled(0)
    
    sliding_window_data = sliding_window(country_exports, max_error = universal_error)
    print("Sliding Window Done")
    bottom_up_data = bottomupsegment(country_exports, regression, sumsquared_error, max_error=universal_error)
    print("Bottom Up Done")
    swab_data = swab(country_exports, buffer_percent=swab_buffer_percent, bottom_up_error = universal_error, best_line_error = universal_error)
    print("SWAB Done")
    sw_error = compute_error(country_exports, sliding_window_data)
    bu_error = compute_error(country_exports, bottom_up_data)
    swab_error = compute_error(country_exports, swab_data)
    print("Errors Computed")
    sw_dots.append(sw_error)
    bu_dots.append(bu_error)
    swab_dots.append(swab_error)

Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed


In [23]:
print("Total SSE of Residuals")
print("SW: ", np.mean([x[0] for x in sw_dots]))
print("BU: ", np.mean([x[0] for x in bu_dots]))
print("SWAB: ", np.mean([x[0] for x in swab_dots]))

Total SSE of Residuals
SW:  6.914029279466287
BU:  4.989463640542703
SWAB:  4.900291009505039


In [24]:
print("r-squared fit")
print("SW: ", np.mean([x[2] for x in sw_dots]))
print("BU: ", np.mean([x[2] for x in bu_dots]))
print("SWAB: ", np.mean([x[2] for x in swab_dots]))

r-squared fit
SW:  0.08426948852609516
BU:  0.07620623427899338
SWAB:  0.08930809921017868


### Pollution

In [12]:
#Import US Pollution Dataset
ds_poll = pd.read_csv("Data/USPollution/USPollution.csv")
ds_poll = ds_poll.dropna()
ds_poll['Date Local'] = pd.to_datetime(ds_poll['Date Local'])
ds_poll['month'] = pd.DatetimeIndex(ds_poll['Date Local']).month
ds_poll['month'] = ds_poll['month'].apply(lambda x: calendar.month_name[x])
ds_poll['year'] = pd.DatetimeIndex(ds_poll['Date Local']).year
ds_poll.set_index(['Date Local'],inplace=True)

In [12]:
sw_poll = []
bu_poll = []
swab_poll = []

for i in [1,2,5,8,9,10,11,12,13,15]:
    
    location = ds_poll[ds_poll['State Code']==i]['State'].iloc[[1]][0]
    iso = location
    
    country = ds_poll[ds_poll['State Code']==i][['CO Mean','month', 'year']].reset_index().drop(columns=['Date Local'])
    country_poll = country['CO Mean'].tolist()
    
    print("Processing Country: ", iso)
    
    #Log-normalize data
    trans = np.ma.log(country_poll)
    country_poll = trans.filled(0)
    
    sliding_window_data = sliding_window(country_poll, max_error = universal_error)
    print("Sliding Window Done")
    bottom_up_data = bottomupsegment(country_poll, regression, sumsquared_error, max_error=universal_error)
    print("Bottom Up Done")
    swab_data = swab(country_poll, buffer_percent=swab_buffer_percent, bottom_up_error = universal_error, best_line_error = universal_error)
    print("SWAB Done")
    sw_error = compute_error(country_poll, sliding_window_data)
    bu_error = compute_error(country_poll, bottom_up_data)
    swab_error = compute_error(country_poll, swab_data)
    print("Errors Computed")
    sw_poll.append(sw_error)
    bu_poll.append(bu_error)
    swab_poll.append(swab_error)

Processing Country:  Alabama
Sliding Window Done


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)


Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  Alaska
Sliding Window Done


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)


Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  Arkansas


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)


Sliding Window Done


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/mandarsharma/trans/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-cb370887fdf6>", line 21, in <module>
    bottom_up_data = bottomupsegment(country_poll, regression, sumsquared_error, max_error=universal_error)
  File "/home/mandarsharma/Segmentation/segmentation.py", line 89, in bottomupsegment
    mergedsegments = [create_segment(sequence,(seg1[0],seg2[2])) for seg1,seg2 in zip(segments[:-1],segments[1:])]
  File "/home/mandarsharma/Segmentation/segmentation.py", line 89, in <listcomp>
    mergedsegments = [create_segment(sequence,(seg1[0],seg2[2])) for seg1,seg2 in zip(segments[:-1],segments[1:])]
  File "/home/mandarsharma/Segmentation/segmentation.py", line 41, in regression
    p, error = leastsquareslinefit(sequence,seq_range)
  File "/home/mandarsharma/Segmentation/segmentation.py", line 28, in leastsquareslinefi

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/mandarsharma/trans/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-cb370887fdf6>", line 21, in <module>
    bottom_up_data = bottomupsegment(country_poll, regression, sumsquared_error, max_error=universal_error)
  File "/home/mandarsharma/Segmentation/segmentation.py", line 89, in bottomupsegment
    mergedsegments = [create_segment(sequence,(seg1[0],seg2[2])) for seg1,seg2 in zip(segments[:-1],segments[1:])]
  File "/home/mandarsharma/Segmentation/segmentation.py", line 89, in <listcomp>
    mergedsegments = [create_segment(sequence,(seg1[0],seg2[2])) for seg1,seg2 in zip(segments[:-1],segments[1:])]
  File "/home/mandarsharma/Segmentation/segmentation.py", line 41, in regression
    p, error = leastsquareslinefit(sequence,seq_range)
  File "/home/mandarsharma/Segmentation/segmentation.py", line 28, in leastsquareslinefi

TypeError: object of type 'NoneType' has no len()

In [25]:
print("Total SSE of Residuals")
print("SW: ", np.mean([x[0] for x in sw_poll]))
print("BU: ", np.mean([x[0] for x in bu_poll]))
print("SWAB: ", np.mean([x[0] for x in swab_poll]))

Total SSE of Residuals
SW:  73.91283930957627
BU:  67.53944566644341
SWAB:  67.11458746910141


In [26]:
print("r-squared fit")
print("SW: ", np.mean([x[2] for x in sw_poll]))
print("BU: ", np.mean([x[2] for x in bu_poll]))
print("SWAB: ", np.mean([x[2] for x in swab_poll]))

r-squared fit
SW:  0.15241795562537705
BU:  0.16333281843843664
SWAB:  0.16146632775940198


### Population

In [16]:
#Import Population Dataset
ds_pop = pd.read_csv("Data/Population/Pop.csv")
ds_pop = ds_pop.dropna()

In [18]:
sw_pop = []
bu_pop = []
swab_pop = []

countries = ['USA', 'IND', 'BRA', 'GBR', 'FRA', 'ESP', 'ITA' , 'TUR',  'DEU']

for c in countries:
    
    print("Processing Country: ", c)
    
    country = ds_pop[ds_pop['Code']==c][['Population by Country (Clio Infra (2016))','Year']].reset_index().drop(columns=['index'])
    country_pop_raw = country['Population by Country (Clio Infra (2016))'].tolist()

    #Log-normalize data
    trans = np.ma.log(country_pop_raw)
    country_pop = trans.filled(0)
    
    sliding_window_data = sliding_window(country_pop, max_error = universal_error)
    print("Sliding Window Done")
    bottom_up_data = bottomupsegment(country_pop, regression, sumsquared_error, max_error=universal_error)
    print("Bottom Up Done")
    swab_data = swab(country_pop, buffer_percent=swab_buffer_percent, bottom_up_error = universal_error, best_line_error = universal_error)
    print("SWAB Done")
    sw_error = compute_error(country_pop, sliding_window_data)
    bu_error = compute_error(country_pop, bottom_up_data)
    swab_error = compute_error(country_pop, swab_data)
    print("Errors Computed")
    sw_pop.append(sw_error)
    bu_pop.append(bu_error)
    swab_pop.append(swab_error)


Processing Country:  USA
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  IND
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  BRA
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  GBR
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  FRA
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  ESP
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  ITA
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  TUR
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed
Processing Country:  DEU
Sliding Window Done
Bottom Up Done
SWAB Done
Errors Computed


In [27]:
print("Total SSE of Residuals")
print("SW: ", np.mean([x[0] for x in sw_pop]))
print("BU: ", np.mean([x[0] for x in bu_pop]))
print("SWAB: ", np.mean([x[0] for x in swab_pop]))

Total SSE of Residuals
SW:  0.9231444360565619
BU:  0.7582257038563937
SWAB:  0.11423230025743493


In [28]:
print("r-squared fit")
print("SW: ", np.mean([x[2] for x in sw_pop]))
print("BU: ", np.mean([x[2] for x in bu_pop]))
print("SWAB: ", np.mean([x[2] for x in swab_pop]))

r-squared fit
SW:  0.6177105480350316
BU:  0.645641667600306
SWAB:  0.37785434340708235


### Global Temperature

In [19]:
#Import Land Temp Dataset
ds_gtemp = pd.read_csv("Data/GlobalTemperature/GlobalLandTemperaturesByCountry.csv")
ds_gtemp = ds_gtemp.dropna()
ds_gtemp['dt'] = pd.to_datetime(ds_gtemp['dt'])
ds_gtemp['month'] = pd.DatetimeIndex(ds_gtemp['dt']).month
ds_gtemp['month'] = ds_gtemp['month'].apply(lambda x: calendar.month_name[x])
ds_gtemp['year'] = pd.DatetimeIndex(ds_gtemp['dt']).year
ds_gtemp.set_index(['dt'],inplace=True)

In [20]:
sw_temp = []
bu_temp = []
swab_temp = []

countries = ['United States', 'India', 'Brazil', 'Russia', 'United Kingdom', 'France', 'Spain', 'Italy' , 'Turkey', 'Germany']

for c in countries:
    
    print("Processing Country: ", c)
    
    country = ds_gtemp[ds_gtemp['Country']==c][['AverageTemperature','month', 'year']].reset_index().drop(columns=['dt'])
    country_gtemp_raw = country['AverageTemperature'].tolist()
    
    #Log-normalize data
    trans = np.ma.log(country_gtemp_raw)
    country_gtemp = trans.filled(0)
    
    sliding_window_data = sliding_window(country_gtemp, max_error = universal_error)
    print("Sliding Window Done")
    bottom_up_data = bottomupsegment(country_gtemp, regression, sumsquared_error, max_error=universal_error)
    print("Bottom Up Done")
    swab_data = swab(country_gtemp, buffer_percent=swab_buffer_percent, bottom_up_error = universal_error, best_line_error = universal_error)
    print("SWAB Done")
    sw_error = compute_error(country_gtemp, sliding_window_data)
    bu_error = compute_error(country_gtemp, bottom_up_data)
    swab_error = compute_error(country_gtemp, swab_data)
    print("Errors Computed")
    sw_pop.append(sw_error)
    bu_pop.append(bu_error)
    swab_pop.append(swab_error)


Processing Country:  United States
Sliding Window Done

  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



Bottom Up Done


KeyboardInterrupt: 

In [29]:
print("Total SSE of Residuals")
print("SW: ", np.mean([x[0] for x in sw_gtemp]))
print("BU: ", np.mean([x[0] for x in bu_gtemp]))
print("SWAB: ", np.mean([x[0] for x in swab_gtemp]))

Total SSE of Residuals


NameError: name 'sw_gtemp' is not defined

In [None]:
print("Total SSE of Residuals")
print("SW: ", np.mean([x[2] for x in sw_gtemp]))
print("BU: ", np.mean([x[2] for x in bu_gtemp]))
print("SWAB: ", np.mean([x[2] for x in swab_gtemp]))