In [41]:
import numpy as np
import pandas as pd

In [42]:
# Load the training set; `date` is parsed to datetime64 so date arithmetic works later on.
train = pd.read_csv("originalni_datasetovi/train.csv", parse_dates=['date'])
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() only appends a stray "None" to the output.
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB
None


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [43]:
# Load the test set; `date` is parsed to datetime64 so date arithmetic works later on.
test = pd.read_csv("originalni_datasetovi/test.csv", parse_dates=['date'])
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() only appends a stray "None" to the output.
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           28512 non-null  int64         
 1   date         28512 non-null  datetime64[ns]
 2   store_nbr    28512 non-null  int64         
 3   family       28512 non-null  object        
 4   onpromotion  28512 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 1.1+ MB
None


Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [44]:
# Total number of missing cells in each frame (train first, then test).
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

# Number of distinct product families / stores present in train.
NO_FAMILIES = train["family"].nunique()
NO_STORES = train["store_nbr"].nunique()

0
0


In [45]:
# Missing dates
# A gap-free daily calendar between the first and last date contains
# (max - min).days + 1 days; comparing that with the number of distinct
# dates actually present reveals whether whole days are absent.

# train set
date_min_train, date_max_train = train["date"].min(), train["date"].max()
nbr_days_train = (date_max_train - date_min_train).days + 1
print("Number of days that should be in train set: ", nbr_days_train)
print("Number of days that are in train: ", train["date"].nunique())

# test set
date_min_test, date_max_test = test["date"].min(), test["date"].max()
nbr_days_test = (date_max_test - date_min_test).days + 1
print("Number of days that should be in test set: ", nbr_days_test)
print("Number of days that are in test: ", test["date"].nunique())

Number of days that should be in train set:  1688
Number of days that are in train:  1684
Number of days that should be in test set:  16
Number of days that are in test:  16


In [46]:
# Calendar days inside the train span that have no rows at all
# (there are 4 missing dates in train set).
full_train_calendar = pd.date_range(start=date_min_train, end=date_max_train)
missing_dates = full_train_calendar.difference(train.date.unique())
missing_dates

DatetimeIndex(['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25'], dtype='datetime64[ns]', freq=None)

In [47]:
# Checking whether any other date is missing data for some stores.
# The store and family counts come from the values already derived from the data
# (NO_STORES / NO_FAMILIES) and the number of absent days from `missing_dates`,
# so the check cannot silently drift from the data the way literals could.
nbr_stores = NO_STORES
nbr_families = NO_FAMILIES
# number of rows that train should have: one per (day, store, family)
print("number of rows that train should have: ",nbr_days_train*nbr_stores*nbr_families)
print("number of rows train has: ", train.shape[0])
print("number of rows train will have after adding christmas: ", train.shape[0]+len(missing_dates)*nbr_families*nbr_stores)

number of rows that train should have:  3008016
number of rows train has:  3000888
number of rows train will have after adding christmas:  3008016


In [48]:
# adding missing dates
# Build the complete (date, store_nbr, family) grid and reindex train onto it;
# combinations that had no row come back with NaN in the remaining columns.
key_cols = ['date', 'store_nbr', 'family']
new_index = pd.MultiIndex.from_product(
    [
        pd.date_range(date_min_train, date_max_train),
        train.store_nbr.unique(),
        train.family.unique(),
    ],
    names=key_cols,
)
train = train.set_index(key_cols).reindex(new_index).reset_index()
print("Number of days that are in train: ", train["date"].nunique())

Number of days that are in train:  1688


In [49]:
# Missing values per column after reindexing onto the full grid.
train.isnull().sum()

date              0
store_nbr         0
family            0
id             7128
sales          7128
onpromotion    7128
dtype: int64

In [50]:
# Rows introduced by the reindex get 0 sales and 0 promotions; `id` is left as NaN.
filled_cols = ["sales", "onpromotion"]
train[filled_cols] = train[filled_cols].fillna(0.)

In [51]:
# Missing values per column after filling sales / onpromotion.
train.isnull().sum()

date              0
store_nbr         0
family            0
id             7128
sales             0
onpromotion       0
dtype: int64

In [52]:
#dealing with holidays
holidays = pd.read_csv("originalni_datasetovi/holidays_events.csv", parse_dates=['date'])
#TYPE: transferred - not actually celebrated on that day

# Holidays that were moved away from their calendar date.
transferred_holidays = (
    holidays[(holidays.type == "Holiday") & (holidays.transferred == True)]
    .drop("transferred", axis=1)
    .reset_index(drop=True)
)
# The days those holidays were moved to.
transfer = (
    holidays[holidays.type == "Transfer"]
    .drop("transferred", axis=1)
    .reset_index(drop=True)
)

# The two frames are paired purely by position (i-th transferred holiday with the
# i-th Transfer row), so they must have the same length; otherwise dates would be
# attached to the wrong holiday or come out as NaT.
assert len(transferred_holidays) == len(transfer), (
    "transferred holidays and Transfer rows do not pair up 1:1"
)

# Keep every attribute of the original holiday but take the date on which it was
# actually observed. Assigning by column name replaces the earlier positional
# column pick over a frame with duplicated column names.
tr = transferred_holidays.assign(date=transfer["date"])
tr

Unnamed: 0,date,type,locale,locale_name,description
0,2012-10-12,Holiday,National,Ecuador,Independencia de Guayaquil
1,2013-10-11,Holiday,National,Ecuador,Independencia de Guayaquil
2,2014-10-10,Holiday,National,Ecuador,Independencia de Guayaquil
3,2016-05-27,Holiday,National,Ecuador,Batalla de Pichincha
4,2016-07-24,Holiday,Local,Guayaquil,Fundacion de Guayaquil
5,2016-08-12,Holiday,National,Ecuador,Primer Grito de Independencia
6,2017-01-02,Holiday,National,Ecuador,Primer dia del ano
7,2017-04-13,Holiday,Local,Cuenca,Fundacion de Cuenca
8,2017-05-26,Holiday,National,Ecuador,Batalla de Pichincha
9,2017-08-11,Holiday,National,Ecuador,Primer Grito de Independencia


In [53]:
# Holidays that were transferred have been given the date of their matching
# Transfer event, because that is when they were actually celebrated.
# So the holiday row now carries the transfer's date,
# and the Transfer row itself is removed.


# Drop the transferred holidays and the Transfer rows from holidays; the
# `transferred` column is no longer needed afterwards.
keep_mask = holidays.transferred.eq(False) & holidays.type.ne('Transfer')
holidays = holidays[keep_mask].drop(columns='transferred')

# Append the re-dated transferred holidays built in `tr`.
holidays = pd.concat([holidays, tr]).reset_index(drop=True)

holidays.head()

Unnamed: 0,date,type,locale,locale_name,description
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba


In [54]:
import re
# TYPE: Additional

# have the same name as regular holiday but with + or - number
# removing +-number
def clean_description(desc):
    """Strip a trailing day offset such as ``+1`` or ``-4`` from a holiday name.

    Only a signed number at the very end of the string (with optional
    surrounding whitespace) is removed, so names that merely contain a hyphen
    or a digit elsewhere (e.g. ``"Ecuador-Suiza"``) are left untouched.

    Parameters
    ----------
    desc : str
        Holiday description, e.g. ``"Navidad-4"``.

    Returns
    -------
    str
        The description without the trailing offset, e.g. ``"Navidad"``.
    """
    return re.sub(r'\s*[+-]\s*\d+\s*$', '', desc)

# Normalise every description with clean_description.
holidays["description"] = holidays["description"].map(clean_description)

# TYPE: Bridge

# Bridge days have the same name as the regular holiday, prefixed with "Puente ";
# the prefix is removed as a literal substring.
holidays["description"] = holidays["description"].str.replace("Puente ", "", regex=False)

In [55]:
# TYPE: Work Day

# Work days are not actually holidays: they are days that would normally be free
# (e.g. Saturdays) but are worked. They are split off into `work_days` and
# removed from `holidays`.
is_work_day = holidays['type'] == 'Work Day'
# .copy() gives both frames their own data, so the .loc assignment below writes to
# an independent frame instead of a slice of the old one (no SettingWithCopyWarning).
work_days = holidays[is_work_day].copy()
holidays = holidays[~is_work_day].copy()

# TYPE: Event

# There are many football events referred to by the match name that include the
# word 'futbol'; they are collapsed into a single "Futbol" description.
# regex=False matches the literal substring; na=False treats a missing description
# as "no match" instead of producing a NaN mask.
is_football = holidays["description"].str.contains("futbol", regex=False, na=False)
holidays.loc[is_football, "description"] = "Futbol"

In [56]:
# Splitting by locale

def _holidays_for_locale(holidays_df, locale, description_col, prefix, locale_name_col=None):
    """Return the de-duplicated holidays of one locale level.

    Parameters
    ----------
    holidays_df : pandas.DataFrame
        Frame with at least the columns ``locale``, ``type``, ``locale_name``
        and ``description``.
    locale : str
        Value of the ``locale`` column to keep (``'Local'``, ``'Regional'``, ``'National'``).
    description_col : str
        New name for the ``description`` column.
    prefix : str
        Text prepended to every description (e.g. ``'L '``).
    locale_name_col : str, optional
        New name for the ``locale_name`` column. When omitted, ``locale_name``
        is dropped together with ``locale`` and ``type``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of the remaining columns, with the
        prefixed description. The index is reset before de-duplication and is
        not reset again afterwards.
    """
    renames = {'description': description_col}
    dropped = ['locale', 'type']
    if locale_name_col is None:
        dropped.append('locale_name')
    else:
        renames['locale_name'] = locale_name_col
    return (
        holidays_df[holidays_df['locale'] == locale]
        .rename(columns=renames)
        .reset_index(drop=True)
        .drop(columns=dropped)
        .drop_duplicates()
        # assign() returns a new frame, so no in-place write on a filtered slice.
        .assign(**{description_col: lambda frame: prefix + frame[description_col]})
    )

# Local holidays - city level
local_holidays = _holidays_for_locale(
    holidays, 'Local', 'local_holidays', 'L ', locale_name_col='city')

# Regional holidays
regional_holidays = _holidays_for_locale(
    holidays, 'Regional', 'regional_holidays', 'R ', locale_name_col='state')

# National holidays and events
national_holidays = _holidays_for_locale(
    holidays, 'National', 'national_holidays', 'N ')

In [57]:
# fixing oil dataset
# Daily oil price; `date` is parsed to datetime64.
oil = pd.read_csv("originalni_datasetovi/oil.csv", parse_dates=['date'])
oil.head(n=5)

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [58]:
# Give the oil price column a readable name.
oil = oil.rename({'dcoilwtico': 'oil_price'}, axis='columns')

In [59]:
#Missing dates

# Length of a gap-free daily calendar over the oil span versus the number of
# distinct dates the oil frame actually contains.
date_min_oil, date_max_oil = oil["date"].min(), oil["date"].max()
nbr_days_oil = (date_max_oil - date_min_oil).days + 1
print("Number of days that should be in oil set: ", nbr_days_oil)
print("Number of days that are in oil: ", oil["date"].nunique())

Number of days that should be in oil set:  1704
Number of days that are in oil:  1218


In [60]:
import datetime

# checking for missing dates in dataset
# Calendar days between the first train date and the last test date that have
# no row in the oil frame.
missing_oil_dates = pd.date_range(date_min_train, date_max_test).difference(oil.date)
# Weekday name of every missing date, computed in one vectorised call instead of
# a Python loop (which also left a stray `date` variable in the namespace).
missing_oil_days = list(missing_oil_dates.day_name())

print(set(missing_oil_days))

{'Saturday', 'Sunday'}


In [61]:
# Missing values per column in the raw oil frame.
oil.isnull().sum()

date          0
oil_price    43
dtype: int64

In [62]:
# adding missing dates
# Reindex onto every calendar day of the oil span; days without a quote get NaN.
# The new index is unnamed, so reset_index() exposes it as a column called `index`.
new_index_oil = pd.date_range(start=date_min_oil, end=date_max_oil)
oil = oil.set_index('date').reindex(new_index_oil).reset_index()
oil.head()

Unnamed: 0,index,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-05,


In [63]:
# Missing values per column after adding the absent calendar days.
oil.isnull().sum()

index          0
oil_price    529
dtype: int64

In [64]:
# Fill the missing prices by linear interpolation; limit_direction='both' also
# fills NaNs at the edges of the series.
interpolated_price = oil['oil_price'].interpolate(method='linear', limit_direction='both')
oil = oil.assign(oil_price=interpolated_price)
oil.head()

Unnamed: 0,index,oil_price
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-05,93.146667


In [65]:
# Oil calendar span, then the span from the first train date to the last test date.
print(date_min_oil, date_max_oil, sep='\n')
print('-------------')
print(date_min_train, date_max_test, sep='\n')

2013-01-01 00:00:00
2017-08-31 00:00:00
-------------
2013-01-01 00:00:00
2017-08-31 00:00:00


In [66]:
# fixing transactions dataset
# Forward slashes keep the relative path portable: they are accepted on Windows
# as well, whereas a backslash separator only resolves on Windows. This also
# matches how the other CSV paths in this notebook are written.
transactions = pd.read_csv("originalni_datasetovi/transactions.csv", parse_dates=['date'])
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [67]:
# Number of distinct dates present in transactions.
transactions["date"].nunique()

1682

In [68]:
# Missing dates

# Length of a gap-free daily calendar over the transactions span versus the
# number of distinct dates the frame actually contains.
date_min_trans, date_max_trans = transactions["date"].min(), transactions["date"].max()
nbr_days_trans = (date_max_trans - date_min_trans).days + 1
print("Number of days that should be in transactions set: ", nbr_days_trans)
print("Number of days that are in transactions: ", transactions["date"].nunique())

Number of days that should be in transactions set:  1688
Number of days that are in transactions:  1682


In [69]:
# Transactions calendar span, then the train calendar span.
print(date_min_trans, date_max_trans, sep='\n')
print('-------------')
print(date_min_train, date_max_train, sep='\n')

2013-01-01 00:00:00
2017-08-15 00:00:00
-------------
2013-01-01 00:00:00
2017-08-15 00:00:00


In [70]:
# Calendar days inside the transactions span for which no store has a row.
expected_trans_calendar = pd.date_range(start=date_min_trans, end=date_max_trans)
missing_dates_trans = expected_trans_calendar.difference(transactions.date)
missing_dates_trans

DatetimeIndex(['2013-12-25', '2014-12-25', '2015-12-25', '2016-01-01',
               '2016-01-03', '2016-12-25'],
              dtype='datetime64[ns]', freq=None)

In [71]:
# One row per (store, day) would be expected over the train span.
expected_transaction_rows = nbr_stores * nbr_days_train
print("number of rows that transactions should have: ", expected_transaction_rows)
print("number of rows transactions have ", len(transactions))
# there are missing entries for some stores

number of rows that transactions should have:  91152
number of rows transactions have  83488


In [72]:
# Besides the missing rows, check for NaN values in the rows that do exist.
transactions.isnull().sum()

date            0
store_nbr       0
transactions    0
dtype: int64

In [73]:
# adding 0s for transactions where sum of sales for that store is 0

merge_keys = ["date", "store_nbr"]

# sum of sales by date and store
store_sales = train.groupby(merge_keys)["sales"].sum().reset_index()

# The outer merge adds a row for every (date, store) pair that exists in train
# but not in transactions; those rows start with NaN transactions.
transactions = (
    transactions
    .merge(store_sales, on=merge_keys, how="outer")
    .sort_values(merge_keys, ignore_index=True)
)

# A store that sold nothing on a day is given 0 transactions for that day.
zero_sales_mask = transactions["sales"] == 0
transactions.loc[zero_sales_mask, "transactions"] = 0.
transactions = transactions.drop(columns=["sales"])

In [74]:
# Row count after the outer merge with the per-store daily sales.
print("number of rows transactions have ", len(transactions))

number of rows transactions have  91152


In [75]:
# NaN values that remain after zero-sales days were set to 0 transactions.
transactions.isnull().sum()

date              0
store_nbr         0
transactions    118
dtype: int64

In [76]:
# Fill the missing values that are not caused by 0 sales with linear
# interpolation, done separately within each store's own series.
transactions["transactions"] = (
    transactions
    .groupby("store_nbr")["transactions"]
    .transform(lambda series: series.interpolate(method="linear", limit_direction="both"))
)

In [77]:
# Missing values per column after the per-store interpolation.
transactions.isnull().sum()

date            0
store_nbr       0
transactions    0
dtype: int64

In [78]:
# Temporarily put NaN into sales / onpromotion on every January 1st and
# December 25th row whose sales are 0.
row_month, row_day = train['date'].dt.month, train['date'].dt.day
is_jan_1_or_dec_25 = ((row_month == 1) & (row_day == 1)) | ((row_month == 12) & (row_day == 25))
train.loc[is_jan_1_or_dec_25 & (train['sales'] == 0), ['sales', 'onpromotion']] = np.nan

In [79]:
# Stack train on top of test into one frame with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)

In [80]:
# Attach the per-store daily transactions; the left join keeps every row of `data`.
data = pd.merge(data, transactions, how="left", on=["date", "store_nbr"])

In [81]:
# Preview after adding transactions.
data.head(n=5)

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions
0,2013-01-01,1,AUTOMOTIVE,0.0,,,0.0
1,2013-01-01,1,BABY CARE,1.0,,,0.0
2,2013-01-01,1,BEAUTY,2.0,,,0.0
3,2013-01-01,1,BEVERAGES,3.0,,,0.0
4,2013-01-01,1,BOOKS,4.0,,,0.0


In [82]:
# Preview of the oil frame; its date column is still called `index` at this point.
oil.head(n=5)

Unnamed: 0,index,oil_price
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-05,93.146667


In [83]:
# Rename the `index` column back to `date` so it can serve as the join key,
# then attach the oil price to every row of that date.
oil = oil.rename({'index': 'date'}, axis='columns')
data = pd.merge(data, oil, how="left", on="date")

In [84]:
# Preview after adding the oil price.
data.head(n=5)

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions,oil_price
0,2013-01-01,1,AUTOMOTIVE,0.0,,,0.0,93.14
1,2013-01-01,1,BABY CARE,1.0,,,0.0,93.14
2,2013-01-01,1,BEAUTY,2.0,,,0.0,93.14
3,2013-01-01,1,BEVERAGES,3.0,,,0.0,93.14
4,2013-01-01,1,BOOKS,4.0,,,0.0,93.14


In [85]:
#Adding holidays data

#work days
# One row per work-day date, flagged with 1.
# - drop_duplicates() guarantees each date appears once, so the left merge below
#   cannot multiply the rows of `data`.
# - Only the `date` column is read, which is still present after this cell has
#   run, so re-running the cell gives the same result instead of raising.
work_days = (
    work_days[['date']]
    .drop_duplicates()
    .assign(work_day=1)
    .reset_index(drop=True)
)
df = pd.merge(data, work_days, how='left', on='date')
# Dates without a work-day entry come out of the merge as NaN -> flag 0.
df['work_day'] = df['work_day'].fillna(0).astype(int)
df.head()

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions,oil_price,work_day
0,2013-01-01,1,AUTOMOTIVE,0.0,,,0.0,93.14,0
1,2013-01-01,1,BABY CARE,1.0,,,0.0,93.14,0
2,2013-01-01,1,BEAUTY,2.0,,,0.0,93.14,0
3,2013-01-01,1,BEVERAGES,3.0,,,0.0,93.14,0
4,2013-01-01,1,BOOKS,4.0,,,0.0,93.14,0


In [86]:
#local holidays
# There can be several holidays on the same day, so plain one-hot encoding does
# not work: build a date x holiday table of occurrence counts instead.
local_holidays_wide = (
    local_holidays
    .pivot_table(index='date', columns='local_holidays', aggfunc='size', fill_value=0)
    .astype(int)
    .reset_index()  # to return date as a column
)
# NOTE(review): the join key is `date` only, so every row of that date receives
# the flag regardless of the holiday's city - confirm this is intended.
df = df.merge(local_holidays_wide, how='left', on=['date'])
# Dates without any local holiday come out of the merge as NaN -> 0.
for flag_col in local_holidays_wide.columns.drop('date'):
    df[flag_col] = df[flag_col].fillna(0).astype(int)
df.head()

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions,oil_price,work_day,L Cantonizacion de Cayambe,...,L Fundacion de Ibarra,L Fundacion de Loja,L Fundacion de Machala,L Fundacion de Manta,L Fundacion de Quito,L Fundacion de Riobamba,L Fundacion de Santo Domingo,L Independencia de Ambato,L Independencia de Guaranda,L Independencia de Latacunga
0,2013-01-01,1,AUTOMOTIVE,0.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2013-01-01,1,BABY CARE,1.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2013-01-01,1,BEAUTY,2.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013-01-01,1,BEVERAGES,3.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2013-01-01,1,BOOKS,4.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
#regional holidays
# Date x holiday table of occurrence counts for the regional holidays.
regional_holidays_wide = (
    regional_holidays
    .pivot_table(index='date', columns='regional_holidays', aggfunc='size', fill_value=0)
    .astype(int)
    .reset_index()  # to return date as a column
)
# NOTE(review): the join key is `date` only, so every row of that date receives
# the flag regardless of the holiday's state - confirm this is intended.
df = df.merge(regional_holidays_wide, how='left', on=['date'])
# Dates without any regional holiday come out of the merge as NaN -> 0.
for flag_col in regional_holidays_wide.columns.drop('date'):
    df[flag_col] = df[flag_col].fillna(0).astype(int)
df.head()

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions,oil_price,work_day,L Cantonizacion de Cayambe,...,L Fundacion de Quito,L Fundacion de Riobamba,L Fundacion de Santo Domingo,L Independencia de Ambato,L Independencia de Guaranda,L Independencia de Latacunga,R Provincializacion Santa Elena,R Provincializacion de Cotopaxi,R Provincializacion de Imbabura,R Provincializacion de Santo Domingo
0,2013-01-01,1,AUTOMOTIVE,0.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2013-01-01,1,BABY CARE,1.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2013-01-01,1,BEAUTY,2.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013-01-01,1,BEVERAGES,3.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2013-01-01,1,BOOKS,4.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
#national holidays
# Date x holiday table of occurrence counts for the national holidays and events.
national_holidays_wide = (
    national_holidays
    .pivot_table(index='date', columns='national_holidays', aggfunc='size', fill_value=0)
    .astype(int)
    .reset_index()  # to return date as a column
)
df = df.merge(national_holidays_wide, how='left', on=['date'])
# Dates without any national holiday come out of the merge as NaN -> 0.
for flag_col in national_holidays_wide.columns.drop('date'):
    df[flag_col] = df[flag_col].fillna(0).astype(int)
df.head()

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions,oil_price,work_day,L Cantonizacion de Cayambe,...,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer Grito de Independencia,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo
0,2013-01-01,1,AUTOMOTIVE,0.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2013-01-01,1,BABY CARE,1.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2013-01-01,1,BEAUTY,2.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2013-01-01,1,BEVERAGES,3.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2013-01-01,1,BOOKS,4.0,,,0.0,93.14,0,0,...,0,0,0,0,0,0,0,1,0,0


In [92]:
# Forward slashes keep the relative path portable: they are accepted on Windows
# as well, whereas a backslash separator only resolves on Windows. This also
# matches how the other CSV paths in this notebook are written.
stores = pd.read_csv("originalni_datasetovi/stores.csv")
#Adding stores data
# With no `on=` given, merge joins on every column name the two frames share.
df = pd.merge(df, stores)
df.head()

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions,oil_price,work_day,L Cantonizacion de Cayambe,...,N Independencia de Guayaquil,N Navidad,N Primer Grito de Independencia,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,city,state,type,cluster
0,2013-01-01,1,AUTOMOTIVE,0.0,,,0.0,93.14,0,0,...,0,0,0,1,0,0,Quito,Pichincha,D,13
1,2013-01-01,1,BABY CARE,1.0,,,0.0,93.14,0,0,...,0,0,0,1,0,0,Quito,Pichincha,D,13
2,2013-01-01,1,BEAUTY,2.0,,,0.0,93.14,0,0,...,0,0,0,1,0,0,Quito,Pichincha,D,13
3,2013-01-01,1,BEVERAGES,3.0,,,0.0,93.14,0,0,...,0,0,0,1,0,0,Quito,Pichincha,D,13
4,2013-01-01,1,BOOKS,4.0,,,0.0,93.14,0,0,...,0,0,0,1,0,0,Quito,Pichincha,D,13


In [93]:
# List every column of the assembled frame as a final sanity check.
df.columns

Index(['date', 'store_nbr', 'family', 'id', 'sales', 'onpromotion',
       'transactions', 'oil_price', 'work_day', 'L Cantonizacion de Cayambe',
       'L Cantonizacion de El Carmen', 'L Cantonizacion de Guaranda',
       'L Cantonizacion de Latacunga', 'L Cantonizacion de Libertad',
       'L Cantonizacion de Quevedo', 'L Cantonizacion de Riobamba',
       'L Cantonizacion de Salinas', 'L Cantonizacion del Puyo',
       'L Fundacion de Ambato', 'L Fundacion de Cuenca',
       'L Fundacion de Esmeraldas', 'L Fundacion de Guayaquil',
       'L Fundacion de Ibarra', 'L Fundacion de Loja',
       'L Fundacion de Machala', 'L Fundacion de Manta',
       'L Fundacion de Quito', 'L Fundacion de Riobamba',
       'L Fundacion de Santo Domingo', 'L Independencia de Ambato',
       'L Independencia de Guaranda', 'L Independencia de Latacunga',
       'R Provincializacion Santa Elena', 'R Provincializacion de Cotopaxi',
       'R Provincializacion de Imbabura',
       'R Provincializacion de Sa

In [94]:
# Persist the prepared frame; the row index is written as the first, unnamed column.
df.to_csv("after_prep.csv", index=True)