In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unicodedata

## Generating the NUTS version of the table
Load generated tsv file from the eurostat website <br>
Replace colons (':') with NaN <br>
Split the first column into three <br>
Move them to the front <br>
Dispose of the original column <br>

In [5]:
# Load the Eurostat TSV export (tab-separated, first row is the header).
df = pd.read_csv('Input Data/crop_prod_nuts_data.tsv', sep='\t', header=0, encoding="ISO-8859-1")
# Cells that contain exactly ':' are turned into NaN.
df = df.replace(":", np.nan)
# The first column packs three comma-separated keys. Splitting from the right
# with n=2 yields exactly three parts even if the leftmost part contains commas.
# `n` is passed by keyword (it is keyword-only since pandas 2.0) and the column
# name is a raw string so the backslash is taken literally.
df[['shapeId','comdty','param']] = df[r'GEO,CROPS,STRUCPRO\TIME'].str.rsplit(",", n=2, expand=True)
# Keep the three key columns first, followed by the yearly columns; the original
# packed column is dropped by not selecting it.
df = df[['shapeId','comdty','param','2012','2013','2014','2015','2016','2017','2018','2019','2020','2021']]

**Unicodedata - library used to latinize strings** <br>
Function takes string as input <br>
Applies normalize (latinize) method to it <br>
Encodes as ASCII while dropping error characters if any persist <br>
Returns a UTF-8 decoded string that contains no non-ASCII characters <br>
Use the function to remove special characters from the 'shapeId' column

In [6]:
def remove_accents(input_str):
    """Return ``input_str`` with every non-ASCII character stripped or latinized.

    The string is first decomposed with Unicode NFKD normalization, so accented
    letters become a base letter followed by combining marks. Encoding to ASCII
    with errors ignored then drops whatever cannot be represented, and the
    remaining bytes are decoded back into a ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')
# Latinize every region identifier, then write the cleaned values back.
ascii_shape_ids = df['shapeId'].apply(remove_accents)
df['shapeId'] = ascii_shape_ids

* Split the dataframe into two
* Create a dictionary of dataframes for each year in the table
* Iterate over every year in the dataframe
* Concatenate its column values with the dataframe that holds the shapeId, comdty and param attributes
* Rename the **'year'** column into **'value'**
* Create a date column and add the corresponding date to it

In [7]:
# First three columns are the keys (shapeId, comdty, param); the rest are years.
df_main = df.iloc[:, :3]
df_year = df.iloc[:, 3:]

# Build one long-format frame per year column, keyed by the column name.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
df_date_merge = {}
for (columnName, columnData) in df_year.items():
    yearly_frame = pd.concat([df_main, columnData], axis=1)
    yearly_frame = yearly_frame.rename(columns={columnName: 'value'})
    # The year label becomes a timestamp string for 1 January of that year.
    yearly_frame.insert(loc=0, column='date', value=columnName + '-01-01 00:00:00')
    df_date_merge[columnName] = yearly_frame

Create empty data frame that will house all yearly dataframes <br>
Iterate over each dataframe in the dictionary <br> 
Concatenate it to the final dataframe

In [8]:
# Stack all yearly frames in one concat call instead of growing a frame inside
# a loop (quadratic copying, and seeding with an empty DataFrame triggers the
# empty-entry concat deprecation). Dictionary order is preserved, so rows come
# out in the same year order as before.
yearly_frames = list(df_date_merge.values())
if yearly_frames:
    df = pd.concat(yearly_frames, ignore_index=True, sort=False)
else:
    # pd.concat raises on an empty list; keep the previous empty-frame result.
    df = pd.DataFrame()

Change the **'date'** column's datatype to **'datetime'** <br>
Remove all non-digit characters from the value column except periods ('.')

In [9]:
# Parse the timestamp strings in 'date' into datetime values.
df['date'] = pd.to_datetime(df['date'])
# Strip every character that is neither a digit nor a period. The result is
# assigned back rather than using inplace=True on the column selection: the
# chained in-place form warns on recent pandas and leaves `df` unchanged under
# Copy-on-Write.
df['value'] = df['value'].replace(to_replace=r'[^\d.]', value=r'', regex=True)

In [10]:
# Convert the cleaned strings to numbers; anything unparseable becomes NaN.
numeric_values = pd.to_numeric(df['value'], errors='coerce')
df['value'] = numeric_values

In [11]:
# Add the release timestamp as the second column. DataFrame.insert raises a
# ValueError when the column already exists, so guard it to keep this cell safe
# to re-run.
if 'dateRelease' not in df.columns:
    df.insert(loc=1, column='dateRelease', value='2021-09-23 00:00:00')
# Parse the release timestamp strings into datetime values (a no-op on re-run).
df['dateRelease'] = pd.to_datetime(df['dateRelease'])

Write the DataFrame to a csv and tsv file

In [13]:
# Write the comma-separated version (to_csv defaults to sep=',').
df.to_csv('Output Data/crop_prod_eu_stand_humidity_NUTS.csv')
# Write the tab-separated version; without sep='\t' the .tsv file would be
# comma-separated despite its extension.
df.to_csv('Output Data/crop_prod_eu_stand_humidity_NUTS.tsv', sep='\t')