## Clean data

Part of the cleaning was done in spreadsheet software. It was just more convenient.

## 1. Importing all packages

In [31]:
# External
import pandas as pd

## 2. Euro rates

### 2.1 Importing a CSV file

In [32]:
# Load the raw euro exchange-rate table.
# The path is written with forward slashes so it resolves on Windows, macOS and
# Linux alike and contains no backslash escape sequences.
df = pd.read_csv("data/RAW/_Socioeconomic data/euro_rates_03_04_2023.csv")
df.head()

Unnamed: 0,Currency,USD,JPY,BGN,CZK,DKK,GBP,HUF,PLN,RON,...,ILS,INR,KRW,MXN,MYR,NZD,PHP,SGD,THB,ZAR
0,Rate,1.087,144.51,1.9558,23.44,7.4504,0.8779,378.84,4.6765,4.9367,...,3.9055,89.471,1429.43,19.5519,4.8035,1.7283,59.383,1.4464,37.306,19.4208


In [33]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(1, 31)

### 2.2 Reshape

In [34]:
# Turn the single-row wide table (one column per currency) into a long table
# with one row per currency code and a single 'Rate' column.
# The text label column is moved into the index before transposing, so 'Rate'
# stays numeric and no positional slice is needed to discard a label row.
# Running this cell a second time fails loudly (no 'Currency' column left)
# instead of silently dropping a currency.
df = (
    df.set_index('Currency')                        # the lone row is labelled "Rate"
      .T                                            # currency codes become the row index
      .rename_axis(index='Currency', columns=None)  # name the index, clear the column-axis name
)

df.shape

(30, 1)

In [35]:
# Show the reshaped table.
df

Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
USD,1.087
JPY,144.51
BGN,1.9558
CZK,23.44
DKK,7.4504
GBP,0.8779
HUF,378.84
PLN,4.6765
RON,4.9367
SEK,11.2835


### 2.2 Remove duplicates

In [36]:
# Remove duplicated rows.
# DataFrame.duplicated() / drop_duplicates() compare column values only, so with
# the currency code in the index two different currencies quoted at the same
# rate would count as duplicates. The index is restored as a column for the
# comparison, so a row is removed only when the code and the rate both repeat.
is_repeat = df.reset_index().duplicated().to_numpy()
df = df[~is_repeat]
df.shape

(30, 1)

### 2.3 Remove NaNs

In [37]:
# Discard rows in which every value is missing.
df = df.dropna(axis="index", how="all")
df.shape

(30, 1)

In [38]:
# Discard columns in which every value is missing.
df = df.dropna(axis="columns", how="all")
df.shape

(30, 1)

### 2.4 Preview

In [39]:
# Show the cleaned table before it is saved.
df

Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
USD,1.087
JPY,144.51
BGN,1.9558
CZK,23.44
DKK,7.4504
GBP,0.8779
HUF,378.84
PLN,4.6765
RON,4.9367
SEK,11.2835


In [40]:
# (rows, columns) of the table about to be saved.
df.shape

(30, 1)

### 2.5 Save CSV

#### 2.5.1 Save

In [41]:
# Write the cleaned rates, keeping the currency codes (the index) as the first
# CSV column. Forward slashes keep the path portable across operating systems
# and free of backslash escape sequences.
file_to_save = "data/clean/_Socioeconomic data/euro_rates_03_04_2023.csv"
df.to_csv(file_to_save, index=True)

#### 2.5.2 Check save

In [42]:
# Read the saved file back, using its first column as the index, and preview it.
df_check = pd.read_csv(file_to_save, index_col=0)
df_check.head()

Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
USD,1.087
JPY,144.51
BGN,1.9558
CZK,23.44
DKK,7.4504


In [43]:
# Sanity check: the reloaded table has the same (rows, columns) as the one in memory.
# NOTE(review): this compares dimensions only, not the cell values.
df_check.shape == df.shape

True

In [44]:
# Remove this section's variables from the namespace.
del df_check, file_to_save, df

## 3. Purchasing Power Parities

### 3.1 Importing a CSV file

In [45]:
# Load the raw purchasing-power-parity table.
# The path is written with forward slashes so it resolves on Windows, macOS and
# Linux alike and contains no backslash escape sequences.
# NOTE(review): displayed Value entries such as 071893 look like numbers that
# lost their decimal separator — confirm the column's dtype and whether
# read_csv needs decimal=",".
df = pd.read_csv("data/RAW/_Socioeconomic data/purchasing_power_index.csv")
df.head()

Unnamed: 0,LOCATION,TIME,Value
0,AUS,2000,131208
1,AUS,2001,1327544
2,AUS,2002,133649
3,AUS,2003,1351978
4,AUS,2004,1364771


In [46]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(1454, 3)

### 3.2 Filter max year for each country

In [47]:
def filter_max_year(df: pd.DataFrame, location_col='LOCATION', year_col='TIME', value_col='Value'):
    """Keep, for every location, only the rows recorded in its latest year.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    location_col : str
        Column used to group the rows.
    year_col : str
        Column whose per-group maximum selects the rows to keep.
    value_col : str
        New name given to the column called 'Value', if the frame has one.
        With the default the rename changes nothing.

    Returns
    -------
    pd.DataFrame
        Rows whose ``year_col`` equals the maximum of their group, in their
        original order and with a fresh 0..n-1 index. Rows that tie for the
        maximum year within a group are all kept.
    """
    # Broadcast each group's latest year back onto every row of that group.
    latest_per_row = df.groupby(location_col)[year_col].transform("max")
    is_latest = df[year_col] == latest_per_row

    latest_rows = df[is_latest].reset_index(drop=True)
    return latest_rows.rename(columns={'Value': value_col})

# Keep only the rows from each location's most recent year, then show the result.
df = filter_max_year(df)
df

Unnamed: 0,LOCATION,TIME,Value
0,AUS,2022,1446332
1,AUT,2022,071893
2,BEL,2022,0713938
3,CAN,2022,1253835
4,CZE,2022,12723595
...,...,...,...
59,ALB,2021,42512743
60,SRB,2021,42392183
61,GEO,2021,0950974
62,CMR,2021,227378726


### 3.3 Rename columns

In [48]:
# Relabel the three columns by position, then make 'Country' the row index.
df.columns = ['Country', 'Year', 'Conversion']
df = df.set_index('Country')
df.head()

Unnamed: 0_level_0,Year,Conversion
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
AUS,2022,1446332
AUT,2022,71893
BEL,2022,713938
CAN,2022,1253835
CZE,2022,12723595


### 3.4 Remove duplicates

In [49]:
# (rows, columns) before duplicates are removed.
df.shape

(64, 2)

In [50]:
# Remove duplicated rows.
# DataFrame.duplicated() does not look at the index, so the index is restored as
# a column for the comparison; a row is dropped only when its index label and
# every column value repeat an earlier row.
is_repeat = df.reset_index().duplicated().to_numpy()
df = df[~is_repeat]
df.shape

(64, 2)

### 3.5 Remove NaNs

In [51]:
# Discard rows in which every value is missing.
df = df.dropna(axis="index", how="all")
df.shape

(64, 2)

### 3.6 Preview

In [52]:
# Show the cleaned table before it is saved.
df

Unnamed: 0_level_0,Year,Conversion
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
AUS,2022,1446332
AUT,2022,071893
BEL,2022,0713938
CAN,2022,1253835
CZE,2022,12723595
...,...,...
ALB,2021,42512743
SRB,2021,42392183
GEO,2021,0950974
CMR,2021,227378726


### 3.7 Save CSV

#### 3.7.1 Save

In [53]:
# Write the cleaned table, keeping the index as the first CSV column.
# Forward slashes keep the path portable across operating systems and free of
# backslash escape sequences.
file_to_save = "data/clean/_Socioeconomic data/purchasing_power_index.csv"
df.to_csv(file_to_save, index=True)

#### 3.7.2 Check save

In [54]:
# Read the saved file back, using its first column as the index, and preview it.
df_check = pd.read_csv(file_to_save, index_col=0)
df_check.head()

Unnamed: 0_level_0,Year,Conversion
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
AUS,2022,1446332
AUT,2022,71893
BEL,2022,713938
CAN,2022,1253835
CZE,2022,12723595


In [55]:
# Sanity check: the reloaded table has the same (rows, columns) as the one in memory.
# NOTE(review): this compares dimensions only, not the cell values.
df_check.shape == df.shape

True

In [56]:
# Remove this section's variables from the namespace.
del df_check, file_to_save, df