## Clean data

Part of the cleaning was done in spreadsheet software. It was just more convenient.

## 1. Importing all packages

In [1]:
# External
import pandas as pd

## 2. Euro rates

### 2.1 Importing  a CSV file

In [2]:
file_name = "euro_rates_04_14_2023.csv"
df = pd.read_csv(f"data\RAW\_Socioeconomic data\{file_name}")
df.head()

Unnamed: 0,Date,USD,JPY,BGN,CYP,CZK,DKK,EEK,GBP,HUF,...,INR,KRW,MXN,MYR,NZD,PHP,SGD,THB,ZAR,Unnamed: 42
0,2023-04-20,1.0944,147.46,1.9558,,23.502,7.4524,,0.88153,377.68,...,89.9365,1450.34,19.8156,4.8564,1.7763,61.429,1.4599,37.609,19.8552,
1,2023-04-19,1.0933,147.24,1.9558,,23.475,7.4514,,0.88108,376.35,...,89.8985,1458.87,19.8057,4.8564,1.7668,61.709,1.4615,37.708,19.9808,
2,2023-04-18,1.0972,146.89,1.9558,,23.373,7.4513,,0.88143,371.68,...,89.9955,1445.35,19.7174,4.865,1.7637,61.73,1.4614,37.623,19.9299,
3,2023-04-17,1.0981,146.97,1.9558,,23.345,7.452,,0.88373,371.7,...,90.0607,1444.13,19.7526,4.8558,1.7717,61.449,1.4615,37.753,19.8937,
4,2023-04-14,1.1057,146.6,1.9558,,23.341,7.451,,0.8844,373.68,...,90.3595,1438.43,19.9598,4.8673,1.7588,61.122,1.4665,37.66,19.9352,


In [3]:
filtered_df = df[df["Date"].str.contains("2023-04-14")]
df = filtered_df
df

Unnamed: 0,Date,USD,JPY,BGN,CYP,CZK,DKK,EEK,GBP,HUF,...,INR,KRW,MXN,MYR,NZD,PHP,SGD,THB,ZAR,Unnamed: 42
4,2023-04-14,1.1057,146.6,1.9558,,23.341,7.451,,0.8844,373.68,...,90.3595,1438.43,19.9598,4.8673,1.7588,61.122,1.4665,37.66,19.9352,


In [4]:
df.columns

Index(['Date', 'USD', 'JPY', 'BGN', 'CYP', 'CZK', 'DKK', 'EEK', 'GBP', 'HUF',
       'LTL', 'LVL', 'MTL', 'PLN', 'ROL', 'RON', 'SEK', 'SIT', 'SKK', 'CHF',
       'ISK', 'NOK', 'HRK', 'RUB', 'TRL', 'TRY', 'AUD', 'BRL', 'CAD', 'CNY',
       'HKD', 'IDR', 'ILS', 'INR', 'KRW', 'MXN', 'MYR', 'NZD', 'PHP', 'SGD',
       'THB', 'ZAR', 'Unnamed: 42'],
      dtype='object')

In [5]:
df.drop('Date', axis=1, inplace=True)

In [6]:
df.shape

(1, 42)

### 2.2 Reshape

In [7]:
filtered_df = pd.melt(
    df,
    var_name='Currency',
    value_name='Rate',
    ignore_index=True
)

filtered_df

df = filtered_df

df.set_index('Currency', inplace=True)

df.shape

(42, 1)

In [8]:
df

Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
USD,1.1057
JPY,146.6
BGN,1.9558
CYP,
CZK,23.341
DKK,7.451
EEK,
GBP,0.8844
HUF,373.68
LTL,


### 2.2 Remove duplicates

In [9]:
df = df.drop_duplicates()
df.shape

(31, 1)

### 2.3 Remove NaNs

In [10]:
df.dropna(how="all", axis=0, inplace=True)
df.dropna(how="all", axis=1, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(how="all", axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(how="all", axis=1, inplace=True)


Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
USD,1.1057
JPY,146.6
BGN,1.9558
CZK,23.341
DKK,7.451
GBP,0.8844
HUF,373.68
PLN,4.6435
RON,4.9423
SEK,11.3455


### 2.4 Convert EURO rate to USD

In [11]:
usd_rates = df['Rate'] / df.loc['USD']['Rate']

usd_rates.loc['EUR']  = 1 / df.loc['USD']['Rate']

df['Rate'] = usd_rates

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Rate'] = usd_rates


Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
USD,1.0
JPY,132.585692
BGN,1.768834
CZK,21.109704
DKK,6.738718
GBP,0.799855
HUF,337.957855
PLN,4.199602
RON,4.469838
SEK,10.260921


### 2.5 Sort currencies

In [12]:
df.sort_index(inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_index(inplace=True)


Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
AUD,1.474993
BGN,1.768834
BRL,4.920865
CAD,1.331736
CHF,0.888758
CNY,6.851859
CZK,21.109704
DKK,6.738718
GBP,0.799855
HKD,7.849959


### 2.5 Save CSV

##### 2.5.1 Save

In [13]:
file_to_save = f"data\clean\_Socioeconomic data\{file_name}"
df.to_csv(file_to_save, index=True)

##### 2.5.2 Check save

In [14]:
df_check = pd.read_csv(file_to_save, index_col=0)

In [15]:
df_check.shape == df.shape

True

In [16]:
del df_check, file_to_save, df

## 3. Purchasing Power Parities

### 3.1 Importing a CSV file

In [None]:
df = pd.read_csv("data\RAW\_Socioeconomic data\DP_LIVE_04042023094701143.csv")
df.head()

In [None]:
df.shape

### 3.2 Filter max year for each country

In [None]:
def filter_max_year(df: pd.DataFrame, location_col='LOCATION', year_col='TIME', value_col='Value'):

    max_years = df.groupby(location_col)[year_col].transform("max")

    filtered_df = df[df[year_col] == max_years].reset_index(drop=True)

    filtered_df.rename(columns={'Value': value_col}, inplace=True)

    return filtered_df

df = filter_max_year(df)
df

### 3.3 Rename columns

In [None]:
df.columns = ['Country', 'Year', 'Conversion']
df.set_index('Country', inplace=True)
df.head()

### 3.4 Remove duplicates

In [None]:
df.shape

In [None]:
df = df.dropna(how="all")
df.shape

### 3.5 Remove NaNs

In [None]:
df = df.dropna(how="all")
df.shape

### 3.6 Preview

In [None]:
df

### 3.7 Save CSV

#### 3.7.1 Save

In [None]:
file_to_save = "data\clean\_Socioeconomic data\purchasing_power_index.csv"
df.to_csv(file_to_save, index=True)

#### 3.7.2 Check save

In [None]:
df_check = pd.read_csv(file_to_save, index_col=0)
df_check.head()

In [None]:
df_check.shape == df.shape

In [None]:
del df_check, file_to_save, df