# Correcting indexing and currency conversion in data set

We need to define and convert all indices in data set to have the same weight as default and also need a show the currency of all paid metrics.

In [79]:
# Importing modules and libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")

In [80]:
# Directory searching and opening the data set
cwd = os.getcwd()
# print(cwd)
df = pd.read_excel("UniMatch.xlsx")
df["collegeRank"] = df["collegeRank"].astype(str)
df.iloc[:, :15].head()

Unnamed: 0,country,region,university,collegeRank,tuition (EUR/year),percOfIntStud,acceptance rate,avgSafetyIndex,cost of living index,rent index,groceries index,recreationCost (EUR/month),healthcare price,avgMntTransportCost,link
0,Albania,Europe,University of Tirana,4212,532,0.4,24,56.3,42.1,10.6,42.0,38.0,277.471168,17.5,https://unitir.edu.al/eng/
1,Algeria,Africa,University of Sciences and Technology Houari B...,2890,840,0.0,56,47.94,28.9,3.8,36.8,31.52,78.820208,10.09,https://www.usthb.dz/
2,Argentina,Southern America,Universidad de Buenos Aires,71 / 272,0,25.0,64,36.46,29.4,7.6,29.7,68.3,233.676745,11.0,https://www.uba.ar/
3,Armenia,Asia,Yerevan State University,951 / 2427,555,2.0,20,77.72,41.0,19.0,36.0,42.43,482.0,10.35,https://www.ysu.am/en
4,Australia,Oceania,The University of Melbourne,13 / 47,9749,48.0,77,55.96,70.2,33.4,77.3,87.0,975.402664,114.375,https://www.unimelb.edu.au/


## Currency conversion

We want to all the prices in EUR for easier use and conversion later on.<br>
Currenty all the prices are in EUR except for "healthcare price" which is in USD.


In [81]:
# Price conversion
USD_to_EUR_rate = 0.95 # information from 10th December 2024
df["healthcare price"] = pd.to_numeric(df["healthcare price"], errors="coerce")
df["healthcare price"] = df["healthcare price"] * USD_to_EUR_rate
df.iloc[:, :15].head()

Unnamed: 0,country,region,university,collegeRank,tuition (EUR/year),percOfIntStud,acceptance rate,avgSafetyIndex,cost of living index,rent index,groceries index,recreationCost (EUR/month),healthcare price,avgMntTransportCost,link
0,Albania,Europe,University of Tirana,4212,532,0.4,24,56.3,42.1,10.6,42.0,38.0,263.597609,17.5,https://unitir.edu.al/eng/
1,Algeria,Africa,University of Sciences and Technology Houari B...,2890,840,0.0,56,47.94,28.9,3.8,36.8,31.52,74.879197,10.09,https://www.usthb.dz/
2,Argentina,Southern America,Universidad de Buenos Aires,71 / 272,0,25.0,64,36.46,29.4,7.6,29.7,68.3,221.992907,11.0,https://www.uba.ar/
3,Armenia,Asia,Yerevan State University,951 / 2427,555,2.0,20,77.72,41.0,19.0,36.0,42.43,457.9,10.35,https://www.ysu.am/en
4,Australia,Oceania,The University of Melbourne,13 / 47,9749,48.0,77,55.96,70.2,33.4,77.3,87.0,926.63253,114.375,https://www.unimelb.edu.au/


## Converting index values to prices

New York is our index reference at index of 100. For more information <a href="https://www.numbeo.com/cost-of-living/cpi_explained.jsp"> click here</a> or find reference in <a href="./../docs/docs.docx">UniMatch documentation.<a> 

In [82]:
# Index references
refLivingCost = 1607 # average monthly expenses in New York for 1 person in euros
refRentCost = 3833 # average monthly rent price for 1 bedroom apartment in center in euros
refGroceriesCost = 546  #Monthly recommended minimum amount of money for food per person in euros

Now we take the index values inside indexed columns and convert them to prices using the referce values above

In [83]:
df["livingCost"] = ((df["cost of living index"] * refLivingCost) / 100).round(1)
df["rentCost"] = ((df["rent index"] * refRentCost) / 100).round(1)
df["groceriesCost"] = ((df["groceries index"] * refGroceriesCost) / 100).round(1)
# df.head(26)

In [84]:
# renaming the columns
df.rename(columns={"collegeRank": "ranking", "tuition (EUR/year)": "tuition", "acceptance rate": "acceptanceRate", "cost of living index": "livingCostIndex", "rent index": "rentIndex", "groceries index": "groceriesIndex", "recreationCost (EUR/month)": "recreationCost", "healthcare price": "healthcareCost", "avgMntTransportCost": "transportCost", "avgSafetyIndex": "safetyIndex"},inplace=True)

In [85]:
pricesDF = df[["country", "region", "university", "ranking", "tuition", "percOfIntStud", "acceptanceRate", "safetyIndex", "livingCost", "rentCost", "groceriesCost", "recreationCost", "healthcareCost", "transportCost", "link", "Computer Science", "Business", "Economics", "Psychology", "Biology", "Law", "Medicine", "Mathematics", "Art", "Physics"]]
# pricesDF.head()

Some ranks are divided with '/' because of multiple sources. We will be taking only the second part.

In [86]:
pricesDF["ranking"] = pricesDF["ranking"].apply(lambda x: x.split("/")[-1])
pricesDF["ranking"] = pricesDF["ranking"].astype(int)
pricesDF.head(30)

Unnamed: 0,country,region,university,ranking,tuition,percOfIntStud,acceptanceRate,safetyIndex,livingCost,rentCost,...,Computer Science,Business,Economics,Psychology,Biology,Law,Medicine,Mathematics,Art,Physics
0,Albania,Europe,University of Tirana,4212,532,0.4,24,56.3,676.5,406.3,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Algeria,Africa,University of Sciences and Technology Houari B...,2890,840,0,56,47.94,464.4,145.7,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,Argentina,Southern America,Universidad de Buenos Aires,272,0,25,64,36.46,472.5,291.3,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Armenia,Asia,Yerevan State University,2427,555,2,20,77.72,658.9,728.3,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
4,Australia,Oceania,The University of Melbourne,47,9749,48,77,55.96,1128.1,1280.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,Austria,Europe,University of Vienna,97,1453,36,28,73.68,1046.2,862.4,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
6,Azerbaijan,Asia,Baku State University,3877,1636,2,20,68.2,501.4,333.5,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
7,Bahamas,Southern America,University of The Bahamas,6157,5320,-,60,38.68,1366.0,1406.7,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,Bahrain,Asia,Applied Science University,10954,18381,21,53,74.46,840.5,785.8,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,Bangladesh,Asia,University of Dhaka,991,116,0,11,36.88,361.6,92.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


print(pricesDF)

### Saving the new data set including only prices

In [76]:
pricesDF.to_csv('pricesComplete.csv', index=False)
pricesDF.to_excel('pricesComplete.xlsx', index=False)
pricesDF.to_parquet('pricesComplete.parquet')

ArrowInvalid: ("Could not convert '0/2500' with type str: tried to convert to int64", 'Conversion failed for column tuition with type object')