# Correcting indexing and currency conversion in data set

We need to define and convert all indices in the data set so that they have the same weight by default, and we also need to show the currency of all paid metrics.

In [6]:
# Importing modules and libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load the UniMatch workbook. The rank and tuition columns contain entries such
# as "71 / 272", so both are kept as strings here and split in a later cell.
# The original "tuition (EUR/year)" column is left untouched; the string version
# is stored under the new name "tuition".
df = pd.read_excel("../completeDatasets/UniMatch.xlsx").assign(
    collegeRank=lambda frame: frame["collegeRank"].astype(str),
    tuition=lambda frame: frame["tuition (EUR/year)"].astype(str),
)

## Currency conversion

We want all the prices in EUR for easier use and conversion later on.<br>
Currently all the prices are in EUR except for "healthcare price" which is in USD.


In [8]:
# Price conversion: "healthcare price" is given in USD, every other price is in EUR.
USD_to_EUR_rate = 0.95  # information from 10th December 2024

# Keep the parsed USD figures in their own column and always derive the EUR
# value from it. Overwriting "healthcare price" with itself times the rate
# would apply the rate a second time whenever this cell is re-run.
# Values that cannot be parsed as numbers become NaN (errors="coerce").
if "healthcare price (USD)" not in df.columns:
    df["healthcare price (USD)"] = pd.to_numeric(df["healthcare price"], errors="coerce")
df["healthcare price"] = df["healthcare price (USD)"] * USD_to_EUR_rate
df.iloc[:, :15].head()

Unnamed: 0,country,region,university,collegeRank,tuition (EUR/year),percOfIntStud,acceptance rate,avgSafetyIndex,cost of living index,rent index,groceries index,recreationCost (EUR/month),healthcare price,avgMntTransportCost,link
0,Albania,Europe,University of Tirana,4212,532,0.4,24.0,56.3,42.1,10.6,42.0,38.0,263.597609,17.5,https://unitir.edu.al/eng/
1,Algeria,Africa,University of Sciences and Technology Houari B...,2890,840,0.0,56.0,47.94,28.9,3.8,36.8,31.52,74.879197,10.09,https://www.usthb.dz/
2,Argentina,Southern America,Universidad de Buenos Aires,71 / 272,0,25.0,64.0,36.46,29.4,7.6,29.7,68.3,221.992907,11.0,https://www.uba.ar/
3,Armenia,Asia,Yerevan State University,951 / 2427,555,2.0,20.0,77.72,41.0,19.0,36.0,42.43,457.9,10.35,https://www.ysu.am/en
4,Australia,Oceania,The University of Melbourne,13 / 47,9749,48.0,77.0,55.96,70.2,33.4,77.3,87.0,926.63253,114.375,https://www.unimelb.edu.au/


## Converting index values to prices

New York is our index reference, with an index of 100. For more information <a href="https://www.numbeo.com/cost-of-living/cpi_explained.jsp">click here</a> or find the reference in the <a href="./../docs/docs.docx">UniMatch documentation</a>.

In [9]:
# New York baseline (index = 100) used to turn index values into EUR amounts
refLivingCost = 1_607    # EUR, average monthly expenses for one person
refRentCost = 3_833      # EUR, average monthly rent for a 1-bedroom apartment in the centre
refGroceriesCost = 546   # EUR, recommended minimum monthly food budget per person

Now we take the index values in the indexed columns and convert them to prices using the reference values above.

In [10]:
# Price = index * reference price / 100, rounded to one decimal place.
for priceColumn, indexColumn, referenceCost in (
    ("livingCost", "cost of living index", refLivingCost),
    ("rentCost", "rent index", refRentCost),
    ("groceriesCost", "groceries index", refGroceriesCost),
):
    df[priceColumn] = ((df[indexColumn] * referenceCost) / 100).round(1)

In [11]:
# Give every column a consistent camelCase name (old name -> new name).
df.rename(
    columns={
        "collegeRank": "ranking",
        "acceptance rate": "acceptanceRate",
        "cost of living index": "livingCostIndex",
        "rent index": "rentIndex",
        "groceries index": "groceriesIndex",
        "recreationCost (EUR/month)": "recreationCost",
        "healthcare price": "healthcareCost",
        "avgMntTransportCost": "transportCost",
        "avgSafetyIndex": "safetyIndex",
    },
    inplace=True,
)

In [12]:
# Columns of the price-based data set: identifying information, the EUR cost
# columns, the university link and one column per study field.
priceColumns = [
    "country", "region", "university", "ranking", "tuition",
    "percOfIntStud", "acceptanceRate", "safetyIndex",
    "livingCost", "rentCost", "groceriesCost",
    "recreationCost", "healthcareCost", "transportCost",
    "link",
    "Computer Science", "Business", "Economics", "Psychology", "Biology",
    "Law", "Medicine", "Mathematics", "Art", "Physics",
]

# Take an explicit copy: pricesDF has columns reassigned further down, and
# assigning into a selection of df is the pattern pandas reports with
# SettingWithCopyWarning (hidden here by the global warnings filter).
pricesDF = df[priceColumns].copy()
pricesDF

Unnamed: 0,country,region,university,ranking,tuition,percOfIntStud,acceptanceRate,safetyIndex,livingCost,rentCost,...,Computer Science,Business,Economics,Psychology,Biology,Law,Medicine,Mathematics,Art,Physics
0,Albania,Europe,University of Tirana,4212,532,0.4,24.0,56.300,676.5,406.3,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Algeria,Africa,University of Sciences and Technology Houari B...,2890,840,0.0,56.0,47.940,464.4,145.7,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,Argentina,Southern America,Universidad de Buenos Aires,71 / 272,0,25.0,64.0,36.460,472.5,291.3,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Armenia,Asia,Yerevan State University,951 / 2427,555,2.0,20.0,77.720,658.9,728.3,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
4,Australia,Oceania,The University of Melbourne,13 / 47,9749,48.0,77.0,55.960,1128.1,1280.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,Uruguay,Southern America,Universidad de la República (Udelar),672 / 1453,448.6,3.0,54.0,47.740,869.4,529.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
116,Uzbekistan,Asia,Tashkent Institute of Irrigation and Agricultu...,547 / 8964,1387.06,5.0,23.0,66.875,424.2,429.3,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
117,Venezuela,Southern America,Universidad Central de Venezuela,695 / 1151,460.37,1.0,10.0,16.780,615.5,222.3,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
118,Vietnam,Asia,Duy Tan University,495 / 2379,1458.73,1.0,30.0,55.220,461.2,345.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


Some ranks and tuition prices are split by "/" because they come from multiple sources. We will keep only the last part (the value after the "/").

In [13]:
def _lastSlashPart(column):
    """Return the text after the last "/" of every string in ``column``.

    Values without a "/" are returned whole. Surrounding whitespace is removed,
    so "71 / 272" yields "272" rather than " 272".

    Parameters
    ----------
    column : pandas.Series of str

    Returns
    -------
    pandas.Series of str
    """
    return column.str.split("/").str[-1].str.strip()


# Build a new frame instead of assigning columns on a selection of df.
# ranking becomes an integer; tuition stays a string, now without the stray
# leading space that the plain split left behind.
pricesDF = pricesDF.assign(
    ranking=_lastSlashPart(pricesDF["ranking"]).astype(int),
    tuition=_lastSlashPart(pricesDF["tuition"]),
)

### Saving the new data set including only prices

In [14]:
# Write the price-based data set as CSV and as an Excel workbook,
# both without the row index.
pricesCsvPath = "../completeDatasets/pricesComplete.csv"
pricesXlsxPath = "../completeDatasets/pricesComplete.xlsx"
pricesDF.to_csv(pricesCsvPath, index=False)
pricesDF.to_excel(pricesXlsxPath, index=False)

## Converting prices to indices

For filtering and ranking we will use indices because we can properly add weight to the decision.<br>
This means we need to convert all prices we have to indices if we don't have them already.<br>
For the reference point we will use Switzerland because it is closest to 100 index in livingCostIndex category.

In [15]:
# Reference values for converting prices to indices.
# NOTE(review): 101.1 is presumably Switzerland's livingCostIndex - confirm against the data.
refIndex = 101.1


def _referenceCost(frame, column, country="Switzerland"):
    """Return ``column`` for the single row of ``country``, rounded to one decimal.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a "country" column and ``column``.
    column : str
        Name of the cost column to read.
    country : str, default "Switzerland"
        Country whose value is used as the reference.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If ``country`` matches zero rows or more than one row, because the
        reference value would be undefined or ambiguous.
    """
    matches = frame.loc[frame["country"] == country, column]
    if len(matches) != 1:
        raise ValueError(
            f"Expected exactly one row for {country!r} in column {column!r}, found {len(matches)}"
        )
    # Extract the scalar first: calling float() on a one-element Series is
    # deprecated in recent pandas versions.
    return round(float(matches.iloc[0]), 1)


refRecreationCost = _referenceCost(df, "recreationCost")
refHealthcareCost = _referenceCost(df, "healthcareCost")
refTransportCost = _referenceCost(df, "transportCost")

In [16]:
# Index = price * reference index / reference price, rounded to one decimal place.
for indexColumn, costColumn, referenceCost in (
    ("recreationIndex", "recreationCost", refRecreationCost),
    ("healthcareIndex", "healthcareCost", refHealthcareCost),
    ("transportIndex", "transportCost", refTransportCost),
):
    df[indexColumn] = ((df[costColumn] * refIndex) / referenceCost).round(1)
df.head()

Unnamed: 0,country,region,university,ranking,tuition (EUR/year),percOfIntStud,acceptanceRate,safetyIndex,livingCostIndex,rentIndex,...,Art,Physics,college,tuition,livingCost,rentCost,groceriesCost,recreationIndex,healthcareIndex,transportIndex
0,Albania,Europe,University of Tirana,4212,532,0.4,24.0,56.3,42.1,10.6,...,1.0,1.0,,532,676.5,406.3,229.3,33.1,11.3,19.1
1,Algeria,Africa,University of Sciences and Technology Houari B...,2890,840,0.0,56.0,47.94,28.9,3.8,...,0.0,1.0,University of Sciences and Technology Houari B...,840,464.4,145.7,200.9,27.4,3.2,11.0
2,Argentina,Southern America,Universidad de Buenos Aires,71 / 272,0,25.0,64.0,36.46,29.4,7.6,...,1.0,1.0,,0,472.5,291.3,162.2,59.4,9.5,12.0
3,Armenia,Asia,Yerevan State University,951 / 2427,555,2.0,20.0,77.72,41.0,19.0,...,1.0,1.0,,555,658.9,728.3,196.6,36.9,19.7,11.3
4,Australia,Oceania,The University of Melbourne,13 / 47,9749,48.0,77.0,55.96,70.2,33.4,...,1.0,1.0,,9749,1128.1,1280.2,422.1,75.7,39.9,124.9


### Saving the new data set including only indices

In [17]:
# Columns of the index-based data set: identifying information, the index
# columns, the university link and one column per study field.
# NOTE(review): "ranking" and "tuition" are taken from df, where they still hold
# the raw strings (e.g. "71 / 272"); only pricesDF had them split. Confirm
# whether the index files should contain the cleaned values as well.
indexColumns = [
    "country", "region", "university", "ranking", "tuition",
    "percOfIntStud", "acceptanceRate", "safetyIndex",
    "livingCostIndex", "rentIndex", "groceriesIndex",
    "recreationIndex", "healthcareIndex", "transportIndex",
    "link",
    "Computer Science", "Business", "Economics", "Psychology", "Biology",
    "Law", "Medicine", "Mathematics", "Art", "Physics",
]
indicesDF = df[indexColumns]

In [18]:
# Write the index-based data set as CSV and as an Excel workbook,
# both without the row index.
indexCsvPath = "../completeDatasets/indexComplete.csv"
indexXlsxPath = "../completeDatasets/indexComplete.xlsx"
indicesDF.to_csv(indexCsvPath, index=False)
indicesDF.to_excel(indexXlsxPath, index=False)