### TODO
- Manage NA values
- Manage useless features
    - `year` : merge with name
    - `link`, `logo` : delete
- Looking for strange values
    - score_i > score_j && rank_i > rank_j
- Using pandas categories
    - `country`, `city`, `region`
    - `type`, `size`, `research_output`
- Looking for correlation between features


### Rapport du prof
- Tester différentes métriques
- Modèles à utiliser :
    - Linear model 
    - XGBoost ++
    - Decision tree
- Visualiser les données (matrice de corrélation)


### Notebook settings and imports

In [139]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [140]:
# Read the raw QS rankings CSV and remember its dimensions
# (N_init = number of rows, D_init = number of columns).
data_init = pd.read_csv("qs-world-university-rankings-2017-to-2022-V2.csv")
N_init = data_init.shape[0]
D_init = data_init.shape[1]
data_init.head()

Unnamed: 0,university,year,rank_display,score,link,country,city,region,logo,type,research_output,student_faculty_ratio,international_students,size,faculty_count
0,Massachusetts Institute of Technology (MIT),2017,1,100.0,https://www.topuniversities.com/universities/m...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,4.0,3730,M,3065
1,Stanford University,2017,2,98.7,https://www.topuniversities.com/universities/s...,United States,Stanford,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,3.0,3879,L,4725
2,Harvard University,2017,3,98.3,https://www.topuniversities.com/universities/h...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,5.0,5877,L,4646
3,University of Cambridge,2017,4,97.2,https://www.topuniversities.com/universities/u...,United Kingdom,Cambridge,Europe,https://www.topuniversities.com/sites/default/...,Public,Very high,4.0,7925,L,5800
4,California Institute of Technology (Caltech),2017,5,96.9,https://www.topuniversities.com/universities/c...,United States,Pasadena,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,2.0,692,S,968


In [141]:
# Count the missing values in each column of the raw dataset.
data_init.isna().sum()

university                   0
year                         0
rank_display                68
score                     3662
link                         0
country                      0
city                       178
region                       0
logo                         0
type                        12
research_output              2
student_faculty_ratio       75
international_students     164
size                         2
faculty_count               78
dtype: int64

### Clean-up : useless features

In [142]:
# Columns removed before the analysis (`link` and `logo`).
useless_columns = ["link", "logo"]
data = data_init.drop(columns=useless_columns)

# Build a "<university> (<year>)" label and store it as the first column.
# NOTE(review): the column name keeps its original spelling ("univesity")
# because other cells may reference it; rename it everywhere at once if desired.
university_with_year = data["university"] + data["year"].map(
    lambda year: f" ({year})"
)
data.insert(loc=0, column="univesity_with_year", value=university_with_year)

# Left disabled, as in the original: drop the two source columns once merged.
# data = data.drop(columns=["university", "year"], axis=1)

In [143]:
# Turn rank ranges "N-M" into their lower bound N and store them as integers.
rank_display_raw = (
    data["rank_display"]
    # Missing ranks are skipped here (na_action="ignore") and handled below.
    .map(lambda rank: str(rank).split("-")[0], na_action="ignore")
    # Missing ranks become 0 so that the column can be cast to an integer dtype.
    .fillna(0)
    .astype("int64")
)

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail when it was run a second time. Overwrite the existing column
# in that case so the cell can be re-run safely; otherwise insert at position 4.
if "rank_display_raw" in data.columns:
    data["rank_display_raw"] = rank_display_raw
else:
    data.insert(4, "rank_display_raw", rank_display_raw)

### Clean-up : NA values

In [144]:
def fill_score_if_possible(df=None):
    """
    Fill missing scores using universities tied at the same rank in the same year.

    Rows are grouped by (`year`, `rank_display`). Within each group, every
    missing `score` is replaced by the mean of the scores that are present in
    that group. Scores that are already known are left untouched, a group
    without any known score stays missing, and rows whose `year` or
    `rank_display` is missing are never filled.

    Original author's note: on the QS dataset this lowered the number of
    missing scores from 3662 to 3533.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with `year`, `rank_display` and `score` columns. It is modified
        in place. Defaults to the notebook-level `data` frame.

    Returns
    -------
    None
    """
    frame = data if df is None else df

    # Leave out rows with a missing grouping key up front, so the result does
    # not depend on how groupby treats NaN keys.
    keyed = frame.dropna(subset=["year", "rank_display"])
    if keyed.empty:
        return

    # Per-row mean of the known scores in the row's (year, rank) group;
    # NaN when the group has no known score at all.
    group_mean = keyed.groupby(["year", "rank_display"], sort=False)[
        "score"
    ].transform("mean")

    # fillna matches on index labels and only touches missing entries, so known
    # scores are preserved and rows absent from `group_mean` stay as they are.
    # This relies on unique index labels, as produced by pd.read_csv.
    frame["score"] = frame["score"].fillna(group_mean)
            
# Run the fill on the notebook-level `data` frame; its `score` column is updated in place.
fill_score_if_possible()


In [146]:
# Rows that contain at least one missing value, with their per-row NA count.
row_has_na = data.isna().any(axis=1)
data_with_na = data[row_has_na]
data_count_na_by_row = data_with_na.isna().sum(axis=1)

# Rows missing `max_na_count` values or more are removed from `data` in place.
max_na_count = 5
row_is_too_sparse = data_count_na_by_row >= max_na_count
data_with_too_much_na_idxs = data_count_na_by_row[row_is_too_sparse].index

data.drop(index=data_with_too_much_na_idxs, inplace=True)
