### Let's start by reviewing the QS World University Rankings dataset

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Location of the raw QS ranking CSV (presumably relative to the project root).
RAW_CSV = Path("uniquest_data_generator/data/raw/qs_world_ranking_university.csv")

# A relative path is resolved against the kernel's working directory, so the
# read fails with FileNotFoundError when the notebook is launched from another
# folder. Probe the working directory and each of its parents and use the first
# location where the file exists; fall back to the original relative path so a
# genuinely missing file still raises the same FileNotFoundError.
search_roots = [Path.cwd(), *Path.cwd().parents]
csv_path = next(
    (root / RAW_CSV for root in search_roots if (root / RAW_CSV).is_file()),
    RAW_CSV,
)

df = pd.read_csv(csv_path)
# Preview the first rows only instead of rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'uniquest_data_generator/data/raw/qs_world_ranking_university.csv'

#### Checking all column names

In [None]:
# Keep a reference to the column labels exactly as they were loaded from the
# CSV and display them.
# NOTE: this Index is not updated when `df.columns` is reassigned by the rename
# cell further down, so it keeps holding the original header names.
column_name = df.columns
column_name

Index(['2024 RANK', '2023 RANK', 'Institution Name', 'Country Code', 'Country',
       'SIZE', 'FOCUS', 'RES.', 'AGE', 'STATUS', 'Academic Reputation Score',
       'Academic Reputation Rank', 'Employer Reputation Score',
       'Employer Reputation Rank', 'Faculty Student Score',
       'Faculty Student Rank', 'Citations per Faculty Score',
       'Citations per Faculty Rank', 'International Faculty Score',
       'International Faculty Rank', 'International Students Score',
       'International Students Rank', 'International Research Network Score',
       'International Research Network Rank', 'Employment Outcomes Score',
       'Employment Outcomes Rank', 'Sustainability Score',
       'Sustainability Rank', 'Overall SCORE'],
      dtype='object')

In [None]:
# Number of columns in the loaded frame (second entry of the shape tuple).
df.shape[1]

29

#### Renaming columns to snake_case for easier handling in code

In [None]:
# Rename all 29 columns positionally to snake_case identifiers. The order of
# the names built here must match the column order of the loaded CSV exactly.
descriptor_columns = [
    'rank_2024', 'rank_2023', 'institution_name', 'country_code', 'country',
    'size', 'focus', 'research_intensity', 'age', 'status',
]
indicator_names = [
    'academic_reputation', 'employer_reputation', 'faculty_student',
    'citations_per_faculty', 'international_faculty', 'international_students',
    'international_research_network', 'employment_outcomes', 'sustainability',
]
# Every indicator contributes a "<name>_score" column followed by "<name>_rank".
indicator_columns = [
    f'{indicator}_{suffix}'
    for indicator in indicator_names
    for suffix in ('score', 'rank')
]
df.columns = descriptor_columns + indicator_columns + ['overall_score']
df.columns

Index(['rank_2024', 'rank_2023', 'institution_name', 'country_code', 'country',
       'size', 'focus', 'research_intensity', 'age', 'status',
       'academic_reputation_score', 'academic_reputation_rank',
       'employer_reputation_score', 'employer_reputation_rank',
       'faculty_student_score', 'faculty_student_rank',
       'citations_per_faculty_score', 'citations_per_faculty_rank',
       'international_faculty_score', 'international_faculty_rank',
       'international_students_score', 'international_students_rank',
       'international_research_network_score',
       'international_research_network_rank', 'employment_outcomes_score',
       'employment_outcomes_rank', 'sustainability_score',
       'sustainability_rank', 'overall_score'],
      dtype='object')

### **Key Columns (Useful for Analysis)**
1. **2024 RANK / 2023 RANK** – Current and previous year's global ranking.  
2. **Institution Name** – Name of the university.  
3. **Country Code / Country** – Location of the institution.  
4. **SIZE** – Size of the institution (Small, Medium, Large).  
5. **FOCUS** – Specialization (e.g., Comprehensive, Subject-specific).  
6. **RES.** – Research output intensity (Very High, High, etc.).  
7. **AGE** – How old the institution is (e.g., Established, Young).  
8. **STATUS** – Public or Private.  
9. **Academic Reputation Score/Rank** – Based on global academic surveys.  
10. **Employer Reputation Score/Rank** – Based on employer opinions.  
11. **Faculty Student Score/Rank** – Measures teaching quality (student-to-faculty ratio).  
12. **Citations per Faculty Score/Rank** – Research impact (citation strength).  
13. **International Faculty/Students Score/Rank** – Measures diversity.  
14. **International Research Network Score/Rank** – Global collaboration strength.  
15. **Employment Outcomes Score/Rank** – Graduate employability.  
16. **Sustainability Score/Rank** – Environmental and social impact.  
17. **Overall SCORE** – Composite score determining the rank.  

### **Columns That May Be Less Useful**
- Some **rank columns** (e.g., *Sustainability Rank*) may not be relevant if you're only interested in academic performance.  
- **Country Code** is redundant if you already have *Country*.  
- **STATUS** (Public/Private) may not matter for some analyses.  

### **Most Important Columns by Use Case**
- **Rankings (2024/2023)** – For trend analysis.  
- **Reputation Scores (Academic/Employer)** – Key for prestige.  
- **Faculty Student Ratio** – Indicates teaching quality.  
- **Citations per Faculty** – Research influence.  
- **International Metrics** – Useful for diversity studies.  
- **Overall SCORE** – Best for ranking comparisons.  


In [None]:
# Column count after the rename (second entry of the shape tuple).
df.shape[1]

29

In [None]:
# Columns removed from the working frame before further analysis.
columns_to_drop = [
    "size",
    "focus",
    "research_intensity",
    "age",
    "citations_per_faculty_score",
    "sustainability_rank",
    "country",
]
# NOTE: `df` is rebound to the reduced frame, so running this cell a second
# time raises KeyError because the labels are already gone.
df = df.drop(columns_to_drop, axis="columns")

In [None]:
# Keep only the rows whose country code is exactly "US".
df_us = df.loc[df["country_code"].eq("US")]

In [None]:
# Display the rows selected by the country_code == "US" filter.
df_us

Unnamed: 0,rank_2024,rank_2023,institution_name,country_code,status,academic_reputation_score,academic_reputation_rank,employer_reputation_score,employer_reputation_rank,faculty_student_score,...,international_faculty_score,international_faculty_rank,international_students_score,international_students_rank,international_research_network_score,international_research_network_rank,employment_outcomes_score,employment_outcomes_rank,sustainability_score,overall_score
1,1,1,Massachusetts Institute of Technology (MIT),US,B,100.0,4,100.0,2,100.0,...,100.0,56,88.2,128,94.3,58,100,4,95.2,100.0
4,4,5,Harvard University,US,B,100.0,1,100.0,1,98.3,...,84.6,210,66.8,223,100.0,5,100,1,96.7,98.3
5,5,3,Stanford University,US,B,100.0,5,100.0,3,100.0,...,99.9,78,51.2,284,95.8,44,100,2,94.4,98.1
10,10,27,"University of California, Berkeley (UCB)",US,A,100.0,6,100.0,6,20.5,...,92.2,172,63.9,235,92.4,66,98.7,15,100,90.4
11,11,10,University of Chicago,US,B,99.1,17,95.4,38,92.5,...,81.2,225,84.4,145,61.4,236,98.3,17,81.7,90.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,1401+,1201-1400,California State University - Los Angeles,US,A,5.4,601+,2.5,601+,3.5,...,4.0,701+,5.6,701+,1.1,701+,9.5,701+,,-
1416,1401+,1401+,California State University Long Beach,US,A,4.0,601+,2.8,601+,8.1,...,,,,,,,9.6,701+,,-
1420,1401+,1201-1400,Humboldt State University,US,A,3.8,601+,2.2,601+,8.9,...,2.3,701+,1.6,701+,1.0,701+,7.9,701+,,-
1421,1401+,1201-1400,Illinois State University,US,,3.7,601+,3.5,601+,5.9,...,4.9,701+,2.5,701+,1.1,701+,10.9,701+,,-


In [None]:
# Count missing values per column.
# The previous loop indexed `df[column_name]` (the Index of original header
# names captured before the rename) instead of the loop variable, so it raised
# KeyError rather than reporting one count per column.
# Only real missing values (NaN/None) are counted; placeholder strings that
# appear in the data, such as "-" in overall_score, are not treated as null.
df.isnull().sum()