In [1]:
import pandas as pd

In [2]:
# Read the five raw source tables. Every path is a bare file name, so the
# files are resolved against the notebook's current working directory.
(
    qs_data,
    scorecard_data,
    ntu_data,
    size_data,
    sentiment_data,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'qs_data.csv',
        'scorecard_data.csv',
        'ntu_research_ranking.csv',
        'campus_data.csv',
        'RMP_Data_Clean_Organized.csv',
    )
)

In [3]:
# Keep only the institution name and the five '... score' columns.
# The explicit .copy() makes qs_data an independent frame: later cells assign
# to its columns, rename it and write single cells through .loc, and doing
# that on a frame derived by selection triggers pandas' SettingWithCopyWarning.
qs_data = qs_data[['institution',
                   'ar score',
                   'er score',
                   'isr score',
                   'fsr score',
                   'ifr score']].copy()

In [4]:
# Canonical university spellings. The name column of every source table is
# fuzzy-matched against this list so the tables can be joined on one key.
standard_names = [
    'Princeton University',
    'Massachusetts Institute of Technology',
    'Harvard University',
    'Stanford University',
    'Yale University',
    'California Institute of Technology',
    'Duke University',
    'Johns Hopkins University',
    'Northwestern University',
    'University of Pennsylvania',
    'Cornell University',
    'University of Chicago',
    'Brown University',
    'Columbia University',
    'Dartmouth College',
    'University of California, Los Angeles',
    'University of California, Berkeley',
    'Rice University',
    'University of Notre Dame',
    'Vanderbilt University',
    'Carnegie Mellon University',
    'University of Michigan, Ann Arbor',
    'Washington University',
    'Emory University',
    'Georgetown University',
    'University of Virginia',
    'University of North Carolina, Chapel Hill',
    'University of Southern California',
    'University of California, San Diego',
    'New York University',
    'University of Florida',
    'University of Texas, Austin',
    'Georgia Institute of Technology',
    'University of California, Davis',
    'University of California, Irvine',
    'University of Illinois, Urbana-Champaign',
    'Boston College',
    'Tufts University',
    'University of California, Santa Barbara',
    'University of Wisconsin, Madison',
    'Boston University',
    'Ohio State University, Columbus',
    'Rutgers University, New Brunswick',
    'University of Maryland, College Park',
    'University of Rochester',
    'Lehigh University',
    'Purdue University',
    'University of Georgia',
    'University of Washington',
    'Wake Forest University',
]

In [5]:
from fuzzywuzzy import process

def standardize_column_values(df, column, standard_names, threshold=80):
    """Replace values in ``df[column]`` with their best fuzzy match.

    Each string value is compared against ``standard_names`` with
    ``process.extractOne``. When the best score is at least ``threshold`` the
    value is replaced by the matched standard name; otherwise the original
    value is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. The column is overwritten in place on this object.
    column : str
        Name of the column whose values are standardized.
    standard_names : iterable of str
        Canonical names to match against.
    threshold : int, default 80
        Minimum match score required to accept a replacement.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``column`` updated.

    Notes
    -----
    Non-string cells (for example missing values) are left untouched instead
    of being handed to the matcher, and an empty ``standard_names`` leaves the
    column unchanged.
    """
    choices = list(standard_names)
    resolved = {}  # distinct raw value -> final value, so repeats are matched once

    def _resolve(value):
        # The matcher only understands text; anything else is passed through.
        if not isinstance(value, str) or not choices:
            return value
        if value not in resolved:
            best = process.extractOne(value, choices)
            # extractOne may return None when nothing can be scored.
            accepted = best is not None and best[1] >= threshold
            resolved[value] = best[0] if accepted else value
        return resolved[value]

    df[column] = [_resolve(value) for value in df[column]]
    return df

In [6]:
# Map the university-name column of every source table onto the canonical
# spellings in standard_names.
ntu_data = standardize_column_values(ntu_data, 'University Name', standard_names, threshold=80)
qs_data = standardize_column_values(qs_data, 'institution', standard_names, threshold=80)
size_data = standardize_column_values(size_data, 'College or University', standard_names, threshold=80)
scorecard_data = standardize_column_values(scorecard_data, 'University Name', standard_names, threshold=80)
sentiment_data = standardize_column_values(sentiment_data, 'University', standard_names, threshold=80)

# Give every table the same key column name. rename() returns a new frame, so
# the result is reassigned instead of mutating with inplace=True.
size_data = size_data.rename(columns={'College or University': 'University Name'})
qs_data = qs_data.rename(columns={'institution': 'University Name'})
sentiment_data = sentiment_data.rename(columns={'University': 'University Name'})

# Manual overrides of two specific rows, addressed by index label.
# NOTE(review): presumably the fuzzy matcher did not resolve these rows to
# Ohio State; the labels depend on the current row order of the CSVs - confirm.
for frame, row_label in ((size_data, 11), (qs_data, 31)):
    # Assigning through .loc with a label that does not exist would silently
    # append a new, mostly empty row, so fail loudly instead.
    if row_label not in frame.index:
        raise KeyError(
            'row label {} not found; the manual name override is out of date'.format(row_label)
        )
    frame.loc[row_label, 'University Name'] = 'Ohio State University, Columbus'

In [7]:
# Outer-join the other tables onto the scorecard table one at a time, always
# on the shared 'University Name' column.
merged_df = scorecard_data
for other_table in (ntu_data, qs_data, size_data, sentiment_data):
    merged_df = merged_df.merge(other_table, on='University Name', how='outer')

# Keep one row per university name (the first one encountered), then
# renumber the rows from zero.
merged_df = merged_df.drop_duplicates(subset=['University Name'])
merged_df = merged_df.reset_index(drop=True)

# Sequential 1-based identifier following the final row order.
merged_df['University ID'] = range(1, merged_df.shape[0] + 1)

In [8]:
# Write the merged table to disk; the row index is not written as a column.
merged_df.to_csv(path_or_buf='merged_datasets.csv', index=False)