In [1]:
import pandas as pd
from Levenshtein import distance as levenshtein_distance

In [2]:
# Load the raw institution records from the CSV in the working directory.
df = pd.read_csv('perfectData.csv')

In [3]:
# Peek at the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,ID,inst,city,state,zip
0,100654,Alabama A & M University,Normal,AL,35899
1,100690,Amridge University,Montgomery,AL,36117-3553
2,100690,Amridge University,Montgomery,--,36117-3553
3,100760,Central Alabama Community College,Alexander City,AL,35010
4,100812,Athens State University,Athens,AL,35611


In [4]:
# Collect each distinct institution name once, in order of first appearance,
# and print the resulting array.
unique_names = pd.unique(df['inst'])
print(unique_names)

['Alabama A & M University' 'Amridge University'
 'Central Alabama Community College' ...
 'Galen Health Institutes-Pikeville'
 'Commonwealth University of Pennsylvania' 'Radford University-Carilion']


In [5]:
# Wrap the distinct names in a one-column frame so derived columns can be added to it.
unique_df = pd.DataFrame({'inst': unique_names})

In [6]:
# Display the frame of distinct institution names.
unique_df

Unnamed: 0,inst
0,Alabama A & M University
1,Amridge University
2,Central Alabama Community College
3,Athens State University
4,Auburn University
...,...
4730,National Tractor Trailer School
4731,Galen Health Institutes-Houston
4732,Galen Health Institutes-Pikeville
4733,Commonwealth University of Pennsylvania


In [7]:
def generate_short_form(name, stop_words=('of', 'and', 'the')):
    """Build an acronym from the first character of each word in ``name``.

    ``name`` is split on whitespace. Words whose lowercase form is listed in
    ``stop_words`` are skipped; the first character of every remaining word is
    upper-cased and the characters are concatenated. Punctuation tokens are
    kept (``'A & M'`` contributes ``'A&M'``) and a hyphenated word counts as a
    single word (``'Institutes-Houston'`` contributes ``'I'``).

    Parameters
    ----------
    name : str
        Institution name to abbreviate.
    stop_words : iterable of str, optional
        Words to leave out of the acronym, compared case-insensitively.
        Defaults to ``('of', 'and', 'the')``.

    Returns
    -------
    str
        The acronym; an empty string when ``name`` has no usable words.
    """
    # Normalise once so the membership test inside the loop is a cheap set lookup.
    skipped = frozenset(word.lower() for word in stop_words)
    return ''.join(
        word[0].upper()
        for word in name.split()
        if word.lower() not in skipped
    )

In [8]:
# Derive the acronym for every distinct institution name.
unique_df['short_form'] = unique_df['inst'].map(generate_short_form)


In [9]:
# Display the names next to their generated acronyms; note that several
# institutions can share one acronym.
unique_df

Unnamed: 0,inst,short_form
0,Alabama A & M University,AA&MU
1,Amridge University,AU
2,Central Alabama Community College,CACC
3,Athens State University,ASU
4,Auburn University,AU
...,...,...
4730,National Tractor Trailer School,NTTS
4731,Galen Health Institutes-Houston,GHI
4732,Galen Health Institutes-Pikeville,GHI
4733,Commonwealth University of Pennsylvania,CUP


In [10]:
# Attach each record's acronym by joining on the institution name.
# A short_form column left over from an earlier run of this cell is dropped
# first; re-running a bare merge would otherwise yield short_form_x /
# short_form_y instead of a single column. validate= asserts that unique_df
# holds one row per name, so the left join cannot multiply rows of df.
df = df.drop(columns=['short_form'], errors='ignore').merge(
    unique_df, on='inst', how='left', validate='many_to_one'
)

In [11]:
# Display the records with the short_form column attached.
df

Unnamed: 0,ID,inst,city,state,zip,short_form
0,100654,Alabama A & M University,Normal,AL,35899,AA&MU
1,100690,Amridge University,Montgomery,AL,36117-3553,AU
2,100690,Amridge University,Montgomery,--,36117-3553,AU
3,100760,Central Alabama Community College,Alexander City,AL,35010,CACC
4,100812,Athens State University,Athens,AL,35611,ASU
...,...,...,...,...,...,...
5016,498474,Galen Health Institutes-Houston,Houston,TX,77041-8241,GHI
5017,498483,Galen Health Institutes-Pikeville,Pikeville,KY,41501-1321,GHI
5018,498553,Stellar Career College,Chicago,IL,95356,SCC
5019,498562,Commonwealth University of Pennsylvania,Bloomsburg,PA,17815,CUP


In [12]:
# Persist the enriched table; index=False keeps the row index out of the file.
df.to_csv('institutions_with_short_forms.csv', index=False)

In [13]:
# Plain-text dump of the same frame that was written to disk.
# NOTE(review): this repeats the rich display of df shown two cells earlier.
print(df)

                 ID                                     inst            city  \
0            100654                 Alabama A & M University          Normal   
1            100690                       Amridge University      Montgomery   
2            100690                       Amridge University      Montgomery   
3            100760        Central Alabama Community College  Alexander City   
4            100812                  Athens State University          Athens   
...             ...                                      ...             ...   
5016         498474          Galen Health Institutes-Houston         Houston   
5017         498483        Galen Health Institutes-Pikeville       Pikeville   
5018         498553                   Stellar Career College         Chicago   
5019         498562  Commonwealth University of Pennsylvania      Bloomsburg   
5020  Not available              Radford University-Carilion   Not available   

     state            zip short_form  


In [14]:
# Reload the saved table. keep_default_na=False stops pandas from parsing text
# such as "NA", "NULL" or an empty cell as float NaN: an acronym like "NA" is a
# valid short_form, and a NaN in inst/short_form would break the .lower() calls
# made during matching. Side effect: blank cells in any column come back as ''
# rather than NaN.
df = pd.read_csv('institutions_with_short_forms.csv', keep_default_na=False)

In [15]:
def find_closest_match(input_name, df):
    """Return the row of ``df`` whose name or acronym is closest to ``input_name``.

    Each row is scored by the case-insensitive Levenshtein distance between
    ``input_name`` and the row's ``inst`` value, and between ``input_name``
    and its ``short_form`` value; the smaller of the two counts. The row with
    the lowest score wins, and on a tie the earliest row is kept.

    Parameters
    ----------
    input_name : str
        Full institution name or acronym to look up.
    df : pandas.DataFrame
        Must contain ``inst`` and ``short_form`` columns.

    Returns
    -------
    pandas.Series or None
        The best-matching row, or ``None`` when no row has a string in either
        column (for example, when ``df`` is empty).
    """
    # Loop-invariant: lowercase the query once instead of twice per row.
    query = input_name.lower()

    best_pos = None
    min_dist = float('inf')
    # Walk the two text columns directly; only the winning row is materialised.
    for pos, candidates in enumerate(zip(df['inst'], df['short_form'])):
        for candidate in candidates:
            # Cells read back from CSV can be float NaN (blank or NA-like text),
            # which has no .lower(); such cells simply cannot match.
            if not isinstance(candidate, str):
                continue
            dist = levenshtein_distance(query, candidate.lower())
            # Strict '<' keeps the earliest row when distances tie.
            if dist < min_dist:
                min_dist = dist
                best_pos = pos

    if best_pos is None:
        return None
    return df.iloc[best_pos]

In [16]:
# Acronym lookup. 'AU' is the short form of more than one institution (both
# Amridge University and Auburn University map to it); the earliest row wins.
input_name = 'AU'
closest_match = find_closest_match(input_name, df)
print(f"Closest match for '{input_name}':\n{closest_match}\n")

Closest match for 'AU':
ID                        100690
inst          Amridge University
city                  Montgomery
state                         AL
zip                   36117-3553
short_form                    AU
Name: 1, dtype: object



In [17]:
# Full-name lookup with an exact match in the inst column.
input_name = 'Amridge University'
closest_match = find_closest_match(input_name, df)
print(f"Closest match for '{input_name}':\n{closest_match}")

Closest match for 'Amridge University':
ID                        100690
inst          Amridge University
city                  Montgomery
state                         AL
zip                   36117-3553
short_form                    AU
Name: 1, dtype: object


In [18]:
# Acronym lookup; resolves to the earliest row whose short form is 'ASU'.
# NOTE(review): other institutions may share this acronym - only the first is returned.
input_name = 'ASU'
closest_match = find_closest_match(input_name, df)
print(f"Closest match for '{input_name}':\n{closest_match}\n")

Closest match for 'ASU':
ID                             100812
inst          Athens State University
city                           Athens
state                              AL
zip                             35611
short_form                        ASU
Name: 4, dtype: object



In [19]:
# Full-name query with no exact match in the data: the nearest row is returned
# anyway, and here it is a different institution (Alabama A & M University).
# NOTE(review): the function has no distance cut-off, so callers cannot tell a
# true match from a merely nearest one.
input_name = 'Alabama State University'
closest_match = find_closest_match(input_name, df)
print(f"Closest match for '{input_name}':\n{closest_match}\n")

Closest match for 'Alabama State University':
ID                              100654
inst          Alabama A & M University
city                            Normal
state                               AL
zip                              35899
short_form                       AA&MU
Name: 0, dtype: object



In [20]:
# Full-name lookup with an exact match in the inst column.
input_name = 'Athens State University'
closest_match = find_closest_match(input_name, df)
print(f"Closest match for '{input_name}':\n{closest_match}\n")

Closest match for 'Athens State University':
ID                             100812
inst          Athens State University
city                           Athens
state                              AL
zip                             35611
short_form                        ASU
Name: 4, dtype: object

