In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
#reading csv file
college = pd.read_csv("Top 100 Private Colleges.2003.csv")

In [3]:
print(college.loc[3])

Overall Rank                                      4
School                           Swarthmore College
State                                            PA
Undergrad. Enrollment                          1479
Admission Rate                                  24%
*SAT or ACT                                  94/98%
Student/faculty Ratio                             8
4-year Grad. Rate                               86%
6-year Grad. Rate                               92%
Quality Rank                                      4
Total Costs                                $38,676 
Cost After Need-based Aid                  $17,386 
Need Met                                       100%
Aid From Grants                                 85%
Cost After Non-Need-Based Aid              $11,404 
Non-Need-Based Aid+                              2%
Average Debt                               $12,759 
Cost Rank                                        21
Name: 3, dtype: object


In [4]:
#removing columns having nan values
column_list = college.columns
for column_i in column_list:
    college = college.drop(college.index[college[column_i].isin([np.nan])])
print("Data after cleaning :",len(college))

Data after cleaning : 73


# Normalisation

In [5]:
#function for converting money value from string to float
def removesymbolcurrency(x):
    return float(x.split('$')[1].replace(",", ""))

#function for converting percentage data from string to float
def removesymbolpercent(x):
    if x=='NA%':
        return np.nan
    if type(x) == float:
        return x
    return float(x.split('%')[0])

#function for normalisation of columns data
def normalisation(x):
    if x==np.nan:
        return x
    return (float(x) - minimum)/maximum

In [6]:
#cleaning admission rate data
column_name = "Admission Rate"
college[column_name] = college[column_name].apply(removesymbolpercent)

#cleaning 4-year Grad. Rate data
column_name = "4-year Grad. Rate"
college[column_name] = college[column_name].apply(removesymbolpercent)

#cleaning 6-year Grad. Rate data
column_name = "6-year Grad. Rate"
college[column_name] = college[column_name].apply(removesymbolpercent)

#cleaning Total Costs data
column_name = "Total Costs"
college[column_name] = college[column_name].apply(removesymbolcurrency)

#cleaning Cost After Need-based Aid data
column_name = "Cost After Need-based Aid"
college[column_name] = college[column_name].apply(removesymbolcurrency)

#cleaning Need Met data
column_name = "Need Met"
college[column_name] = college[column_name].apply(removesymbolpercent)

#cleaning Aid From Grants data
column_name = "Aid From Grants"
college[column_name] = college[column_name].apply(removesymbolpercent)

#cleaning Cost After Non-Need-Based Aid data
column_name = "Cost After Non-Need-Based Aid"
college[column_name] = college[column_name].apply(removesymbolcurrency)

#cleaning Non-Need-Based Aid+ data
column_name = "Non-Need-Based Aid+"
college[column_name] = college[column_name].apply(removesymbolpercent)

#cleaning Average Debt data
column_name = "Average Debt"
college[column_name] = college[column_name].apply(removesymbolcurrency)



In [7]:
#function for converting sat or act to sat data in new column
def satconverter(x):
    if '/' in x:
        return float(x.split('/')[0].replace("*",""))
    else:
        return np.nan

#function for converting sat or act to act data in new column
def actconverter(x):
    if '/' in x:
        return float(x.split('/')[1].replace("%", "").replace("*",""))
    else:
        return float(x.replace("%",""))

In [8]:
college["SAT"] = college["*SAT or ACT"].apply(satconverter)
college["ACT"] = college["*SAT or ACT"].apply(actconverter)

In [9]:
#removing columns having nan values
column_list = college.columns
for column_i in column_list:
    college = college.drop(college.index[college[column_i].isin([np.nan])])
print("Data after cleaning :",len(college))

Data after cleaning : 59


In [10]:
#undergrad enrollment normalisation
column_name = "Undergrad. Enrollment"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [11]:
#Admission Rate normalisation
column_name = "Admission Rate"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [12]:
#Student/faculty Ratio normalisation
column_name = "Student/faculty Ratio"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [13]:
#4-year Grad. Rate normalisation
column_name = "4-year Grad. Rate"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [14]:
#6-year Grad. Rate normalisation
column_name = "6-year Grad. Rate"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [15]:
#Total Costs normalisation
column_name = "Total Costs"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [16]:
#Cost After Need-based Aid normalisation
column_name = "Cost After Need-based Aid"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [17]:
#Need Met normalisation
column_name = "Need Met"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [18]:
#Aid From Grants normalisation
column_name = "Aid From Grants"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [19]:
#Cost After Non-Need-Based Aid normalisation
column_name = "Cost After Non-Need-Based Aid"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [20]:
#Non-Need-Based Aid+ normalisation
column_name = "Non-Need-Based Aid+"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [21]:
#Average Debt normalisation
column_name = "Average Debt"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [22]:
#SAT normalisation
column_name = "SAT"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [23]:
#ACT normalisation
column_name = "ACT"

minimum = float(college[column_name].min())
maximum = float(college[column_name].max())
college[column_name] = college[column_name].apply(normalisation)

In [24]:
college = college.drop(columns=['*SAT or ACT'])

In [25]:
print(college.loc[3])

Overall Rank                                      4
School                           Swarthmore College
State                                            PA
Undergrad. Enrollment                     0.0724474
Admission Rate                             0.128205
Student/faculty Ratio                      0.294118
4-year Grad. Rate                          0.966292
6-year Grad. Rate                          0.252632
Quality Rank                                      4
Total Costs                                0.760363
Cost After Need-based Aid                  0.411724
Need Met                                        0.8
Aid From Grants                                0.58
Cost After Non-Need-Based Aid             0.0882829
Non-Need-Based Aid+                            0.01
Average Debt                               0.452174
Cost Rank                                        21
SAT                                            0.45
ACT                                            0.43
Name: 3, dty

# Information Gain

In [61]:
#Columns without ranks
column_list = ['Undergrad. Enrollment', 'Admission Rate', 'Student/faculty Ratio', '4-year Grad. Rate', 
               'Total Costs','Cost After Need-based Aid', 'Need Met', 'Aid From Grants',
               'Cost After Non-Need-Based Aid', 'Non-Need-Based Aid+', 'Average Debt']

In [62]:
#function for evaluating information
def Information(freq_value):
    inf = 0
    sum_val = sum(freq_value)
    for freq in freq_value:
        prob = freq / sum_val
        if prob != 0:
            inf = inf - prob * math.log(prob, 2)
    return inf

In [63]:
#function for evaluating entropy
def Entropy(prob, inf):
    entropy_val = 0
    for i in range(len(prob)):
        entropy_val += prob[i] * inf[i]
    return entropy_val

In [64]:
state_count = college['State'].value_counts().tolist()

In [65]:
state_information = Information(state_count)
print('Information = ', state_information)

Information =  4.045413340232901


In [67]:
category_ranges = [(0, 0.33), (0.34, 0.67), (0.68, 1)]

for col in column_list :
    infos = [] 
    probs = [] 
    for minimum, maximum in category_ranges:
        # frequencies of records satisfying the category and state
        freq_list = [] 
        category = college[col].between( minimum, maximum, inclusive=True)
        for state in college['State'].unique() :
            state_rec = category.loc[college['State'] == state].tolist()
            freq = sum(state_rec)
            freq_list.append(freq)
        
        infos.append(Information(freq_list))
        probs.append(sum(freq_list) / len(college))
    
    # Calculate entropy for this column
    entrpy = Entropy(probs, infos)
    gain = state_information - entrpy
    print("Gain({}) = {}".format(col, gain))

Gain(Undergrad. Enrollment) = 0.32092490940388974
Gain(Admission Rate) = 0.5405022103042527
Gain(Student/faculty Ratio) = 0.3453555656376115
Gain(4-year Grad. Rate) = 0.4288020592904811
Gain(Total Costs) = 0.9703103476744439
Gain(Cost After Need-based Aid) = 0.4130737055123621
Gain(Need Met) = 0.24367615249043606
Gain(Aid From Grants) = 0.3052452271606141
Gain(Cost After Non-Need-Based Aid) = 0.609044293251022
Gain(Non-Need-Based Aid+) = 0.4875826007501036
Gain(Average Debt) = 0.543601539010218


In [68]:
#Rank Columns
column_list = ['Overall Rank', 'Cost Rank', 'Quality Rank']

In [72]:
category_ranges = [(1, 33), (34, 67), (68, 100)]
for col in column_list :
    infos = [] 
    probs = [] 
    for minimum, maximum in category_ranges:
        # frequencies of records satisfying the category and state
        freq_list = [] 
        category = college[col].between( minimum, maximum, inclusive=True)
        for state in college['State'].unique() :
            state_rec = category.loc[college['State'] == state].tolist()
            freq = sum(state_rec)
            freq_list.append(freq)
        infos.append(Information(freq_list))
        probs.append(sum(freq_list) / len(college))
    
    # Calculate entropy for this column
    entrpy = Entropy(probs, infos)
    gain = state_information - entrpy
    print("Gain({}) = {}".format(col, gain))

Gain(Overall Rank) = 0.6545869443348256
Gain(Cost Rank) = 0.6604397204024708
Gain(Quality Rank) = 0.6555037878503662
