In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

import statsmodels.api as sm

np.random.seed(12345)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
data = pd.read_excel('ENROLLMENT_DATA_F21.xlsx')

In [3]:
data.head()

Unnamed: 0,Target_Enroll,avg_income,CAMPUS_VISIT,CONTACT_CODE1,Contact_Date,Contact_Month,Contact_Year,distance,ETHNICITY,hscrat,...,telecq,TERRITORY,TOTAL_CONTACTS,TRAVEL_INIT_CNTCTS,Total,AllocProportion,SampleSize,ActualProportion,SelectionProb,SamplingWeight
0,0,,0,EML,1,Sep,2012,,,0.037652,...,,N,1,0,2580,0.5,2322,0.5,0.9,1.111111
1,0,,0,SAT,12,Feb,2014,,N,0.037652,...,,N,1,0,2580,0.5,2322,0.5,0.9,1.111111
2,0,,0,C01,16,Jan,2015,,C,0.037652,...,,N,1,0,2580,0.5,2322,0.5,0.9,1.111111
3,0,,0,EML,11,Mar,2015,,,0.037652,...,,N,1,0,2580,0.5,2322,0.5,0.9,1.111111
4,0,,0,TFL,28,Mar,2015,,B,0.037652,...,,N,2,0,2580,0.5,2322,0.5,0.9,1.111111


In [4]:
data.columns

Index(['Target_Enroll', 'avg_income', 'CAMPUS_VISIT', 'CONTACT_CODE1',
       'Contact_Date', 'Contact_Month', 'Contact_Year', 'distance',
       'ETHNICITY', 'hscrat', 'ID', 'init_span', 'Instate', 'int1rat',
       'int2rat', 'interest', 'IRSCHOOL', 'LEVEL_YEAR', 'mailq', 'premiere',
       'REFERRAL_CNTCTS', 'satscore', 'SELF_INIT_CNTCTS', 'sex',
       'SOLICITED_CNTCTS', 'telecq', 'TERRITORY', 'TOTAL_CONTACTS',
       'TRAVEL_INIT_CNTCTS', 'Total', 'AllocProportion', 'SampleSize',
       'ActualProportion', 'SelectionProb', 'SamplingWeight'],
      dtype='object')

In [5]:
data.describe()

Unnamed: 0,Target_Enroll,avg_income,CAMPUS_VISIT,Contact_Date,Contact_Year,distance,hscrat,ID,init_span,int1rat,...,SOLICITED_CNTCTS,telecq,TOTAL_CONTACTS,TRAVEL_INIT_CNTCTS,Total,AllocProportion,SampleSize,ActualProportion,SelectionProb,SamplingWeight
count,4644.0,3961.0,4644.0,4644.0,4644.0,4042.0,4644.0,4644.0,4644.0,4644.0,...,4644.0,1895.0,4644.0,4644.0,4644.0,4644.0,4644.0,4644.0,4644.0,4644.0
mean,0.5,53458.724817,0.149871,15.737295,2014.61348,316.31265,0.088487,44503.237941,19.182171,0.045547,...,0.585487,2.144063,4.422481,0.447674,2580.0,0.5,2322.0,0.5,0.9,1.111111
std,0.500054,23003.208372,0.374646,8.81325,0.722727,368.131106,0.145121,22305.093812,9.257648,0.03627,...,0.764699,0.807721,3.480882,0.670328,0.0,0.0,0.0,0.0,5.562816e-14,1.132549e-13
min,0.0,9783.0,0.0,1.0,2010.0,0.790555,0.0,32.0,-216.0,0.0,...,0.0,1.0,1.0,0.0,2580.0,0.5,2322.0,0.5,0.9,1.111111
25%,0.0,35544.0,0.0,7.0,2014.0,102.108996,0.024096,28013.75,12.0,0.020906,...,0.0,2.0,1.0,0.0,2580.0,0.5,2322.0,0.5,0.9,1.111111
50%,0.5,48589.0,0.0,16.0,2015.0,159.887072,0.052632,45662.0,18.0,0.04927,...,0.0,2.0,3.0,0.0,2580.0,0.5,2322.0,0.5,0.9,1.111111
75%,1.0,68458.0,0.0,23.0,2015.0,371.084211,0.095238,62609.25,23.0,0.04927,...,1.0,2.0,7.0,1.0,2580.0,0.5,2322.0,0.5,0.9,1.111111
max,1.0,200001.0,2.0,31.0,2016.0,3820.892094,1.0,82327.0,72.0,1.0,...,9.0,4.0,28.0,5.0,2580.0,0.5,2322.0,0.5,0.9,1.111111


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4644 entries, 0 to 4643
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Target_Enroll       4644 non-null   int64  
 1   avg_income          3961 non-null   float64
 2   CAMPUS_VISIT        4644 non-null   int64  
 3   CONTACT_CODE1       4639 non-null   object 
 4   Contact_Date        4644 non-null   int64  
 5   Contact_Month       4644 non-null   object 
 6   Contact_Year        4644 non-null   int64  
 7   distance            4042 non-null   float64
 8   ETHNICITY           4107 non-null   object 
 9   hscrat              4644 non-null   float64
 10  ID                  4644 non-null   int64  
 11  init_span           4644 non-null   int64  
 12  Instate             4644 non-null   object 
 13  int1rat             4644 non-null   float64
 14  int2rat             4644 non-null   float64
 15  interest            4644 non-null   int64  
 16  IRSCHO

# Data Preprocessing

In [7]:
catcols = data.select_dtypes(['object']).columns

In [8]:
catcols

Index(['CONTACT_CODE1', 'Contact_Month', 'ETHNICITY', 'Instate', 'IRSCHOOL',
       'LEVEL_YEAR', 'TERRITORY'],
      dtype='object')

In [9]:
# Turning objects into floats

for i in data.columns:
    if data[i].dtype == 'object':
        data[i]=data[i].str.decode('utf-8')

In [10]:
#Missing Values

data.isnull().sum()

Target_Enroll            0
avg_income             683
CAMPUS_VISIT             0
CONTACT_CODE1         4644
Contact_Date             0
Contact_Month         4644
Contact_Year             0
distance               602
ETHNICITY             4644
hscrat                   0
ID                       0
init_span                0
Instate               4644
int1rat                  0
int2rat                  0
interest                 0
IRSCHOOL              4644
LEVEL_YEAR            4644
mailq                    0
premiere                 0
REFERRAL_CNTCTS          0
satscore              1704
SELF_INIT_CNTCTS         0
sex                    111
SOLICITED_CNTCTS         0
telecq                2749
TERRITORY             4644
TOTAL_CONTACTS           0
TRAVEL_INIT_CNTCTS       0
Total                    0
AllocProportion          0
SampleSize               0
ActualProportion         0
SelectionProb            0
SamplingWeight           0
dtype: int64

In [16]:
#imputing missing values

for i in data.columns:
    if data[i].dtype =="float64":
        data[i].fillna(data[i].mean(), inplace=True)
    else:
        data[i].fillna(data[i].mode(), inplace=True)    

In [17]:
data.isnull().sum()

Target_Enroll            0
avg_income               0
CAMPUS_VISIT             0
CONTACT_CODE1         4644
Contact_Date             0
Contact_Month         4644
Contact_Year             0
distance                 0
ETHNICITY             4644
hscrat                   0
ID                       0
init_span                0
Instate               4644
int1rat                  0
int2rat                  0
interest                 0
IRSCHOOL              4644
LEVEL_YEAR            4644
mailq                    0
premiere                 0
REFERRAL_CNTCTS          0
satscore                 0
SELF_INIT_CNTCTS         0
sex                      0
SOLICITED_CNTCTS         0
telecq                   0
TERRITORY             4644
TOTAL_CONTACTS           0
TRAVEL_INIT_CNTCTS       0
Total                    0
AllocProportion          0
SampleSize               0
ActualProportion         0
SelectionProb            0
SamplingWeight           0
dtype: int64

In [19]:
# show the levels of the categorical variables

data[catcols].nunique()

CONTACT_CODE1    0
Contact_Month    0
ETHNICITY        0
Instate          0
IRSCHOOL         0
LEVEL_YEAR       0
TERRITORY        0
dtype: int64

In [22]:
#Dropping IRSCHOOL
data3 = data.drop(['IRSCHOOL', 'CONTACT_CODE1', 'Contact_Month', 'ETHNICITY', 'Instate', 'LEVEL_YEAR', 'TERRITORY'], axis=1)

In [23]:
data3.head()

Unnamed: 0,Target_Enroll,avg_income,CAMPUS_VISIT,Contact_Date,Contact_Year,distance,hscrat,ID,init_span,int1rat,...,SOLICITED_CNTCTS,telecq,TOTAL_CONTACTS,TRAVEL_INIT_CNTCTS,Total,AllocProportion,SampleSize,ActualProportion,SelectionProb,SamplingWeight
0,0,53458.724817,0,1,2012,316.31265,0.037652,32,48,0.017183,...,0,2.144063,1,0,2580,0.5,2322,0.5,0.9,1.111111
1,0,53458.724817,0,12,2014,316.31265,0.037652,51,31,0.017183,...,0,2.144063,1,0,2580,0.5,2322,0.5,0.9,1.111111
2,0,53458.724817,0,16,2015,316.31265,0.037652,120,20,0.017183,...,1,2.144063,1,0,2580,0.5,2322,0.5,0.9,1.111111
3,0,53458.724817,0,11,2015,316.31265,0.037652,151,18,0.017183,...,0,2.144063,1,0,2580,0.5,2322,0.5,0.9,1.111111
4,0,53458.724817,0,28,2015,316.31265,0.037652,160,18,0.017183,...,0,2.144063,2,0,2580,0.5,2322,0.5,0.9,1.111111


In [24]:
#Splitting data 

X= data3.drop(['Target_Enroll'], axis=1)
Y= data3['Target_Enroll'].astype('category')

In [25]:
#Split data into 70/30

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

# Q1 Logisitic Regression

In [26]:
log_model= LogisticRegression(penalty='l1', solver='liblinear', C=1e9).fit(X_train, y_train)

In [27]:
accuracy_score(log_model.predict(X_test), y_test)

0.8995695839311334

# Q2 Stepwise Regression

In [36]:
#This is the stepwise regression function

def stepwise_selection(data3, target, SL_in = 0.05, SL_out = 0.05):
    initial_features = data3.columns.tolist()
    best_features=[]
    
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features, dtype='float64')
        
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]])).fit()
            new_pval[new_column]= model.pvalues[new_column]
        min_p_value = new_pval.min()
        
        if(min_p_value < SL_in):
            best_features.append(new_pval.idxmin())
            
            while(len(best_features)>0):
                best_features_with_constant = sm.add_constant(data[best_features])
                p_values = sm.OLS(target, best_features_with_constant).fit().pvalues[1:]
                
                max_p_value = p_values.max()
                if(max_p_value >= SL_out):
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break
    return best_features

In [37]:
X1 = stepwise_selection(X,Y)

In [38]:
#selected variables
print(X1)

['AllocProportion', 'Total', 'SamplingWeight', 'SampleSize', 'SelectionProb', 'SELF_INIT_CNTCTS', 'hscrat', 'telecq', 'ActualProportion', 'distance', 'init_span', 'avg_income', 'interest', 'int1rat', 'CAMPUS_VISIT', 'premiere', 'satscore', 'REFERRAL_CNTCTS', 'int2rat', 'Contact_Year', 'sex', 'mailq', 'TRAVEL_INIT_CNTCTS']


In [41]:
X_stepwise = data3[X1]

In [43]:
#Split that data

X_train, X_test, y_train, y_test = train_test_split(X_stepwise, Y, test_size=0.3)

In [44]:
#logistic modeling

log_model = LogisticRegression(penalty='l1', solver='liblinear', C = 1e9).fit(X_train, y_train)

In [45]:
accuracy_score(log_model.predict(X_test), y_test)

0.9117647058823529

# Q3 Decision Tree 

In [47]:
#Split that data

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

In [49]:
#Decision Tree

DT_model = DecisionTreeClassifier().fit(X_train, y_train)

accuracy_score(DT_model.predict(X_test), y_test)

0.8945480631276901

# Q4 Gradient Boosting

In [50]:
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

#accuracy
accuracy_score(gb_model.predict(X_test), y_test)

0.9404591104734576

# Q5 Random Forest 

In [51]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

#accuracy
accuracy_score(rf_model.predict(X_test), y_test)

0.9383070301291249

# Q6 SVM

Linear Kernel

In [52]:
svm_model = SVC(kernel='linear').fit(X_train, y_train)

#accuracy
accuracy_score(svm_model.predict(X_test), y_test)

0.8923959827833573

Polynomial Kernel

In [53]:
svm_model = SVC(kernel='poly').fit(X_train, y_train)

#accuracy
accuracy_score(svm_model.predict(X_test), y_test)

0.5911047345767575

Gaussian Kernel (RBF)

In [54]:
svm_model = SVC(kernel= 'rbf').fit(X_train, y_train)

accuracy_score(svm_model.predict(X_test), y_test)

0.599713055954089