Question 1 - Logistic Regression Model

In [1]:
#Import libraries 
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
#Read csv file
# The location defaults to the original author's local path; set the
# ENROLLMENT_CSV environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get("ENROLLMENT_CSV",
                           "/Users/HM/Desktop/BAN5753/Exercises/Exercise7/enrollment.csv")
data = pd.read_csv(DATA_PATH)

In [3]:
# Drop the ID column from the raw data set
# (IRSCHOOL and LEVEL_YEAR are dropped in the following two cells).
# The drop mutates `data` in place, so re-running this cell on the same
# kernel raises KeyError because the column is already gone.
data.drop(columns=['ID'], inplace=True)

In [4]:
# Drop the IRSCHOOL column from the raw data set (in place).
data.drop(columns=['IRSCHOOL'], inplace=True)

In [5]:
# Drop the LEVEL_YEAR column from the raw data set (in place).
data.drop(columns=['LEVEL_YEAR'], inplace=True)

In [6]:
# Nominal (categorical) columns that are later expanded into dummy variables
NominalVariables = [
    'ETHNICITY',
    'TERRITORY',
    'CONTACT_CODE1',
    'Contact_Date',
    'Contact_Month',
    'Contact_Year',
]

In [7]:
# Work on an independent copy (df) so the imputation below leaves `data` untouched
# (DataFrame.copy() is a deep copy by default).
df = data.copy()

In [8]:
# Recode Instate as a 0/1 indicator: 'Y' -> 1, every other value -> 0.
# Missing values do not equal 'Y', so they are coded 0 as well.
df['Instate'] = np.where(df['Instate'].eq('Y'), 1, 0)

In [9]:
#Imputation of nominal variables with Mode value
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on the df[column] selection is a chained in-place
# update: it raises a FutureWarning on pandas 2.x and leaves df unchanged
# when Copy-on-Write is enabled.
# NOTE(review): the mode is computed on the full data set, i.e. before the
# train/validation split performed further down.
for column in ['CONTACT_CODE1','ETHNICITY', 'sex']:
    df[column] = df[column].fillna(df[column].mode()[0])

In [10]:
#Imputation of interval variables with Median value
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on the df[column] selection is a chained in-place
# update: it raises a FutureWarning on pandas 2.x and leaves df unchanged
# when Copy-on-Write is enabled.
# NOTE(review): the median is computed on the full data set, i.e. before the
# train/validation split performed further down.
for column in ['avg_income', 'distance','satscore','telecq']:
    df[column] = df[column].fillna(df[column].median())

In [11]:
# One-hot encode every column listed in NominalVariables; the first level of
# each variable is dropped (drop_first=True) and serves as the reference level.
df = pd.get_dummies(data=df, columns=NominalVariables, drop_first=True)

In [12]:
#Logistic Regression Function
def lr(x,y):
    """Fit an L2-penalised logistic regression and return the fitted model.

    x -- feature matrix passed to ``fit``
    y -- target values passed to ``fit``

    The estimator is created with ``random_state=12345``; all other
    hyper-parameters are left at the library defaults.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return LogisticRegression(penalty='l2', random_state=12345).fit(x, y)

In [13]:
#Accuracy Function
def accuracy(trained_model, features, targets):
    """Return ``trained_model.score(features, targets)``.

    trained_model -- any fitted object exposing a ``score(X, y)`` method
    features      -- feature matrix to score on
    targets       -- true labels for ``features``
    """
    return trained_model.score(features, targets)

In [14]:
# Data partitioning: 70% training / 30% validation, reproducible via random_state.
# Every column except 'Target' is used as a feature.
df_x_train, df_x_val, df_y_train, df_y_val = train_test_split(
    df.loc[:, df.columns != 'Target'],
    df['Target'],
    test_size=0.30,
    random_state=12345,
)

In [15]:
# Fit the logistic regression model on the training partition
lr1 = lr(x=df_x_train, y=df_y_train)



In [16]:
# Score the logistic regression model on the training partition
acc1_train = accuracy(trained_model=lr1, features=df_x_train, targets=df_y_train)

In [17]:
# Training-set accuracy of the logistic regression model (lr1)
acc1_train

0.9066998892580288

In [18]:
# Score the logistic regression model on the validation partition
acc1_val = accuracy(trained_model=lr1, features=df_x_val, targets=df_y_val)

In [19]:
# Validation-set accuracy of the logistic regression model (lr1)
acc1_val

0.8901808785529716

Question 2 - Decision Tree Model

In [20]:
# Independent copy (df1) of the data set for the tree-based models
# (DataFrame.copy() is a deep copy by default).
df1 = data.copy()

In [21]:
# Remove every row that contains at least one missing value.
# Unlike df (which is imputed), df1 can therefore hold fewer rows, so the
# models trained on df1 are evaluated on a different sample than those trained on df.
df1 = df1.dropna()

In [22]:
# Recode Instate as a 0/1 indicator: 'Y' -> 1, every other value -> 0
df1['Instate'] = np.where(df1['Instate'].eq('Y'), 1, 0)

In [23]:
# One-hot encode every column listed in NominalVariables; the first level of
# each variable is dropped (drop_first=True) and serves as the reference level.
df1 = pd.get_dummies(data=df1, columns=NominalVariables, drop_first=True)

In [24]:
#Decision Tree Function
def dt(x,y,criterion):
    """Fit a decision tree classifier and return the fitted model.

    x         -- feature matrix passed to ``fit``
    y         -- target values passed to ``fit``
    criterion -- split-quality measure forwarded to DecisionTreeClassifier

    The tree is created with ``random_state=12345``; no other
    hyper-parameter is set here.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return DecisionTreeClassifier(criterion=criterion, random_state=12345).fit(x, y)

In [25]:
# Data partitioning: 70% training / 30% validation, reproducible via random_state.
# These partitions are shared by the Decision Tree, Random Forest and
# Gradient Boosting models. Every column except 'Target' is used as a feature.
df1_x_train, df1_x_val, df1_y_train, df1_y_val = train_test_split(
    df1.loc[:, df1.columns != 'Target'],
    df1['Target'],
    test_size=0.30,
    random_state=12345,
)

In [26]:
# Fit the decision tree on the training partition using the entropy criterion
dt_model = dt(x=df1_x_train, y=df1_y_train, criterion='entropy')

In [27]:
# Score the decision tree on the training partition
acc2_train = accuracy(trained_model=dt_model, features=df1_x_train, targets=df1_y_train)

In [28]:
# Training-set accuracy of the decision tree (dt_model)
acc2_train

1.0

In [29]:
# Score the decision tree on the validation partition
acc2_val = accuracy(trained_model=dt_model, features=df1_x_val, targets=df1_y_val)

In [30]:
# Validation-set accuracy of the decision tree (dt_model)
acc2_val

0.87279843444227

Question 3 - Support Vector Machine

In [31]:
# Independent copy (df2) of the data set for the SVM models
# (DataFrame.copy() is a deep copy by default).
df2 = data.copy()

In [32]:
# Columns treated as insignificant; they are removed before fitting the SVMs
InsignificantVariables = [
    'avg_income', 'CONTACT_CODE1', 'Contact_Date', 'Contact_Month',
    'init_span', 'Instate', 'int1rat', 'REFERRAL_CNTCTS',
    'SELF_INIT_CNTCTS', 'sex', 'telecq', 'TERRITORY',
    'TRAVEL_INIT_CNTCTS',
]

In [33]:
# Remove the insignificant columns from df2 (in place)
df2.drop(columns=InsignificantVariables, inplace=True)

In [34]:
#Imputation of nominal variable with Mode value
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on the df2[column] selection is a chained in-place
# update: it raises a FutureWarning on pandas 2.x and leaves df2 unchanged
# when Copy-on-Write is enabled.
for column in ['ETHNICITY']:
    df2[column] = df2[column].fillna(df2[column].mode()[0])

In [35]:
#Imputation of interval variables with Median value
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on the df2[column] selection is a chained in-place
# update: it raises a FutureWarning on pandas 2.x and leaves df2 unchanged
# when Copy-on-Write is enabled.
for column in ['distance','satscore']:
    df2[column] = df2[column].fillna(df2[column].median())

In [36]:
# Nominal columns remaining in df2 that are expanded into dummy variables
NominalVariables_SVM = [
    'ETHNICITY',
    'Contact_Year',
]

In [37]:
# One-hot encode every column listed in NominalVariables_SVM; the first level
# of each variable is dropped (drop_first=True) and serves as the reference level.
df2 = pd.get_dummies(data=df2, columns=NominalVariables_SVM, drop_first=True)

In [38]:
#Support Vector Machine Function
def svm(x, y,kernel):
    """Fit a support vector classifier and return the fitted model.

    x      -- feature matrix passed to ``fit``
    y      -- target values passed to ``fit``
    kernel -- kernel name forwarded to SVC (the notebook uses 'linear',
              'rbf' and 'sigmoid')

    The classifier is created with ``random_state=12345``; no other
    hyper-parameter is set here.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return SVC(kernel=kernel, random_state=12345).fit(x, y)

In [39]:
# Data partitioning: 70% training / 30% validation, reproducible via random_state.
# Every column except 'Target' is used as a feature.
df2_x_train, df2_x_val, df2_y_train, df2_y_val = train_test_split(
    df2.loc[:, df2.columns != 'Target'],
    df2['Target'],
    test_size=0.30,
    random_state=12345,
)

In [40]:
# SVM model with a linear kernel
sv1 = svm(x=df2_x_train, y=df2_y_train, kernel='linear')

In [41]:
# Score the linear-kernel SVM on the training partition
acc_sv1_train = accuracy(trained_model=sv1, features=df2_x_train, targets=df2_y_train)

In [42]:
# Training-set accuracy of the linear-kernel SVM (sv1)
acc_sv1_train

0.8859357696566998

In [43]:
# Score the linear-kernel SVM on the validation partition
acc_sv1_val = accuracy(trained_model=sv1, features=df2_x_val, targets=df2_y_val)

In [44]:
# Validation-set accuracy of the linear-kernel SVM (sv1)
acc_sv1_val

0.8837209302325582

In [45]:
# SVM model with the RBF kernel
sv3 = svm(x=df2_x_train, y=df2_y_train, kernel='rbf')



In [46]:
# Score the RBF-kernel SVM on the training partition
acc_sv3_train = accuracy(trained_model=sv3, features=df2_x_train, targets=df2_y_train)

In [47]:
# Training-set accuracy of the RBF-kernel SVM (sv3)
acc_sv3_train

0.9706533776301218

In [48]:
# Score the RBF-kernel SVM on the validation partition
acc_sv3_val = accuracy(trained_model=sv3, features=df2_x_val, targets=df2_y_val)

In [49]:
# Validation-set accuracy of the RBF-kernel SVM (sv3)
acc_sv3_val

0.8456072351421189

In [50]:
# SVM model with the sigmoid kernel
# NOTE(review): the features are passed to the SVM without scaling
# (StandardScaler is imported but never used) - this may explain the
# near-chance accuracy recorded for this kernel; TODO confirm.
sv4 = svm(x=df2_x_train, y=df2_y_train, kernel='sigmoid')



In [51]:
# Score the sigmoid-kernel SVM on the training partition
acc_sv4_train = accuracy(trained_model=sv4, features=df2_x_train, targets=df2_y_train)

In [52]:
# Training-set accuracy of the sigmoid-kernel SVM (sv4)
acc_sv4_train

0.5008305647840532

In [53]:
# Score the sigmoid-kernel SVM on the validation partition
acc_sv4_val = accuracy(trained_model=sv4, features=df2_x_val, targets=df2_y_val)

In [54]:
# Validation-set accuracy of the sigmoid-kernel SVM (sv4)
acc_sv4_val

0.49806201550387597

Question 4 - Random Forest Model

In [55]:
#Random Forest Model Function
def rf(x, y,estimator):
    """Fit a random forest classifier and return the fitted model.

    x         -- feature matrix passed to ``fit``
    y         -- target values passed to ``fit``
    estimator -- number of trees (forwarded as ``n_estimators``)

    The forest uses the entropy split criterion and ``random_state=12345``.
    """
    forest = RandomForestClassifier(
        n_estimators=estimator,
        criterion='entropy',
        random_state=12345,
    )
    # fit() returns the estimator itself
    return forest.fit(x, y)

In [56]:
# First random forest model: 100 trees
rf1 = rf(x=df1_x_train, y=df1_y_train, estimator=100)

In [57]:
# Score the 100-tree random forest on the training partition
acc_rf1_train = accuracy(trained_model=rf1, features=df1_x_train, targets=df1_y_train)

In [58]:
# Training-set accuracy of the 100-tree random forest (rf1)
acc_rf1_train

1.0

In [59]:
# Score the 100-tree random forest on the validation partition
acc_rf1_val = accuracy(trained_model=rf1, features=df1_x_val, targets=df1_y_val)

In [60]:
# Validation-set accuracy of the 100-tree random forest (rf1)
acc_rf1_val

0.9021526418786693

In [61]:
# Second random forest model: 200 trees
rf2 = rf(x=df1_x_train, y=df1_y_train, estimator=200)

In [62]:
# Score the 200-tree random forest on the training partition
acc_rf2_train = accuracy(trained_model=rf2, features=df1_x_train, targets=df1_y_train)

In [63]:
# Training-set accuracy of the 200-tree random forest (rf2)
acc_rf2_train

1.0

In [64]:
# Score the 200-tree random forest on the validation partition
acc_rf2_val = accuracy(trained_model=rf2, features=df1_x_val, targets=df1_y_val)

In [65]:
# Validation-set accuracy of the 200-tree random forest (rf2)
acc_rf2_val

0.9041095890410958

Question 5 - Gradient Boosting Model

In [66]:
#Gradient Boosting Function
def gb(x,y,estimator,n_iter_no_change=None):
    """Fit a gradient boosting classifier and return the fitted model.

    x                -- feature matrix passed to ``fit``
    y                -- target values passed to ``fit``
    estimator        -- number of boosting stages (forwarded as ``n_estimators``)
    n_iter_no_change -- optional early-stopping patience. scikit-learn only
                        uses ``validation_fraction`` when this is an integer;
                        with the default ``None`` no early stopping happens and
                        the hard-coded ``validation_fraction=0.4`` has no effect,
                        which is exactly the previous behaviour of this function.

    The model uses ``learning_rate=0.1``, ``max_depth=2`` and
    ``random_state=12345``.
    """
    GradientBoosting_model = GradientBoostingClassifier(n_estimators=estimator, learning_rate=0.1,max_depth=2,
                                                        validation_fraction=0.4,
                                                        n_iter_no_change=n_iter_no_change,
                                                        random_state=12345)
    GradientBoosting_model.fit(x, y)
    return GradientBoosting_model

In [67]:
# Fit the gradient boosting model with 50 boosting stages
gb_model = gb(x=df1_x_train, y=df1_y_train, estimator=50)

In [68]:
# Score the gradient boosting model on the training partition
acc_gb_train = accuracy(trained_model=gb_model, features=df1_x_train, targets=df1_y_train)

In [69]:
# Training-set accuracy of the gradient boosting model (gb_model)
acc_gb_train

0.9303106633081444

In [70]:
# Score the gradient boosting model on the validation partition
acc_gb_val = accuracy(trained_model=gb_model, features=df1_x_val, targets=df1_y_val)

In [71]:
# Validation-set accuracy of the gradient boosting model (gb_model)
acc_gb_val

0.9119373776908023