In [4]:
import pandas as pd
import os
import itertools
df = pd.read_csv('College.csv', sep=',')

print(df.shape)
df.head(1)

(777, 19)


Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60


In [5]:
df.Private.value_counts()

Yes    565
No     212
Name: Private, dtype: int64

In [8]:

df

#### Predict collage: Private/Public

#### Baseline: Logistic regression

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.utils import shuffle


In [103]:

df = pd.read_csv('College.csv', sep=',')

from sklearn.preprocessing import LabelEncoder
le =  LabelEncoder()
y =  le.fit_transform(df.loc[ :, 'Private'])
X =  df.iloc[ :,2:26]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

LGR = LogisticRegression(penalty='l2', 
                         dual=False, 
                         tol=0.0001, 
                         C=1.0, 
                         fit_intercept=True, 
                         intercept_scaling=1, 
                         class_weight=None, 
                         random_state=None, 
                         solver='lbfgs', 
                         max_iter=100, 
                         multi_class='warn', 
                         verbose=0, 
                         warm_start=False, 
                         n_jobs=None).fit(X_train, y_train)

accuracy_train = round(LGR.score(X_train, y_train), 4)
accuracy_test  = round(LGR.score(X_test,  y_test), 4)
predictions = LGR.predict(X_test)
probabilities = LGR.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

Accuracy - train: 0.9347
Accuracy - test:  0.9436

Confusion matrix:
TP: 136   TN: 48
FP: 5     FN: 6

Classification report: 

               precision    recall  f1-score   support

           0       0.89      0.91      0.90        53
           1       0.96      0.96      0.96       142

   micro avg       0.94      0.94      0.94       195
   macro avg       0.93      0.93      0.93       195
weighted avg       0.94      0.94      0.94       195



  return self.partial_fit(X, y)


#### Remove outliers

In [104]:
def remove_outlier(df_in, col_name):
    #print('Before', df_in[col_name].min(), df_in[col_name].max())
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3-q1 #Interquartile range
    fence_low  = q1-6.4*iqr
    fence_high = q3+6.4*iqr
    #print(iqr,fence_low, fence_high)
    df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
    #print('After', df_out[col_name].min(), df_out[col_name].max())
    return df_out

df = pd.read_csv('College.csv', sep=',')
for column in df.columns[2:]:
    df = remove_outlier(df, column)

y =  le.fit_transform(df.loc[ :, 'Private'])
X =  df.iloc[ :,2:] # (df.shape[1]-2)]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

LGR = LogisticRegression(penalty='l1', 
                         dual=False, 
                         tol=0.0001, 
                         C=1.6, 
                         fit_intercept=True, 
                         intercept_scaling=1, 
                         class_weight=None, 
                         random_state=None, 
                         solver='liblinear', 
                         max_iter=100, 
                         multi_class='warn', 
                         verbose=0, 
                         warm_start=False, 
                         n_jobs=None).fit(X_train, y_train)

accuracy_train = round(LGR.score(X_train, y_train), 4)
accuracy_test  = round(LGR.score(X_test,  y_test), 4)
predictions = LGR.predict(X_test)
probabilities = LGR.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

Accuracy - train: 0.943
Accuracy - test:  0.9786

Confusion matrix:
TP: 136   TN: 47
FP: 2     FN: 2

Classification report: 

               precision    recall  f1-score   support

           0       0.96      0.96      0.96        49
           1       0.99      0.99      0.99       138

   micro avg       0.98      0.98      0.98       187
   macro avg       0.97      0.97      0.97       187
weighted avg       0.98      0.98      0.98       187



  return self.partial_fit(X, y)


#### Balance datasets

In [105]:
df.Private.value_counts()

Yes    555
No     193
Name: Private, dtype: int64

In [106]:
df.Private = le.fit_transform(df.Private)
negative = df[df.Private == 0]
positive = df[df.Private == 1].sample(len(negative))

dfb = pd.concat([negative,positive]).reset_index(drop=True)
#dfb =  add_features(dfb, 26)
dfb =  shuffle(dfb)

print('dfb shape:', dfb.shape)
print('Positives:', len(positive))
print('Negatives:', len(negative))
print('Sample cost:', len(df)-len(dfb))   

y =  dfb.loc[ :, 'Private']
X =  dfb.iloc[ :,2:]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

LGR = LogisticRegression(penalty='l1', 
                         dual=False, 
                         tol=0.0001, 
                         C=1.6, 
                         fit_intercept=True, 
                         intercept_scaling=1, 
                         class_weight=None, 
                         random_state=None, 
                         solver='liblinear', 
                         max_iter=100, 
                         multi_class='warn', 
                         verbose=0, 
                         warm_start=False, 
                         n_jobs=None).fit(X_train, y_train)

accuracy_train = round(LGR.score(X_train, y_train), 4)
accuracy_test  = round(LGR.score(X_test,  y_test), 4)
predictions = LGR.predict(X_test)
probabilities = LGR.predict_proba(X_test)
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

print('Accuracy - train: {}'.format(accuracy_train))
print('Accuracy - test:  {}'.format(accuracy_test))
print('\nConfusion matrix:\nTP: {:<5} TN: {}\nFP: {:<5} FN: {}'.format(tp, tn, fp, fn))
print('\nClassification report: \n\n', classification_report(y_test, predictions))

dfb shape: (386, 19)
Positives: 193
Negatives: 193
Sample cost: 362
Accuracy - train: 0.9377
Accuracy - test:  0.9175

Confusion matrix:
TP: 39    TN: 50
FP: 3     FN: 5

Classification report: 

               precision    recall  f1-score   support

           0       0.91      0.94      0.93        53
           1       0.93      0.89      0.91        44

   micro avg       0.92      0.92      0.92        97
   macro avg       0.92      0.91      0.92        97
weighted avg       0.92      0.92      0.92        97



  return self.partial_fit(X, y)


In [94]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1000, max_depth=10000).fit(X_train, y_train) 

print(clf.score(X_test, y_test))

0.9484536082474226


In [95]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
svc = svm.SVC(gamma="scale")
parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10, 20]}
clf = GridSearchCV(svc, parameters, cv=10)
clf.fit(X_train, y_train) 

for result in sorted(clf.cv_results_.keys()):
    print(result, clf.cv_results_[result])

mean_fit_time [0.00135164 0.0026942  0.0010067  0.00174105 0.00114443 0.00145049
 0.00134754 0.00149176]
mean_score_time [0.00036571 0.00047414 0.00030594 0.00036912 0.00029843 0.00032694
 0.00029528 0.00032623]
mean_test_score [0.89273356 0.90311419 0.9100346  0.89965398 0.9100346  0.9100346
 0.91349481 0.92387543]
mean_train_score [0.90309991 0.90118712 0.91849202 0.91809999 0.93194772 0.93156309
 0.93502023 0.94693896]
param_C [0.1 0.1 1 1 10 10 20 20]
param_kernel ['linear' 'rbf' 'linear' 'rbf' 'linear' 'rbf' 'linear' 'rbf']
params [{'C': 0.1, 'kernel': 'linear'}, {'C': 0.1, 'kernel': 'rbf'}, {'C': 1, 'kernel': 'linear'}, {'C': 1, 'kernel': 'rbf'}, {'C': 10, 'kernel': 'linear'}, {'C': 10, 'kernel': 'rbf'}, {'C': 20, 'kernel': 'linear'}, {'C': 20, 'kernel': 'rbf'}]
rank_test_score [8 6 3 7 3 3 2 1]
split0_test_score [0.83333333 0.86666667 0.9        0.86666667 0.86666667 0.86666667
 0.9        0.9       ]
split0_train_score [0.9034749  0.8957529  0.91891892 0.90733591 0.93050193 0.9



In [96]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC(gamma="scale")
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(X_train, y_train) 
clf.score(X_test, y_test) 
for result in sorted(clf.cv_results_.keys()):
    print(result, clf.cv_results_[result])


mean_fit_time [0.00114388 0.00155058 0.00110254 0.00141602]
mean_score_time [0.00037847 0.00050011 0.00035315 0.0004096 ]
mean_test_score [0.89965398 0.9100346  0.91349481 0.92387543]
mean_train_score [0.91954273 0.91176912 0.9334033  0.93509745]
param_C [1 1 10 10]
param_kernel ['linear' 'rbf' 'linear' 'rbf']
params [{'C': 1, 'kernel': 'linear'}, {'C': 1, 'kernel': 'rbf'}, {'C': 10, 'kernel': 'linear'}, {'C': 10, 'kernel': 'rbf'}]
rank_test_score [4 3 2 1]
split0_test_score [0.93220339 0.94915254 0.93220339 0.93220339]
split0_train_score [0.91304348 0.90869565 0.92608696 0.92173913]
split1_test_score [0.88135593 0.88135593 0.89830508 0.93220339]
split1_train_score [0.92173913 0.9173913  0.94782609 0.93478261]
split2_test_score [0.94736842 0.94736842 0.96491228 0.96491228]
split2_train_score [0.9137931  0.90086207 0.91810345 0.93534483]
split3_test_score [0.9122807  0.9122807  0.9122807  0.92982456]
split3_train_score [0.91810345 0.9137931  0.92241379 0.93965517]
split4_test_score [0.8



In [97]:
clf.cv_results_

{'mean_fit_time': array([0.00114388, 0.00155058, 0.00110254, 0.00141602]),
 'std_fit_time': array([2.42524526e-04, 5.46642591e-05, 1.35527875e-04, 1.92186163e-04]),
 'mean_score_time': array([0.00037847, 0.00050011, 0.00035315, 0.0004096 ]),
 'std_score_time': array([2.50271833e-05, 6.90500050e-05, 2.36423730e-05, 2.31671224e-05]),
 'param_C': masked_array(data=[1, 1, 10, 10],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'}],
 'split0_test_score': array([0.93220339, 0.94915254, 0.93220339, 0.93220339]),
 'split1_test_score': array([0.88135593, 0.88135593, 0.89830508, 0.93220339]),
 'split2_test_score': array([0.94736842, 0.94736842, 0

In [98]:
clf.best_estimator_ 

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)