In [1]:
import numpy as np 
import pandas as pd
import random
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,KFold
import sklearn.model_selection as skl_model
from sklearn.preprocessing import OneHotEncoder
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook

## 1. Linear Regression using a simple perceptron 

### For  getting  into  graduate  school,  download  the grad school attribute data set from Files.  

#### a) Define the features that are related to predicting the Chance of Admit, and normalize them. 

In [2]:
df = pd.read_csv("Admission_Predict_Ver1.1.csv")
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [3]:
# drop the serial number
df = df.drop("Serial No.", axis=1)
chance = df["Chance of Admit "]
df = df.drop("Chance of Admit ", axis=1)
#normalize the rest of the data 
df = (df - df.mean()) / df.std()
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,1.817417,1.777086,0.774806,1.136222,1.097845,1.775029,0.885518
1,0.666481,-0.031569,0.774806,0.631683,1.097845,0.485373,0.885518
2,-0.041788,-0.524839,-0.099693,-0.377395,0.017289,-0.953088,0.885518
3,0.489414,0.4617,-0.099693,0.127144,-1.063267,0.154692,0.885518
4,-0.218855,-0.689262,-0.974192,-1.386473,-0.522989,-0.605873,-1.127023


####  b) Fill in the code for a simple perceptron. Initialize your weights and biases between 0 and 0.05 using your random number generator. Use the mean squares error (MSE) to adjust your weights through back-propagation. Fill in the blanks for the feedforward and backpropagation step.

In [4]:
class simple_perceptron():
    def __init__(self,input_dim,output_dim,learning_rate=0.01,activation=lambda x:x,activation_grad=lambda x:1):
        
        self.input_dim=input_dim
        self.output_dim=output_dim
        self.activation=activation
        self.activation_grad=activation_grad
        self.lr=learning_rate
        ### initialize weights between 0 and 0.05 ###
        self.weights=np.random.rand(input_dim,output_dim)*0.05
        self.biases=np.random.rand(1,output_dim)*0.05
        
    def predict(self,X):
        if len(X.shape)==1:
            X=X.reshape((-1,1))
        dim=X.shape[1]
        # Check that the dimension of accepted input data is the same as expected
        if not dim==self.input_dim:
            raise Exception("Expected input size %d, accepted %d!"%(self.input_dim,dim))
        ### Calculate logit and activation ### 
        self.z=X.dot(self.weights)+self.biases
        self.a=self.activation(self.z)
        return self.a
    
    def fit(self,X,y):
        # Transform the single-sample data into 2-dimensional, for the convenience of matrix multiplication
        if len(X.shape)==1:
            X=X.reshape((-1,1))
        if len(y.shape)==1:
            y=y.reshape((-1,1))
        self.predict(X)
        errors=(self.a-y)*self.activation_grad(self.z)
        weights_grad=errors.T.dot(X)
        bias_grad=np.sum(errors,axis=0)
        ### Update weights and biases from the gradient ###
        self.weights -= weights_grad.T*self.lr
        self.biases -= bias_grad*self.lr
        
    def train_on_epoch(self,X,y,batch_size=32):
        # Every time select batch_size samples from the training set, until all data in the training set has been trained once
        order=list(range(X.shape[0]))
        random.shuffle(order)
        n=0
        while n<math.ceil(len(order)/batch_size)-1: # Parts that can fill one batch
            self.fit(X[order[n*batch_size:(n+1)*batch_size]],y[order[n*batch_size:(n+1)*batch_size]])
            n+=1
        # Parts that cannot fill one batch
        self.fit(X[order[n*batch_size:]],y[order[n*batch_size:]])
        
    def evaluate(self,X,y):
         # Transform the single-sample data into 2-dimensional
        if len(X.shape)==1:
            X=X.reshape((1,-1))
        if len(y.shape)==1:
            y=y.reshape((1,-1))
        ### means square error ###
        return np.mean((self.predict(X)-y)**2)
    
    def get_weights(self):
        return (self.weights,self.biases)
    
    def set_weights(self,weights):
        self.weights=weights[0]
        self.biases=weights[1]

#### c) Fill in the code for k-fold validation. Use 80% of the data for training and 20% of the data for testing and do 5-fold validation. Use a learning rate of 0.0001. Are the features good indicators for getting into graduate school? Now remove the GRE scores and do the same test; are GRE scores important?

In [5]:
def Kfold(k,Xs,ys,epochs,learning_rate=0.0001,draw_curve=False):
    # The total number of examples for training the network
    total_num=len(Xs)
    
    # Built in K-fold function in Sci-Kit Learn
    kf=KFold(n_splits=k,shuffle=True)
    # record error for each model
    train_error_all=[]
    test_error_all=[]
    
    for train_selector,test_selector in kf.split(range(total_num)):
        ### Decide training examples and testing examples for this fold ###
        train_Xs=Xs[train_selector]
        test_Xs=Xs[test_selector]
        train_ys=ys[train_selector]
        test_ys=ys[test_selector]

    val_array=[]
    # Split training examples further into training and validation
    train_in,val_in,train_real,val_real=train_test_split(train_Xs,train_ys)
        
    ### Establish the model for simple perceptron here ###
    model=simple_perceptron(Xs.shape[1], 1, learning_rate=learning_rate)
        
    # Save the lowest weights, so that we can recover the best model
    weights = model.get_weights()
    lowest_val_err = np.inf
    for _ in range(epochs):
        # Train model on a number of epochs, and test performance in the validation set
        model.train_on_epoch(train_in,train_real)
        val_err = model.evaluate(val_in,val_real)
        val_array.append(val_err)
        if val_err < lowest_val_err:
            lowest_val_err = val_err
            weights = model.get_weights()
        # The final number of epochs is when the minimum error in validation set occurs    
        final_epochs=np.argmin(val_array)+1
        print("Number of epochs with lowest validation:",final_epochs)
        # Recover the model weight
        model.set_weights(weights)

        # Report result for this fold
        train_error=model.evaluate(train_Xs,train_ys)
        train_error_all.append(train_error)
        test_error=model.evaluate(test_Xs,test_ys)
        test_error_all.append(test_error)
        print("Train error:",train_error)
        print("Test error:",test_error)
        
        if draw_curve:
            plt.figure()
            plt.plot(np.arange(len(val_array))+1,val_array,label='Validation loss')
            plt.xlabel('Epochs')
            plt.ylabel('Loss')
            plt.legend()
            
    print("Final results:")
    print("Training error:%f+-%f"%(np.average(train_error_all),np.std(train_error_all)))
    print("Testing error:%f+-%f"%(np.average(test_error_all),np.std(test_error_all)))
    
    # return the last model
    return model

In [6]:
def show_correlation(xs,ys):
    plt.figure()
    plt.scatter(xs,ys,s=0.5)
    r = [np.min([np.min(xs),np.min(ys)]),np.max([np.max(xs),np.max(ys)])]
    plt.plot(r,r,'r')
    plt.xlabel("Predictions")
    plt.ylabel("Ground truth")
    corr=np.corrcoef([xs,ys])[1,0]
    print("Correlation coefficient:",corr)

#### Testing the code above. Please note that I have assigned False to draw curve because my jupyter notebook kept crashing

In [7]:
import sklearn.model_selection as skl_model
Xs = df.to_numpy() 
ys = chance.to_numpy()

train_feat,test_feat,train_ranking,test_ranking=skl_model.train_test_split(Xs, ys, test_size = 0.20)
res = Kfold(k=5, Xs=train_feat, ys=train_ranking, epochs=1000)

Number of epochs with lowest validation: 1
Train error: 0.5270166307157261
Test error: 0.5337397908318257
Number of epochs with lowest validation: 2
Train error: 0.5047387640916025
Test error: 0.5121574461255333
Number of epochs with lowest validation: 3
Train error: 0.4834924120397273
Test error: 0.4915065622835508
Number of epochs with lowest validation: 4
Train error: 0.4631898958349122
Test error: 0.4716883119145267
Number of epochs with lowest validation: 5
Train error: 0.4438544380434648
Test error: 0.4527853553026307
Number of epochs with lowest validation: 6
Train error: 0.42537611345076093
Test error: 0.4346460851098549
Number of epochs with lowest validation: 7
Train error: 0.407740557351851
Test error: 0.41728675757107697
Number of epochs with lowest validation: 8
Train error: 0.3908987167169507
Test error: 0.40066304664218777
Number of epochs with lowest validation: 9
Train error: 0.37475708514326106
Test error: 0.3846458055901454
Number of epochs with lowest validation: 10

Train error: 0.03788776502505589
Test error: 0.041760238329480795
Number of epochs with lowest validation: 141
Train error: 0.03784757011781972
Test error: 0.04171111313668706
Number of epochs with lowest validation: 142
Train error: 0.03780322696049568
Test error: 0.04165702622742895
Number of epochs with lowest validation: 143
Train error: 0.037762026006772614
Test error: 0.04160651448404005
Number of epochs with lowest validation: 144
Train error: 0.03772440717733437
Test error: 0.04156011048469736
Number of epochs with lowest validation: 145
Train error: 0.037696716942380075
Test error: 0.04152517387904731
Number of epochs with lowest validation: 146
Train error: 0.03766503104832752
Test error: 0.04148553030071666
Number of epochs with lowest validation: 147
Train error: 0.03763199276752768
Test error: 0.041444228742321426
Number of epochs with lowest validation: 148
Train error: 0.03759595963329022
Test error: 0.0413993720656157
Number of epochs with lowest validation: 149
Train e

Train error: 0.03677837735737632
Test error: 0.04011100434162747
Number of epochs with lowest validation: 261
Train error: 0.03677945422791172
Test error: 0.040109942313915
Number of epochs with lowest validation: 261
Train error: 0.03678264963473188
Test error: 0.04011130118889733
Number of epochs with lowest validation: 261
Train error: 0.03677835010229316
Test error: 0.040104111180771354
Number of epochs with lowest validation: 266
Train error: 0.03677689715144786
Test error: 0.04010021507818545
Number of epochs with lowest validation: 267
Train error: 0.03677416641259647
Test error: 0.04009488075318805
Number of epochs with lowest validation: 268
Train error: 0.03677298549218916
Test error: 0.040091313262972925
Number of epochs with lowest validation: 268
Train error: 0.03677594801622985
Test error: 0.04009243896197584
Number of epochs with lowest validation: 270
Train error: 0.036773114415476565
Test error: 0.040087021331342995
Number of epochs with lowest validation: 270
Train er

Train error: 0.036783062773625266
Test error: 0.039892964667970536
Number of epochs with lowest validation: 390
Train error: 0.036782068636900964
Test error: 0.03989073111125736
Number of epochs with lowest validation: 390
Train error: 0.036784782957449656
Test error: 0.03989268882376893
Number of epochs with lowest validation: 390
Train error: 0.03678502262892184
Test error: 0.03989186786511181
Number of epochs with lowest validation: 390
Train error: 0.03678121921579527
Test error: 0.03988646632758594
Number of epochs with lowest validation: 390
Train error: 0.03677775068652756
Test error: 0.0398814772594737
Number of epochs with lowest validation: 403
Train error: 0.036775531757674715
Test error: 0.03987789736067361
Number of epochs with lowest validation: 403
Train error: 0.036781492659997354
Test error: 0.03988353439679107
Number of epochs with lowest validation: 403
Train error: 0.0367847990729513
Test error: 0.039886157268791314
Number of epochs with lowest validation: 403
Train

Train error: 0.036817269293147395
Test error: 0.03982094223829195
Number of epochs with lowest validation: 505
Train error: 0.036819623897814596
Test error: 0.03982310301735252
Number of epochs with lowest validation: 505
Train error: 0.036817035770136855
Test error: 0.039819685213355396
Number of epochs with lowest validation: 505
Train error: 0.036821729950486624
Test error: 0.03982446546950966
Number of epochs with lowest validation: 505
Train error: 0.03682235560482397
Test error: 0.039824608510412134
Number of epochs with lowest validation: 505
Train error: 0.036822301652457395
Test error: 0.03982400265972862
Number of epochs with lowest validation: 505
Train error: 0.036824315634508165
Test error: 0.039825740746553324
Number of epochs with lowest validation: 505
Train error: 0.0368277900017875
Test error: 0.0398290924237957
Number of epochs with lowest validation: 505
Train error: 0.03682773392382097
Test error: 0.0398285159127866
Number of epochs with lowest validation: 505
Trai

Number of epochs with lowest validation: 608
Train error: 0.036859119114399326
Test error: 0.039822858671596094
Number of epochs with lowest validation: 608
Train error: 0.03685411344663138
Test error: 0.03981695157307831
Number of epochs with lowest validation: 608
Train error: 0.03685862679768508
Test error: 0.03982171181550433
Number of epochs with lowest validation: 608
Train error: 0.036860412452010986
Test error: 0.03982340699594614
Number of epochs with lowest validation: 608
Train error: 0.036855874259758656
Test error: 0.03981801701776078
Number of epochs with lowest validation: 608
Train error: 0.036858947699997494
Test error: 0.03982120774105706
Number of epochs with lowest validation: 608
Train error: 0.036861725198535734
Test error: 0.039823977336151305
Number of epochs with lowest validation: 608
Train error: 0.03686164420593627
Test error: 0.03982361280282341
Number of epochs with lowest validation: 608
Train error: 0.036865120426366346
Test error: 0.039827259440531844
N

Train error: 0.036882652328192454
Test error: 0.03982078247576056
Number of epochs with lowest validation: 608
Train error: 0.03688219101895815
Test error: 0.039820134112187586
Number of epochs with lowest validation: 608
Train error: 0.0368759093232553
Test error: 0.03981301725414182
Number of epochs with lowest validation: 608
Train error: 0.0368747990314663
Test error: 0.03981162246096399
Number of epochs with lowest validation: 608
Train error: 0.03687385778396861
Test error: 0.039810436137520305
Number of epochs with lowest validation: 608
Train error: 0.03687579242421519
Test error: 0.039812441142920874
Number of epochs with lowest validation: 608
Train error: 0.03687713602456399
Test error: 0.03981378587651694
Number of epochs with lowest validation: 608
Train error: 0.03687998517629043
Test error: 0.03981684884596276
Number of epochs with lowest validation: 608
Train error: 0.03687927336222319
Test error: 0.03981590179117579
Number of epochs with lowest validation: 608
Train er

Train error: 0.03689418801177271
Test error: 0.03981634019901147
Number of epochs with lowest validation: 608
Train error: 0.036901970790070765
Test error: 0.039825017219330155
Number of epochs with lowest validation: 608
Train error: 0.036894801378426385
Test error: 0.03981692889156358
Number of epochs with lowest validation: 608
Train error: 0.03689721022760205
Test error: 0.039819600310220644
Number of epochs with lowest validation: 608
Train error: 0.03689860957986047
Test error: 0.03982111471500753
Number of epochs with lowest validation: 608
Train error: 0.03689534101457652
Test error: 0.039817398072708475
Number of epochs with lowest validation: 608
Train error: 0.03689606047709582
Test error: 0.03981816580415139
Number of epochs with lowest validation: 608
Train error: 0.03689472696347205
Test error: 0.03981663392644526
Number of epochs with lowest validation: 608
Train error: 0.03689298242693998
Test error: 0.03981465052852446
Number of epochs with lowest validation: 608
Train

In [16]:
prediction = res.predict(test_feat)
test_ranking.reshape(100,1)
show_correlation(prediction, test_ranking)

<IPython.core.display.Javascript object>

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 100) + inhomogeneous part.

#### Removing GRE scores and then testing the above code 

In [8]:
df = df.drop("GRE Score", axis=1)
Xs = df.to_numpy() 
ys = chance.to_numpy()

train_feat,test_feat,train_ranking,test_ranking=skl_model.train_test_split(Xs, ys, test_size = 0.20)
res = Kfold(k=5, Xs=train_feat, ys=train_ranking, epochs=1000)

Number of epochs with lowest validation: 1
Train error: 0.4835221771125241
Test error: 0.4525505259145335
Number of epochs with lowest validation: 2
Train error: 0.46255539257106704
Test error: 0.4325639090444405
Number of epochs with lowest validation: 3
Train error: 0.4425653730033964
Test error: 0.4135058189987795
Number of epochs with lowest validation: 4
Train error: 0.42349908959467614
Test error: 0.3953250499390189
Number of epochs with lowest validation: 5
Train error: 0.40537540927575777
Test error: 0.3780518204551172
Number of epochs with lowest validation: 6
Train error: 0.38806979064720487
Test error: 0.36155138271663245
Number of epochs with lowest validation: 7
Train error: 0.37158808006011257
Test error: 0.3458385421589899
Number of epochs with lowest validation: 8
Train error: 0.3558910081406623
Test error: 0.3308778119105942
Number of epochs with lowest validation: 9
Train error: 0.34090384060236795
Test error: 0.3165904722005156
Number of epochs with lowest validation

Train error: 0.038938792924605045
Test error: 0.033786675530240076
Number of epochs with lowest validation: 122
Train error: 0.03887667620885575
Test error: 0.03375439128252488
Number of epochs with lowest validation: 123
Train error: 0.038816731292524005
Test error: 0.03372381380510896
Number of epochs with lowest validation: 124
Train error: 0.03875895636433746
Test error: 0.03369492011963843
Number of epochs with lowest validation: 125
Train error: 0.038704078401373215
Test error: 0.033668385554511265
Number of epochs with lowest validation: 126
Train error: 0.03864989422698832
Test error: 0.033642251851990754
Number of epochs with lowest validation: 127
Train error: 0.03860930302036817
Test error: 0.03362742251168358
Number of epochs with lowest validation: 128
Train error: 0.03855865462089438
Test error: 0.03360395480959421
Number of epochs with lowest validation: 129
Train error: 0.038515489706807744
Test error: 0.0335865954981124
Number of epochs with lowest validation: 130
Trai

Number of epochs with lowest validation: 198
Train error: 0.03777257252681281
Test error: 0.03409724843361893
Number of epochs with lowest validation: 198
Train error: 0.037777544086875746
Test error: 0.03410375690680939
Number of epochs with lowest validation: 198
Train error: 0.03778281630380294
Test error: 0.034110491781983276
Number of epochs with lowest validation: 198
Train error: 0.037787305222967624
Test error: 0.03411651585143969
Number of epochs with lowest validation: 198
Train error: 0.03778848721193901
Test error: 0.03411979843046999
Number of epochs with lowest validation: 198
Train error: 0.037793079836446114
Test error: 0.03412587319668879
Number of epochs with lowest validation: 198
Train error: 0.03780003620708763
Test error: 0.034133878554496254
Number of epochs with lowest validation: 198
Train error: 0.03780292954048157
Test error: 0.03413850737038144
Number of epochs with lowest validation: 198
Train error: 0.037804390034033714
Test error: 0.03414190365317423
Numb

Number of epochs with lowest validation: 198
Train error: 0.03794005584255375
Test error: 0.03442106174961394
Number of epochs with lowest validation: 198
Train error: 0.03793967099851939
Test error: 0.03442123314042731
Number of epochs with lowest validation: 198
Train error: 0.03794112839592105
Test error: 0.03442289864406169
Number of epochs with lowest validation: 198
Train error: 0.03794175104300549
Test error: 0.03442382766749133
Number of epochs with lowest validation: 198
Train error: 0.03794396174981214
Test error: 0.03442610384502753
Number of epochs with lowest validation: 198
Train error: 0.037943758775764364
Test error: 0.03442639737315657
Number of epochs with lowest validation: 198
Train error: 0.03794217237929964
Test error: 0.03442557591081184
Number of epochs with lowest validation: 198
Train error: 0.037944725509685
Test error: 0.034428118600219174
Number of epochs with lowest validation: 198
Train error: 0.03794491644180836
Test error: 0.0344287008887991
Number of e

Train error: 0.03798473130908208
Test error: 0.03449508353820863
Number of epochs with lowest validation: 198
Train error: 0.037985595192964315
Test error: 0.03449605688927575
Number of epochs with lowest validation: 198
Train error: 0.03798339617926138
Test error: 0.034494473964945864
Number of epochs with lowest validation: 198
Train error: 0.037984870506371345
Test error: 0.03449594036112189
Number of epochs with lowest validation: 198
Train error: 0.03798811154951777
Test error: 0.0344988736045818
Number of epochs with lowest validation: 198
Train error: 0.03799222902205856
Test error: 0.03450252669111387
Number of epochs with lowest validation: 198
Train error: 0.03798960588735893
Test error: 0.03450062793647911
Number of epochs with lowest validation: 198
Train error: 0.03799267861771414
Test error: 0.03450335235939889
Number of epochs with lowest validation: 198
Train error: 0.037989253685085704
Test error: 0.034500807412145125
Number of epochs with lowest validation: 198
Train 

Number of epochs with lowest validation: 198
Train error: 0.03801191416662724
Test error: 0.03453721506186944
Number of epochs with lowest validation: 198
Train error: 0.03801156077312763
Test error: 0.034537068300795615
Number of epochs with lowest validation: 198
Train error: 0.038009783043120166
Test error: 0.03453570346753561
Number of epochs with lowest validation: 198
Train error: 0.03801204865593635
Test error: 0.03453769741629968
Number of epochs with lowest validation: 198
Train error: 0.03801326598607331
Test error: 0.03453882676843266
Number of epochs with lowest validation: 198
Train error: 0.03801330924332531
Test error: 0.03453902898208512
Number of epochs with lowest validation: 198
Train error: 0.038016128403134654
Test error: 0.03454149352345509
Number of epochs with lowest validation: 198
Train error: 0.03802088254276267
Test error: 0.03454552099281001
Number of epochs with lowest validation: 198
Train error: 0.038022127971198035
Test error: 0.03454667377282079
Number

Number of epochs with lowest validation: 198
Train error: 0.03802750441530325
Test error: 0.034565975635806254
Number of epochs with lowest validation: 198
Train error: 0.03802715890520695
Test error: 0.03456574761352359
Number of epochs with lowest validation: 198
Train error: 0.038022009834698146
Test error: 0.03456157150758358
Number of epochs with lowest validation: 198
Train error: 0.03802320615087479
Test error: 0.03456254650111572
Number of epochs with lowest validation: 198
Train error: 0.038019694317071896
Test error: 0.03455973120495381
Number of epochs with lowest validation: 198
Train error: 0.03802332414046048
Test error: 0.03456272164837831
Number of epochs with lowest validation: 198
Train error: 0.0380212759407774
Test error: 0.034561107231978125
Number of epochs with lowest validation: 198
Train error: 0.03802043789823299
Test error: 0.03456051025763811
Number of epochs with lowest validation: 198
Train error: 0.038020750782153924
Test error: 0.03456082381962365
Number

Number of epochs with lowest validation: 198
Train error: 0.03802293662659597
Test error: 0.03456801915690489
Number of epochs with lowest validation: 198
Train error: 0.03802206832847692
Test error: 0.03456738812017012
Number of epochs with lowest validation: 198
Train error: 0.03802490755072545
Test error: 0.034569763677092656
Number of epochs with lowest validation: 198
Train error: 0.03802916757668482
Test error: 0.03457329699366623
Number of epochs with lowest validation: 198
Train error: 0.038028848206695456
Test error: 0.03457307780162258
Number of epochs with lowest validation: 198
Train error: 0.0380265802649161
Test error: 0.03457121360929822
Number of epochs with lowest validation: 198
Train error: 0.03802336525910101
Test error: 0.03456861242847488
Number of epochs with lowest validation: 198
Train error: 0.03802446351362123
Test error: 0.034569584956161775
Number of epochs with lowest validation: 198
Train error: 0.03802724108070836
Test error: 0.034571885106819154
Number 

In [9]:
prediction = res.predict(test_feat)
show_correlation(prediction, test_ranking)

<IPython.core.display.Javascript object>

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 100) + inhomogeneous part.

#### Looking at the two graphs provided above, it is clear that removing GRE from the list of features does not affect how accurate the predicition is for the chance of admission. This implies that the weights associated with these features are not large enough to affect the true prediction. Thus, it can be assumed that GRE is not an important feature that needs to be taken into account to determine the chance of admission

## 2.Logistic regression using a simple perceptron.

### For whether your status is sufficient to make it off the Titanic, download the Titanic data set from Files.

#### a) Process the dataset. What changes from problem 1? Filter out data with missing features. Use one hot encoders to transform the categorical features and the output survival status. 


#### In this data set, there are some missing features, as noted in the question. Additionally, there are categorical values which were not present in the previous question. I removed some of the features such as the name, ticket, cabin, and passenger id information, which should ideally not affected the prediction in the first place. 

In [11]:
### Read in the titanic dataset ###
titanic=pd.read_csv('titantic.csv')
### Filter out data with missing values 
titanic=titanic.dropna()
### remove name, ticket, cabin, and passenger id
titanic=titanic.drop(['Name','Ticket','Cabin','PassengerId'],axis=1)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,female,38.0,1,0,71.2833,C
3,1,1,female,35.0,1,0,53.1,S
6,0,1,male,54.0,0,0,51.8625,S
10,1,3,female,4.0,1,1,16.7,S
11,1,1,female,58.0,0,0,26.55,S


In [13]:
## Encode the categorical data Pclass, sex, sibSp, parch, embarked
categorical_feats = titanic[['Pclass','Sex', 'SibSp', 'Parch', 'Embarked']]
continuous_feats = titanic[['Age', 'Fare']]
encoder = OneHotEncoder() 
encoder.fit(categorical_feats)
categorical_feats = encoder.transform(categorical_feats).to_numpy()

survived = np.array(titanic['Survived']).reshape(-1,1)
res_encoder = OneHotEncoder()
survived = res_encoder.fit_transform(survived)
survived = res_encoder.inverse_transform(survived).to_numpy()

### Stack together 
feats = np.hstack((continuous_feats, categorical_feats)) #[  ] [] = [   ]

####  b) Use the simple perceptome model we developed in Q1. Use 80% of the data for training and 20% of the data for testing and do 5-fold validation. Can we predict who will survive? Play around with the features to determine which ones give you a better chance to get back to shore.

In [14]:
train_feat,test_feat,train_ranking,test_ranking=skl_model.train_test_split(feats, survived, test_size = 0.20)
res = Kfold(k=5, Xs=train_feat, ys=train_ranking, epochs=1000)

In [15]:
prediction = res.predict(test_feat)
show_correlation(prediction, test_ranking)

## 3. Nonlinear regression using a simple perceptron and a simple ANN.

### Let’s try to fit to a simple sine function y = 3 sin(x) + 5 with the simple perceptron and a simple fully connected network. 

#### a) Use the generate_data() function provided in the reference file to generate the training data (5000 points). Do 5-fold cross validation with the simple perceptron model. How well is the prediction? Generate 1000 new points as your test data, and use the show_correlation() function to see how well your model agree with the test data

In [None]:
def generate_X(number):
    xs=(np.random.random(number)*2-1)*10
    return xs
    
def generate_data(number,stochascity=0.05):
    xs=generate_X(number)
    fs=3*np.sin(xs)-5
    stochastic_ratio=(np.random.random(number)*2-1)*stochascity+1
    return xs,fs*stochastic_ratio

In [None]:
x,y=generate_data(5000,0.1)
x = x.reshape(-1,1)
y = y.reshape(-1,1)
plt.scatter(x,y,s=0.1)

In [None]:
## Test using the simple perceptron of 5 fold cross validation 
train_feat,test_feat,train_ranking,test_ranking=skl_model.train_test_split(x, y, test_size = 0.20)
res = Kfold(k=5, Xs=x, ys=y, epochs=100)

In [None]:
## Generate another 1000 points 
x_new, y_new = generate_data(1000)
prediction = res.predict(x_new)

In [None]:
## use show_correlation() function 
show_correlation(prediction, y_new)

#### b) Use the multilayer perceptron regressor of scikit-learn as a simple fully connected neural network. Use one hidden layer with 8 neurons. Do 5-fold cross validation with this simple ANN, and report the MSE on each fold. Visualize the correlation of your model prediction and the true test data.  Is the result better than a simple perceptron?

In [None]:
### Use mlp regressor to fit the data
from sklearn.neural_network import MLPRegressor

def KFold_NN(k,Xs,ys,hidden_layers,epochs=1000,lr=0.001,):
    # The total number of examples for training the network
    total_num=len(Xs)

    # Built in K-fold function in Sci-Kit Learn
    kf=KFold(n_splits=k,shuffle=True)
    train_error_all=[]
    test_error_all=[]
    for train_selector,test_selector in kf.split(range(total_num)):
        # Decide training examples and testing examples for this fold
        train_Xs=Xs[train_selector]
        test_Xs=Xs[test_selector]
        train_ys=ys[train_selector]
        test_ys=ys[test_selector]
        
        # Establish the model here
        model = MLPRegressor(max_iter=epochs, activation='tanh', early_stopping=True, 
                             validation_fraction=0.25, learning_rate='constant', learning_rate_init=lr,
                             hidden_layer_sizes=hidden_layers).fit(train_Xs, train_ys)
        
        ### Report result for this fold ##
        train_error=model.score(train_Xs,train_ys)
        train_error_all.append(train_error)
        test_error=model.score(test_Xs,test_ys)
        test_error_all.append(test_error)
        print("Train error:",train_error)
        print("Test error:",test_error)

    print("Final results:")
    print("Training error:%f+-%f"%(np.average(train_error_all),np.std(train_error_all)))
    print("Testing error:%f+-%f"%(np.average(test_error_all),np.std(test_error_all)))
    
    # return the last model
    return model

In [None]:
### Use KFold_NN to fit the data
model=KFold_NN(5,x,y,(8,),epochs=1000,lr=0.001)

#### c) Play with the architecture for the ANN, can you improve the performance by including additional hidden layers?

In [None]:
### Use KFold_NN to fit the data
model=KFold_NN(5,x,y,(8,10),epochs=1000,lr=0.001)