In [35]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')
import sklearn
from sklearn import linear_model
from sklearn.utils import shuffle

In [36]:
# Load the student dataset; the file uses ';' as its field separator.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that exact file exists — consider a configurable data directory.
df = pd.read_csv('/home/jeovine/Desktop/mlpracs/student-mat.csv', sep=";")

In [37]:
# Preview the first rows of the loaded table to check that it parsed correctly.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


### Trimming Our Data
* Since we have so many attributes and not all of them are relevant, we need to select the ones we want to use.
* To keep only the columns we're interested in, we index the DataFrame with a list of column names:



In [38]:
# Keep only the columns used in this notebook: the three grade columns
# (G1, G2, G3) plus studytime, failures and absences.
selected_columns = ["G1", "G2", "G3", "studytime", "failures", "absences"]
data = df[selected_columns]

In [39]:
# Preview the first five rows of the trimmed frame.
data.head(5)

Unnamed: 0,G1,G2,G3,studytime,failures,absences
0,5,6,6,2,0,6
1,5,5,6,2,0,4
2,7,8,10,2,3,10
3,15,14,15,3,0,2
4,6,10,10,2,0,4


#### `data` is a DataFrame that contains only the columns we are interested in.

## Separating Our Data
* Now that we've trimmed our data set down we need to separate it into 4 arrays.
* However, before we can do that we need to define what attribute we are trying to predict. 
     * This attribute is known as a label. 
     * The other attributes that will determine our label are known as features.
     * Once we've done this we will use numpy to create two arrays.
     * One that contains all of our features and one that contains our labels.

In [40]:
predict = "G3"  # name of the label column (the value we want to predict)

# Features: every selected column except the label.
# `columns=` replaces the positional axis argument (`drop([predict], 1)`),
# which is deprecated in pandas 1.x and no longer accepted by pandas 2.x.
X = np.array(data.drop(columns=[predict]))  # Features

# Labels: the single column named by `predict`, as a 1-D array.
y = np.array(data[predict])  # Labels


* We then need to split our data into testing and training data.
* We will use 90% of our data to train and the other 10% to test.
* The reason we do this is so that we do not test our model on data that it has already seen.

In [41]:
import sklearn.model_selection


In [42]:
# Hold out 10% of the rows for testing; the remaining 90% are used for training.
# No random_state is passed, so the split is not reproducible between runs.
split = sklearn.model_selection.train_test_split(X, y, test_size=0.1)
x_train, x_test, y_train, y_test = split

### Implementing the Algorithm

* Now that we understand how linear regression works, we can use it to predict students' final grades.
* We will start by defining the model which we will be using.

In [43]:
# Create the (not yet fitted) linear regression model.
linear = linear_model.LinearRegression()


* Next we will train and score our model using the arrays we created earlier.



In [44]:
# Fit on the training split, then evaluate on the held-out test split.
linear.fit(x_train, y_train)
acc = linear.score(x_test, y_test) # NOTE: for LinearRegression, score() returns R^2 (coefficient of determination), not a classification accuracy

##### To see how the model performed, we print `acc`

In [45]:
# Show the test-set score computed by linear.score() in the previous cell.
print(acc)

0.7894607799268462


In [46]:
# One fitted slope per feature column of X, in column order:
# G1, G2, studytime, failures, absences.
print('Coefficient: \n', linear.coef_) # Slope for each feature
# Fitted constant term of the regression line.
print('Intercept: \n', linear.intercept_) # This is the intercept

Coefficient: 
 [ 0.15014383  0.9522988  -0.14494891 -0.37870961  0.03014207]
Intercept: 
 -1.1051455437144373


##### Predicting on Specific Students
* Let's see how well our algorithm works on specific students.
* To do this, we are going to print out all of our test data.
* Beside this data we will print the actual final grade and our model's predicted grade.


In [47]:
predictions = linear.predict(x_test)  # one predicted G3 value per test row

# Print each prediction next to its feature row and the actual final grade.
for predicted, features, actual in zip(predictions, x_test, y_test):
    print(predicted, features, actual)

11.894552371212933 [12 12  2  0  2] 11
8.96815140604682 [10 10  3  2  8] 10
8.473708708977568 [ 8  8  1  0 30] 8
9.98995476271041 [12 10  2  0  2] 11
12.786567035414675 [12 13  2  0  0] 14
8.762881972684118 [8 9 1 0 8] 10
12.40007307336216 [14 12  1  0  4] 11
10.14179815707404 [10 10  2  0 17] 10
7.424497702142837 [8 8 2 0 0] 0
3.7436551199331225 [6 5 3 1 0] 0
-0.04894382577532008 [8 0 1 0 0] 0
15.146791049840791 [16 15  3  0  0] 15
-0.49418038787502616 [6 0 2 0 0] 0
8.38718635021798 [10  9  4  0  0] 0
7.311305280977596 [8 8 1 1 4] 8
18.659351992747688 [19 18  2  0  2] 18
6.592767177990614 [8 7 2 0 4] 6
12.377233987371586 [10 13  4  0  6] 13
11.329314051134105 [ 9 12  3  0  3] 11
12.696707347460313 [11 13  2  0  2] 14
9.4496635907444 [ 8 10  2  0  4] 9
6.062457354085986 [ 8  6  2  0 18] 7
16.15417907222963 [15 16  2  0  2] 16
19.84759243528508 [18 19  1  0 10] 19
7.424497702142837 [8 8 2 0 0] 0
12.696707347460313 [11 13  2  0  2] 13
12.369931003337399 [14 12  1  0  3] 12
2.852688348513

#### Saving Our Model
* To save our model we will write to a new file using pickle.dump().
* This lets us store the trained model in a file; the code below uses the `.pickle` extension (`.pkl` is another common choice).

In [48]:
# Serialize the fitted model to disk. `linear` is the model trained in the
# cells above, so those cells must have been run before this one.
with open("studentgrades.pickle", "wb") as model_file:
    pickle.dump(linear, model_file)

##### Loading Our Model
* Once we've saved our model we can load it in using the following two lines. 
* Now you can remove the code that creates and trains our model,
* as we are simply loading in an existing one from our pickle file.



In [49]:
# Load the model back from the pickle file saved earlier in this notebook.
# The context manager closes the file handle; a bare open() would leave it open.
# NOTE: pickle.load can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open("studentgrades.pickle", "rb") as pickle_in:
    linear = pickle.load(pickle_in)

# Now we can use linear to predict grades like before

In [57]:
# Echo the model currently bound to `linear` so that its repr is displayed.
linear

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

#### Training Multiple Models
* You may have noticed that our models vary in accuracy.
* This is because when we split the data into training and testing data it is divided differently each time.
* Since our model trains very quickly it may be worth training multiple models and saving the best one. 
* We can do this in the following way.
##### TRAIN MODEL MULTIPLE TIMES FOR BEST SCORE



In [55]:
# Train 20 models on different random splits and keep the best-scoring one on disk.
# score() returns R^2, which can be negative, so start below every possible
# score; starting at 0 would save nothing when no run scores above zero.
best = float("-inf")
for _ in range(20):
    # Fresh random 90/10 split for every run.
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1)

    linear = linear_model.LinearRegression()

    linear.fit(x_train, y_train)
    acc = linear.score(x_test, y_test)
    print("Accuracy: " + str(acc))
    # If the current model has a better score than one we've already trained then save it
    if acc > best:
        best = acc
        with open("studentgrades.pickle", "wb") as f:
            pickle.dump(linear, f)

# NOTE: after the loop `linear` is the LAST model trained, not necessarily the
# best one; load "studentgrades.pickle" to get the best-scoring model.

Accuracy: 0.7303012436466705
Accuracy: 0.8084353378531904
Accuracy: 0.7988347092514647
Accuracy: 0.8851251557095744
Accuracy: 0.8892708525054178
Accuracy: 0.8108603279682587
Accuracy: 0.756689908391868
Accuracy: 0.834072830924416
Accuracy: 0.7467393773912712
Accuracy: 0.8573567820435871
Accuracy: 0.8261230521478126
Accuracy: 0.8195471266497749
Accuracy: 0.8445538717849708
Accuracy: 0.7564041008735334
Accuracy: 0.8413264846983395
Accuracy: 0.8198179333270301
Accuracy: 0.8931055061593801
Accuracy: 0.7850485071211293
Accuracy: 0.8566384850760942
Accuracy: 0.7971115065650647
