In [54]:
import pandas as pd  # Allows us to present the csv file in a readable dataset
import numpy as np  # Used to create an array of attributes and labels to analyze
import sklearn  # This has all the machine learning functionalities
from sklearn import linear_model
from sklearn.utils import shuffle
import matplotlib.pyplot as pyplot
from matplotlib import style

# All of our attributes are seperated by semi-colon as appose to the standard commas
data = pd.read_csv('student-mat.csv', delimiter=';')

In [55]:
# Will print the first 5 elements of our dataset
data.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [56]:
# If i only want to consider first the attributes which are integers
#data = data[['age', 'Medu', 'Fedu','famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']]
data = data[['G1', 'G2', 'G3', 'studytime', 'failures', 'absences']]
data.head()

Unnamed: 0,G1,G2,G3,studytime,failures,absences
0,5,6,6,2,0,6
1,5,5,6,2,0,4
2,7,8,10,2,3,10
3,15,14,15,3,0,2
4,6,10,10,2,0,4


In [65]:
# We are trying to predict G3 which is the students final grade 
# Making G3 into a label as this is of interest so we remove it from the attributes during testing and training
predict = 'G3'

# Defining our attributes and labels
X = np.array(data.drop([predict], 1))  # New dataset without G3
y = np.array(data[predict])

# Splitting our data into 4 variables and using 10% dataset test size
best = 0
for _ in range(100):
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1)

    # Linear regression looks at a scatter of datapoints and attempts to draw a line of best fit
    linear = linear_model.LinearRegression()

    linear.fit(x_train, y_train)  # Fit the data to find a line of best fit 
    accuracy = linear.score(x_test, y_test)  # This is going to work out the accuracy using the test values
    
    if accuracy > best:
        best = accuracy
        
best

  X = np.array(data.drop([predict], 1))  # New dataset without G3


0.9503836832756356

In [58]:
print('Coefficient:\n', linear.coef_)
print('Intercept:\n', linear.intercept_)

Coefficient:
 [ 0.16047168  0.96286537 -0.18809908 -0.32232253  0.03673617]
Intercept:
 -1.3499381194546718


In [59]:
# Checking predictions using our 
# Using x_test datasets to find our y_test prediction
predictions = linear.predict(x_test)

for i in range(len(predictions)):
    print(predictions[i], x_test[i], y_test[i])

14.963447815540556 [14 15  2  0  0] 15
7.334032477677356 [8 8 2 0 2] 8
16.421828623073374 [15 16  1  0  4] 15
12.661518917585033 [14 13  4  0  0] 14
14.262153857492814 [14 14  1  0  2] 13
9.521334641439937 [ 8 10  1  0  4] 10
13.72217731056053 [13 13  2  0 23] 13
12.716773713949069 [12 13  2  0  0] 13
14.000582441603784 [14 14  2  0  0] 14
10.716764852086412 [11 11  1  1  6] 10
11.566920767900848 [12 11  1  0 16] 11
15.651752873498713 [16 15  2  0 10] 15
8.168170714402054 [10  9  4  0  0] 0
15.19739183324757 [15 15  2  0  2] 16
9.65417892011689 [10 10  2  0  4] 10
12.790246054797112 [12 13  2  0  2] 13
12.83720250039268 [13 12  1  0 20] 12
10.864515306923561 [12 11  2  0  2] 12
10.952088050790737 [10 11  1  0  8] 10
7.554449500221484 [8 8 2 0 8] 6
15.684070821001852 [16 15  3  0 16] 15
14.165744667783876 [16 14  4  0  6] 15
-0.7909336065648435 [7 0 3 0 0] 0
7.444556981661323 [8 9 1 3 0] 10
8.062953833907118 [7 9 2 0 0] 8
5.3624567871377815 [7 6 1 0 0] 0
11.057304931285671 [13 11  3  0 