# Step 0: Import the necessary libraries

In [22]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import linear_model
import numpy as np
%matplotlib inline

# Step 1: Import Data

In [6]:
# Load the student performance dataset from the notebook's working directory.
DATA_PATH = "student-por.csv"
try:
    file = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Stop here with a clear message. Merely printing and carrying on would
    # leave `file` undefined and fail with a confusing NameError just below.
    raise FileNotFoundError(
        f"File Not Found: {DATA_PATH!r} (expected in the current working directory)"
    ) from err

# Show the available column names, then preview the first five rows.
print(file.columns)
file.head(5)

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


# Step 2: Clean the Data

In [7]:
# Columns that are not used as model inputs; everything else is kept.
unused_columns = [
    'school', 'reason', 'traveltime', 'schoolsup', 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]
# drop() returns a new frame, so the raw `file` frame is left untouched.
n_file = file.drop(columns=unused_columns)
n_file.head()

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,guardian,studytime,failures,G1,G2,G3
0,F,18,U,GT3,A,4,4,at_home,teacher,mother,2,0,0,11,11
1,F,17,U,GT3,T,1,1,at_home,other,father,2,0,9,11,11
2,F,15,U,LE3,T,1,1,at_home,other,mother,2,0,12,13,12
3,F,15,U,GT3,T,4,2,health,services,mother,3,0,14,14,14
4,F,16,U,GT3,T,3,3,other,other,father,2,0,11,13,13


In [8]:
# Text-valued columns that must become numeric before they can be fed to the model.
categorical_columns = ('sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'guardian')

for name in categorical_columns:
    # factorize() returns (codes, uniques); only the integer codes are kept.
    # NOTE(review): integer codes put an arbitrary order on multi-valued nominal
    # columns such as Mjob/Fjob/guardian - consider one-hot encoding for a linear model.
    codes = pd.factorize(n_file[name])[0]
    n_file[name] = codes

n_file.head()

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,guardian,studytime,failures,G1,G2,G3
0,0,18,0,0,0,4,4,0,0,0,2,0,0,11,11
1,0,17,0,0,1,1,1,0,1,1,2,0,9,11,11
2,0,15,0,1,1,1,1,0,1,0,2,0,12,13,12
3,0,15,0,0,1,4,2,1,2,0,3,0,14,14,14
4,0,16,0,0,1,3,3,2,1,1,2,0,11,13,13


# Step 3: Split the Data into Training and Testing Sets

In [9]:
# Predictor columns, in the order they appear in the feature matrix.
feature_columns = [
    'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'guardian', 'studytime', 'failures', 'G1', 'G2',
]
# Column the model is trained to predict.
target_columns = ["G3"]

X = np.asanyarray(n_file[feature_columns])
# Selecting with a list keeps y two-dimensional: one row per student, one column.
y = np.asanyarray(n_file[target_columns])

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=38,
)

# Report the resulting array shapes as a quick sanity check on the split.
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Train set: (519, 14) (519, 1)
Test set: (130, 14) (130, 1)


# Step 4: Create a Model and Step 5: Train the Model

In [26]:
# Build the linear regression estimator and fit it on the training split in one
# expression; fit() returns the fitted estimator itself.
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Learned weights of the fitted linear model.
print('Coefficients: ', regr.coef_)

Coefficients:  [[-0.28799389  0.05005622 -0.1235316   0.04362577 -0.16709091 -0.01649631
   0.02778212  0.02609701  0.06400286  0.0234237   0.08484154 -0.29821521
   0.13893872  0.86268533]]


# Step 6: Make Predictions

In [30]:
# Predict the target for the held-out test rows using the fitted model.
y_hat = regr.predict(X_test)

# Step 7: Evaluate and Improve

In [44]:
# Guard against silent NumPy broadcasting: subtracting an (n,) array from an
# (n, 1) array would yield an (n, n) matrix and a meaningless error value.
assert y_hat.shape == y_test.shape, (y_hat.shape, y_test.shape)

# np.mean of the squared residuals is the mean squared error (MSE), not a
# residual *sum* of squares, so the label says what is actually computed.
mse = np.mean((y_hat - y_test) ** 2)
print("Mean squared error: %.2f" % mse)

# LinearRegression.score returns the coefficient of determination R^2 on the
# given data (1.0 is a perfect prediction); it is not the explained variance score.
print('R^2 score: %.2f' % regr.score(X_test, y_test))

Residual sum of squares: 1.66
Variance score: 0.86
