# **MACHINE LEARNING: LINEAR REGRESSION**

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

**`FEATURE ENGINEERING AND EXPLORATORY DATA ANALYSIS (EDA)`**

In [None]:
# Read student_por.csv into a DataFrame and preview the first rows.
# The folder defaults to the original lab location, but it can be redirected by setting
# the STUDENT_DATA_DIR environment variable so the notebook is not tied to one machine.
DATA_DIR = Path(os.environ.get("STUDENT_DATA_DIR", r"C:\Users\isaac\aidi1002\LAB3"))
stu_por = pd.read_csv(DATA_DIR / "student_por.csv")
stu_por.head()

In [None]:
# (rows, columns) of the raw frame, as a quick size check after loading.
stu_por.shape

In [None]:
# Preprocessing
# Encode the listed string-valued columns with get_dummies: each category of each
# column becomes its own indicator column named "<column>_<category>".
por_categorical_cols = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic',
]
dummied_stupor = pd.get_dummies(stu_por, columns=por_categorical_cols)
dummied_stupor

In [None]:
# Preprocessing
# Rescale every non-indicator column into the [0, 1] interval with MinMaxScaler.
# NOTE(review): 'G3' (the regression target) is rescaled too, and the scaler is fitted
# on the whole frame before the train/test split further down, so the MAE/RMSE printed
# later are in scaled units rather than in grade points.
from sklearn.preprocessing import MinMaxScaler

num_vars = [
    'age', 'Medu', 'G1', 'G2', 'G3', 'Fedu', 'traveltime', 'studytime',
    'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]
scaler = MinMaxScaler()
por_scaled_values = scaler.fit_transform(dummied_stupor[num_vars])
dummied_stupor[num_vars] = por_scaled_values
dummied_stupor

In [None]:
# Reorder the columns so the three grade columns come first, which makes their
# correlations easy to spot; the other numeric columns and the indicator columns follow.
por_grade_cols = ['G3', 'G1', 'G2']
por_numeric_cols = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
                    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

# Indicator columns are named "<prefix>_<level>"; list the levels once per prefix.
# Dict insertion order fixes the final column order.
por_job_levels = ['at_home', 'health', 'other', 'services', 'teacher']
por_no_yes = ['no', 'yes']
por_indicator_levels = {
    'school': ['GP', 'MS'],
    'sex': ['F', 'M'],
    'address': ['R', 'U'],
    'famsize': ['GT3', 'LE3'],
    'Pstatus': ['A', 'T'],
    'Mjob': por_job_levels,
    'Fjob': por_job_levels,
    'reason': ['course', 'home', 'other', 'reputation'],
    'guardian': ['father', 'mother', 'other'],
    'schoolsup': por_no_yes,
    'famsup': por_no_yes,
    'paid': por_no_yes,
    'activities': por_no_yes,
    'nursery': por_no_yes,
    'higher': por_no_yes,
    'internet': por_no_yes,
    'romantic': por_no_yes,
}
por_indicator_cols = [
    f'{prefix}_{level}'
    for prefix, levels in por_indicator_levels.items()
    for level in levels
]

dummied_stupor = dummied_stupor[por_grade_cols + por_numeric_cols + por_indicator_cols]

dummied_stupor

In [None]:
# Annotated seaborn heatmap of the pairwise column correlations, used to see which
# features are most related to the target variable.
por_corr = dummied_stupor.corr()
fig, ax = plt.subplots(figsize=(30, 25))
sb.heatmap(por_corr, annot=True, cmap='YlGnBu', ax=ax)
plt.show()

In [None]:
# Feature selection.
# The author kept the features whose correlation with the final grade (G3) on the
# heatmap was greater than 0.1. These columns form the feature matrix X;
# the target y is the final grade G3.
por_selected_features = [
    'G1', 'G2', 'Medu', 'Fedu', 'studytime', 'school_GP', 'address_U', 'Mjob_teacher',
    'Fjob_teacher', 'reason_reputation', 'higher_yes', 'internet_yes',
]
X = dummied_stupor[por_selected_features]
y = dummied_stupor['G3']

print(X)
print(y)

**`TRAIN/TEST & MODELLING`**

In [None]:
# Split into 70% training rows and 30% test rows; random_state is fixed so the
# split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Fit a multiple linear regression (LinearRegression) on the training rows,
# then predict the target for the held-out test rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

**`TESTING ACCURACY`**

In [None]:
# Evaluate the fitted model on the held-out test rows.
# MAE and RMSE are expressed in the units of y as it was passed to the model.
# R^2 is the proportion of the variance in the dependent variable (the target) that is
# predictable from the independent variables (the features). The author reports a value
# above 0.8 for this run, i.e. more than 80% of the variance in the final grade is
# explained by the model -- this is not the same as 80% of the values being predicted
# correctly.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Root Mean Squared Error:', rmse)
print('R-squared (R2):', r2)

In [None]:
# Read student_mat.csv into a DataFrame and preview the first rows.
# The folder defaults to the original lab location, but it can be redirected by setting
# the STUDENT_DATA_DIR environment variable so the notebook is not tied to one machine.
DATA_DIR = Path(os.environ.get("STUDENT_DATA_DIR", r"C:\Users\isaac\aidi1002\LAB3"))
stu_mat = pd.read_csv(DATA_DIR / "student_mat.csv")
stu_mat.head()

In [None]:
# (rows, columns) of the raw frame, as a quick size check after loading.
stu_mat.shape

In [None]:

# Preprocessing
# Encode the listed string-valued columns with get_dummies: each category of each
# column becomes its own indicator column named "<column>_<category>".
mat_categorical_cols = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic',
]
dummied_stumat = pd.get_dummies(stu_mat, columns=mat_categorical_cols)
dummied_stumat

In [None]:
# Preprocessing
# Rescale every non-indicator column into the [0, 1] interval with MinMaxScaler.
# NOTE(review): 'G3' (the regression target) is rescaled too, and the scaler is fitted
# on the whole frame before the train/test split further down, so the MAE/RMSE printed
# later are in scaled units rather than in grade points.
from sklearn.preprocessing import MinMaxScaler

num_vars = [
    'age', 'Medu', 'G1', 'G2', 'G3', 'Fedu', 'traveltime', 'studytime',
    'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]
scaler = MinMaxScaler()
mat_scaled_values = scaler.fit_transform(dummied_stumat[num_vars])
dummied_stumat[num_vars] = mat_scaled_values
dummied_stumat

In [None]:
# Reorder the columns so the three grade columns come first, which makes their
# correlations easy to spot; the other numeric columns and the indicator columns follow.
mat_grade_cols = ['G3', 'G1', 'G2']
mat_numeric_cols = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
                    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

# Indicator columns are named "<prefix>_<level>"; list the levels once per prefix.
# Dict insertion order fixes the final column order.
mat_job_levels = ['at_home', 'health', 'other', 'services', 'teacher']
mat_no_yes = ['no', 'yes']
mat_indicator_levels = {
    'school': ['GP', 'MS'],
    'sex': ['F', 'M'],
    'address': ['R', 'U'],
    'famsize': ['GT3', 'LE3'],
    'Pstatus': ['A', 'T'],
    'Mjob': mat_job_levels,
    'Fjob': mat_job_levels,
    'reason': ['course', 'home', 'other', 'reputation'],
    'guardian': ['father', 'mother', 'other'],
    'schoolsup': mat_no_yes,
    'famsup': mat_no_yes,
    'paid': mat_no_yes,
    'activities': mat_no_yes,
    'nursery': mat_no_yes,
    'higher': mat_no_yes,
    'internet': mat_no_yes,
    'romantic': mat_no_yes,
}
mat_indicator_cols = [
    f'{prefix}_{level}'
    for prefix, levels in mat_indicator_levels.items()
    for level in levels
]

dummied_stumat = dummied_stumat[mat_grade_cols + mat_numeric_cols + mat_indicator_cols]

dummied_stumat

In [None]:
# Annotated seaborn heatmap of the pairwise column correlations, used to see which
# features are most related to the target variable.
mat_corr = dummied_stumat.corr()
fig, ax = plt.subplots(figsize=(30, 25))
sb.heatmap(mat_corr, annot=True, cmap='YlGnBu', ax=ax)
plt.show()

In [None]:
# Feature selection.
# The author kept the features whose correlation with the final grade (G3) on the
# heatmap was greater than 0.1. These columns form the feature matrix Xm;
# the target ym is the final grade G3.
# NOTE(review): this is exactly the feature list used for the student_por data earlier
# in the notebook -- confirm it also meets the 0.1 threshold on this dataset's heatmap.
mat_selected_features = [
    'G1', 'G2', 'Medu', 'Fedu', 'studytime', 'school_GP', 'address_U', 'Mjob_teacher',
    'Fjob_teacher', 'reason_reputation', 'higher_yes', 'internet_yes',
]
Xm = dummied_stumat[mat_selected_features]
ym = dummied_stumat['G3']

print(Xm)
print(ym)

In [None]:
# Split into 70% training rows and 30% test rows; random_state is fixed so the
# split is the same on every run.
from sklearn.model_selection import train_test_split

Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm,
    ym,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fit a multiple linear regression (LinearRegression) on the training rows,
# then predict the target for the held-out test rows.
# NOTE: this rebinds `model`, replacing the estimator fitted on the student_por data.
model = LinearRegression().fit(Xm_train, ym_train)
ym_pred = model.predict(Xm_test)

In [None]:
# Evaluate the fitted model on the held-out test rows.
# MAE and RMSE are expressed in the units of ym as it was passed to the model.
# R^2 is the proportion of the variance in the dependent variable (the target) that is
# predictable from the independent variables (the features); a value above 0.8 would mean
# more than 80% of the variance in the final grade is explained by the model, which is
# not the same as 80% of the values being predicted correctly.
# NOTE(review): the original commentary here repeated the student_por section word for
# word, including the "R^2 over 0.8" claim -- confirm it against this cell's output.
mae_m = mean_absolute_error(ym_test, ym_pred)
rmse_m = np.sqrt(mean_squared_error(ym_test, ym_pred))
r2_m = r2_score(ym_test, ym_pred)

print('Mean Absolute Error:', mae_m)
print('Root Mean Squared Error:', rmse_m)
print('R-squared (R2):', r2_m)