# Importing the required libraries

In [225]:
import numpy  as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing Dataset

In [226]:
# Parse the CSV using ';' as the field separator instead of the default ','.
dataset = pd.read_csv('student-mat.csv', sep=';')

# Pre-processing the data

### Analysing info about all the columns

In [227]:
# Print each column's name, non-null count and dtype for the freshly loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

### Checking type of information in the columns

In [228]:
# Preview the first five rows to see what kind of values each column holds.
dataset.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


### Checking for missing data

In [229]:
# Count missing entries per column (a column with 0 has no gaps).
dataset.isna().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

### No missing data found here

## Now, Encoding all the binary columns with Label Encoder

In [230]:
from sklearn.preprocessing import LabelEncoder

# Columns to convert from string labels to integer codes.
categ = [
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

# A single encoder is re-fitted on every column in turn, so after the loop
# `le` only remembers the classes of the last column in `categ`.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent - consider switching.
le = LabelEncoder()
for column in categ:
    dataset[column] = le.fit_transform(dataset[column])

## Checking for non-integer columns

In [231]:
# Re-inspect the dtypes after label encoding to spot columns that are still non-numeric.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    int32 
 1   sex         395 non-null    int32 
 2   age         395 non-null    int64 
 3   address     395 non-null    int32 
 4   famsize     395 non-null    int32 
 5   Pstatus     395 non-null    int32 
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    int32 
 16  famsup      395 non-null    int32 
 17  paid        395 non-null    int32 
 18  activities  395 non-null    int32 
 19  nursery     395 non-null    int32 
 20  higher    

### Dropping some features

In [232]:
# Remove the four columns that are still string-typed after label encoding.
# The frame is rebound instead of mutated with `inplace=True`, and
# `errors='ignore'` skips columns that are already gone, so this cell can be
# re-run without raising KeyError.
dataset = dataset.drop(
    columns=['Fjob', 'reason', 'Mjob', 'guardian'],
    errors='ignore',
)

In [233]:
# Confirm the remaining columns and their dtypes after the drop above.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      395 non-null    int32
 1   sex         395 non-null    int32
 2   age         395 non-null    int64
 3   address     395 non-null    int32
 4   famsize     395 non-null    int32
 5   Pstatus     395 non-null    int32
 6   Medu        395 non-null    int64
 7   Fedu        395 non-null    int64
 8   traveltime  395 non-null    int64
 9   studytime   395 non-null    int64
 10  failures    395 non-null    int64
 11  schoolsup   395 non-null    int32
 12  famsup      395 non-null    int32
 13  paid        395 non-null    int32
 14  activities  395 non-null    int32
 15  nursery     395 non-null    int32
 16  higher      395 non-null    int32
 17  internet    395 non-null    int32
 18  romantic    395 non-null    int32
 19  famrel      395 non-null    int64
 20  freetime    395 non-null    int6

## defining dependent and independent variables

In [234]:
# Pick the target by name rather than by position, so a change in column order
# cannot silently swap in a different target. G3 is currently the last column,
# so the arrays match what positional slicing produced.
target_column = 'G3'
X = dataset.drop(columns=[target_column]).values
y = dataset[target_column].values

## split into training and test data

In [235]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# 1. Random Forest Regression

## Training the model

In [236]:
from sklearn.ensemble import RandomForestRegressor

# Ten-tree forest with a fixed seed, fitted on the training split.
random_forest_reg = RandomForestRegressor(
    n_estimators=10,
    random_state=0,
)
random_forest_reg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

## predicting the results

In [237]:
# Predict on the held-out split. `y_pred` is overwritten by each later model's
# prediction cell, so evaluate it before moving on.
y_pred = random_forest_reg.predict(X_test)

## Evaluating Model Performance

In [238]:
from sklearn.metrics import r2_score

# R² of the random-forest predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.8661297845908633

# 2. Multiple Linear Regression

## Training the model

In [239]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings, fitted on every feature column.
# `fit` returns the estimator itself, so it can be chained.
mul_lin_reg = LinearRegression().fit(X_train, y_train)
mul_lin_reg

LinearRegression()

## Predicting the result

In [240]:
# Predict on the held-out split with the linear model (replaces the previous `y_pred`).
y_pred = mul_lin_reg.predict(X_test)

## Evaluating Model Performance

In [241]:
from sklearn.metrics import r2_score

# R² of the multiple-linear-regression predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.7354489748878934

# 3. Polynomial Regression

## Training Model

In [242]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into all polynomial terms up to degree 4, then
# fit a plain linear model on the expanded matrix.
# NOTE(review): with roughly 28 input features a degree-4 expansion yields on
# the order of 36k terms against only ~316 training rows; the negative R²
# reported for this model is consistent with severe overfitting. Consider a
# lower degree and/or a regularised regressor - confirm before changing.
poly_reg_features = PolynomialFeatures(degree=4)
X_poly = poly_reg_features.fit_transform(X_train)
poly_reg = LinearRegression().fit(X_poly, y_train)
poly_reg

LinearRegression()

## Predicting the result

In [243]:
# Apply the already-fitted polynomial expansion to the test rows (transform
# only, no re-fit), then predict with the polynomial model.
X_test_poly = poly_reg_features.transform(X_test)
y_pred = poly_reg.predict(X_test_poly)

## Evaluating Model Performance

In [244]:
from sklearn.metrics import r2_score

# R² of the polynomial-regression predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

-2.1680769138432177

# 4. Support Vector Regression

## Feature Scaling

## Training the model

## Predicting the result

## Evaluating Model Performance

# 5. Decision Tree Regression

## Training the model

In [245]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree with a fixed seed, fitted on the training split.
# `fit` returns the estimator itself, so it can be chained.
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
regressor

DecisionTreeRegressor(random_state=0)

## Predicting the result

In [246]:
# Predict on the held-out split with the decision tree (replaces the previous `y_pred`).
y_pred = regressor.predict(X_test)

## Evaluating Model Performance

In [247]:
from sklearn.metrics import r2_score

# R² of the decision-tree predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.7037245388275556