In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics  import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import RandomizedSearchCV


## Import the data

In [2]:
df = pd.read_csv('stud.csv')

In [3]:
df.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

In [4]:
X = df.drop(columns=['math_score'],axis=1)
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [5]:
print("Categories in gender Variable: ",end=' ')
print(df['gender'].unique())

print("Categories in race_ethnicity variable: ",end=' ')
print(df['race_ethnicity'].unique())

print("Categories in parental_level_of_education ",end=" ")
print(df['parental_level_of_education'].unique())

print("categories in lunch : ",end=" ")
print(df['lunch'].unique())

print("categories in test_preparation_course :",end=' ')
print(df['test_preparation_course'].unique())


Categories in gender Variable:  ['female' 'male']
Categories in race_ethnicity variable:  ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in parental_level_of_education  ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
categories in lunch :  ['standard' 'free/reduced']
categories in test_preparation_course : ['none' 'completed']


In [6]:
y = df['math_score']
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

### seprate out the columns with there data types 

In [7]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns


In [8]:
num_features

Index(['reading_score', 'writing_score'], dtype='object')

In [9]:
cat_features

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')

In [10]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_Transformer = StandardScaler()
ob_transformer = OneHotEncoder()

preprocessor = ColumnTransformer([
    ("OneHotEncoder",ob_transformer,cat_features),
    ("StandardScaler",numeric_Transformer,num_features)
])

In [11]:
X = preprocessor.fit_transform(X)


In [12]:
X.shape

(1000, 19)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)



### Create an Evaluate Fucation to give all metrics after model Training

In [14]:
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return(mae,mse,rmse,r2_square)

In [16]:
models = {
    "Linear Regression": LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decsion Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    'AdaBoost Regressor':AdaBoostRegressor()
}

Model_list =[]
r2_list =[]
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    
    # Make Predications
    y_pred = model.predict(X_test)
    
    test_mae,test_mse,test_rmse,test_r2_square=evaluate_model(y_test,y_pred)
    
    
    print(list(models.keys())[i])
    print('Model performance for Test set')
    print("- Mean absolute Error: {:.4f}".format(test_mae))
    print("- Mean squared Error: {:.4f}".format(test_mse))
    print("- Root mean Squared Error: {:.4f}".format(test_rmse))
    print("- R2 score: {:.4f}".format(test_r2_square))
    print(30*"-")
    


Linear Regression
Model performance for Test set
- Mean absolute Error: 4.2169
- Mean squared Error: 29.1319
- Root mean Squared Error: 5.3974
- R2 score: 0.8803
------------------------------
Lasso
Model performance for Test set
- Mean absolute Error: 5.1579
- Mean squared Error: 42.5064
- Root mean Squared Error: 6.5197
- R2 score: 0.8253
------------------------------
Ridge
Model performance for Test set
- Mean absolute Error: 4.2111
- Mean squared Error: 29.0563
- Root mean Squared Error: 5.3904
- R2 score: 0.8806
------------------------------
K-Neighbors Regressor
Model performance for Test set
- Mean absolute Error: 5.6780
- Mean squared Error: 53.3776
- Root mean Squared Error: 7.3060
- R2 score: 0.7806
------------------------------
Decsion Tree
Model performance for Test set
- Mean absolute Error: 6.2000
- Mean squared Error: 61.8900
- Root mean Squared Error: 7.8670
- R2 score: 0.7457
------------------------------
Random Forest Regressor
Model performance for Test set
- Mea

In [17]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    
    # Make Predications
    y_pred = model.predict(X_test)
    
    test_mae,test_mse,test_rmse,test_r2_square=evaluate_model(y_test,y_pred)
    
    
    print(list(models.keys())[i])
    print('Model performance for Test set')
    print("- Mean absolute Error: {:.4f}".format(test_mae))
    print("- Mean squared Error: {:.4f}".format(test_mse))
    print("- Root mean Squared Error: {:.4f}".format(test_rmse))
    print("- R2 score: {:.4f}".format(test_r2_square))
    print(30*"-")
    

Linear Regression
Model performance for Test set
- Mean absolute Error: 4.2169
- Mean squared Error: 29.1319
- Root mean Squared Error: 5.3974
- R2 score: 0.8803
------------------------------
Lasso
Model performance for Test set
- Mean absolute Error: 5.1579
- Mean squared Error: 42.5064
- Root mean Squared Error: 6.5197
- R2 score: 0.8253
------------------------------
Ridge
Model performance for Test set
- Mean absolute Error: 4.2111
- Mean squared Error: 29.0563
- Root mean Squared Error: 5.3904
- R2 score: 0.8806
------------------------------
K-Neighbors Regressor
Model performance for Test set
- Mean absolute Error: 5.6780
- Mean squared Error: 53.3776
- Root mean Squared Error: 7.3060
- R2 score: 0.7806
------------------------------
Decsion Tree
Model performance for Test set
- Mean absolute Error: 6.1500
- Mean squared Error: 61.0000
- Root mean Squared Error: 7.8102
- R2 score: 0.7493
------------------------------
Random Forest Regressor
Model performance for Test set
- Mea

NameError: name 'y_train_pred' is not defined