In [1]:
import pandas as pd
df = pd.read_csv('exams.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group D,some college,standard,completed,59,70,78
1,male,group D,associate's degree,standard,none,96,93,87
2,female,group D,some college,free/reduced,none,57,76,77
3,male,group B,some college,free/reduced,none,70,70,63
4,female,group D,associate's degree,standard,none,83,85,86


In [2]:
df.shape

(1000, 8)

In [4]:
#to check for any null values
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [5]:
df.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,male,group C,some college,standard,none,77,77,71
996,male,group C,some college,standard,none,80,66,66
997,female,group A,high school,standard,completed,67,86,86
998,male,group E,high school,standard,none,80,72,62
999,male,group D,high school,standard,none,58,47,45


In [6]:
#we need to add an average column

df['avg score'] = df[['math score' , 'reading score' , 'writing score']].mean(axis=1)

In [7]:
df.describe()

Unnamed: 0,math score,reading score,writing score,avg score
count,1000.0,1000.0,1000.0,1000.0
mean,67.81,70.382,69.14,69.110667
std,15.250196,14.107413,15.025917,14.027856
min,15.0,25.0,15.0,20.0
25%,58.0,61.0,59.0,59.583333
50%,68.0,70.5,70.0,69.666667
75%,79.25,80.0,80.0,79.333333
max,100.0,100.0,100.0,100.0


In [8]:
#split the data into catorigical and numerical sections

catog = df.drop(['math score','reading score','writing score','avg score'],axis=1)
numer = df[['math score','reading score','writing score','avg score']]

In [9]:
catog

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
0,female,group D,some college,standard,completed
1,male,group D,associate's degree,standard,none
2,female,group D,some college,free/reduced,none
3,male,group B,some college,free/reduced,none
4,female,group D,associate's degree,standard,none
...,...,...,...,...,...
995,male,group C,some college,standard,none
996,male,group C,some college,standard,none
997,female,group A,high school,standard,completed
998,male,group E,high school,standard,none


In [10]:
numer

Unnamed: 0,math score,reading score,writing score,avg score
0,59,70,78,69.000000
1,96,93,87,92.000000
2,57,76,77,70.000000
3,70,70,63,67.666667
4,83,85,86,84.666667
...,...,...,...,...
995,77,77,71,75.000000
996,80,66,66,70.666667
997,67,86,86,79.666667
998,80,72,62,71.333333


In [11]:
#convert categorical values into numbers using lambda
df1 = catog.apply(lambda x: pd.factorize(x)[0])
df1

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
0,0,0,0,0,0
1,1,0,1,0,1
2,0,0,0,1,1
3,1,1,0,1,1
4,0,0,1,0,1
...,...,...,...,...,...
995,1,2,0,0,1
996,1,2,0,0,1
997,0,4,5,0,0
998,1,3,5,0,1


In [12]:
data = pd.concat([df1,numer],axis=1,ignore_index=True)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,0,59,70,78,69.000000
1,1,0,1,0,1,96,93,87,92.000000
2,0,0,0,1,1,57,76,77,70.000000
3,1,1,0,1,1,70,70,63,67.666667
4,0,0,1,0,1,83,85,86,84.666667
...,...,...,...,...,...,...,...,...,...
995,1,2,0,0,1,77,77,71,75.000000
996,1,2,0,0,1,80,66,66,70.666667
997,0,4,5,0,0,67,86,86,79.666667
998,1,3,5,0,1,80,72,62,71.333333


In [13]:
col_name = {0:'gender',1:'race',2:'parent education',3:"Lunch",4:"prepration tests",5:"math",6:"reading",7:'writing',8:'avg'}
data = data.rename(columns=col_name)
data

Unnamed: 0,gender,race,parent education,Lunch,prepration tests,math,reading,writing,avg
0,0,0,0,0,0,59,70,78,69.000000
1,1,0,1,0,1,96,93,87,92.000000
2,0,0,0,1,1,57,76,77,70.000000
3,1,1,0,1,1,70,70,63,67.666667
4,0,0,1,0,1,83,85,86,84.666667
...,...,...,...,...,...,...,...,...,...
995,1,2,0,0,1,77,77,71,75.000000
996,1,2,0,0,1,80,66,66,70.666667
997,0,4,5,0,0,67,86,86,79.666667
998,1,3,5,0,1,80,72,62,71.333333


In [14]:
X = data.drop(['avg'],axis=1)
y = data['avg']

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

x_train , x_test , y_train , y_test = train_test_split(X,y,test_size=0.2 , random_state=42)


In [16]:
lr = LinearRegression()
model = lr.fit(x_train,y_train)

In [18]:
predict = model.predict(x_test)

mse = mean_squared_error(predict,y_test)
mse

1.2381960768349002e-28

In [19]:
r2_score(predict,y_test)

1.0