In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder

# Read Data

In [2]:
df = pd.read_csv('test_scores.csv')
df

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...
2128,ZOWMK,Urban,Public,ZBH,Standard,30.0,T8LSK,Female,Does not qualify,39.0,55.0
2129,ZOWMK,Urban,Public,ZBH,Standard,30.0,VNP26,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,ZOWMK,Urban,Public,ZBH,Standard,30.0,YDR1Z,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,ZOWMK,Urban,Public,ZBH,Standard,30.0,YUEIH,Male,Qualifies for reduced/free lunch,46.0,53.0


In [3]:
df.isnull().sum()

school             0
school_setting     0
school_type        0
classroom          0
teaching_method    0
n_student          0
student_id         0
gender             0
lunch              0
pretest            0
posttest           0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,n_student,pretest,posttest
count,2133.0,2133.0,2133.0
mean,22.796531,54.955931,67.102203
std,4.228893,13.563101,13.986789
min,14.0,22.0,32.0
25%,20.0,44.0,56.0
50%,22.0,56.0,68.0
75%,27.0,65.0,77.0
max,31.0,93.0,100.0


In [5]:
cols_drop = ['student_id']
df = df.drop(cols_drop,axis=1)
df

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...
2128,ZOWMK,Urban,Public,ZBH,Standard,30.0,Female,Does not qualify,39.0,55.0
2129,ZOWMK,Urban,Public,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,ZOWMK,Urban,Public,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,ZOWMK,Urban,Public,ZBH,Standard,30.0,Male,Qualifies for reduced/free lunch,46.0,53.0


# Encoding Data (ex: School, teaching method, etc.)

In [6]:
df['school'].unique()

array(['ANKYI', 'CCAAW', 'CIMBB', 'CUQAM', 'DNQDD', 'FBUMG', 'GJJHK',
       'GOKXL', 'GOOBU', 'IDGFP', 'KFZMY', 'KZKKE', 'LAYPA', 'OJOBU',
       'QOQTS', 'UAGPU', 'UKPGS', 'UUUQX', 'VHDHF', 'VKWQH', 'VVTVA',
       'ZMNYA', 'ZOWMK'], dtype=object)

In [7]:
enc = OrdinalEncoder()

In [8]:
enc.fit_transform(df[['school']])

array([[ 0.],
       [ 0.],
       [ 0.],
       ...,
       [22.],
       [22.],
       [22.]])

In [9]:
df[['school']] = enc.fit_transform(df[['school']])

In [10]:
df

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch,pretest,posttest
0,0.0,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,62.0,72.0
1,0.0,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,66.0,79.0
2,0.0,Urban,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
3,0.0,Urban,Non-public,6OL,Standard,20.0,Female,Does not qualify,61.0,77.0
4,0.0,Urban,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...
2128,22.0,Urban,Public,ZBH,Standard,30.0,Female,Does not qualify,39.0,55.0
2129,22.0,Urban,Public,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,22.0,Urban,Public,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,22.0,Urban,Public,ZBH,Standard,30.0,Male,Qualifies for reduced/free lunch,46.0,53.0


# Encoding school setting

In [11]:
df['school_setting'].unique()

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [12]:
school_set = ['Urban', 'Suburban', 'Rural']

In [13]:
enc = OrdinalEncoder(categories=[school_set])

In [14]:
enc.fit_transform(df[['school_setting']])

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [15]:
df[['school_setting']] = enc.fit_transform(df[['school_setting']])

In [16]:
df['school_setting'].unique()

array([0., 1., 2.])

In [17]:
df

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch,pretest,posttest
0,0.0,0.0,Non-public,6OL,Standard,20.0,Female,Does not qualify,62.0,72.0
1,0.0,0.0,Non-public,6OL,Standard,20.0,Female,Does not qualify,66.0,79.0
2,0.0,0.0,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
3,0.0,0.0,Non-public,6OL,Standard,20.0,Female,Does not qualify,61.0,77.0
4,0.0,0.0,Non-public,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...
2128,22.0,0.0,Public,ZBH,Standard,30.0,Female,Does not qualify,39.0,55.0
2129,22.0,0.0,Public,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,22.0,0.0,Public,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,22.0,0.0,Public,ZBH,Standard,30.0,Male,Qualifies for reduced/free lunch,46.0,53.0


# Encoding School Type

In [18]:
df['school_type'].unique()

array(['Non-public', 'Public'], dtype=object)

In [19]:
school_type = ['Non-public', 'Public']

In [20]:
enc = OrdinalEncoder(categories=[school_type])

In [21]:
enc.fit_transform(df[['school_type']])

array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])

In [22]:
df[['school_type']] = enc.fit_transform(df[['school_type']])

In [23]:
df['school_type'].unique()

array([0., 1.])

In [24]:
df

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch,pretest,posttest
0,0.0,0.0,0.0,6OL,Standard,20.0,Female,Does not qualify,62.0,72.0
1,0.0,0.0,0.0,6OL,Standard,20.0,Female,Does not qualify,66.0,79.0
2,0.0,0.0,0.0,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
3,0.0,0.0,0.0,6OL,Standard,20.0,Female,Does not qualify,61.0,77.0
4,0.0,0.0,0.0,6OL,Standard,20.0,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...
2128,22.0,0.0,1.0,ZBH,Standard,30.0,Female,Does not qualify,39.0,55.0
2129,22.0,0.0,1.0,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,22.0,0.0,1.0,ZBH,Standard,30.0,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,22.0,0.0,1.0,ZBH,Standard,30.0,Male,Qualifies for reduced/free lunch,46.0,53.0


# Encoding Classroom

In [25]:
df['classroom'].unique()

array(['6OL', 'ZNS', '2B1', 'EPS', 'IQN', 'PGK', 'UHU', 'UWK', 'A33',
       'EID', 'HUJ', 'PC6', '1Q1', 'BFY', 'OMI', 'X6Z', '2AP', 'PW5',
       'ROP', 'ST7', 'XXJ', '197', '5LQ', 'JGD', 'HCB', 'NOR', 'X78',
       'YUC', 'ZDT', 'ENO', 'TSA', 'VA6', '18K', 'CXC', 'HKF', 'PBA',
       'U6J', 'W8A', '05H', '98D', 'G2L', 'P2A', 'XZM', '1VD', '21Q',
       '2BR', '3D0', '5JK', 'O6A', 'QTU', 'AJ1', 'J8J', 'RA5', '5SZ',
       '6U9', 'FS3', 'XJ8', '0N7', '3XJ', 'RK7', 'SUR', 'X2O', 'XZ4',
       '1SZ', '62L', 'NWZ', 'S98', '08N', '9AW', 'IPU', 'KXB', 'PGH',
       'XXE', '6C1', 'AE1', 'H7S', 'P8I', 'SSP', 'CD8', 'J6X', 'KR1',
       '341', 'D33', 'DFQ', 'GYM', 'IEM', '7BL', 'A93', 'TB5', 'YTB',
       '1UU', '4NN', 'V77', 'CII', 'Q0E', 'QA2', 'ZBH'], dtype=object)

In [26]:
classroom = ['6OL', 'ZNS', '2B1', 'EPS', 'IQN', 'PGK', 'UHU', 'UWK', 'A33',
       'EID', 'HUJ', 'PC6', '1Q1', 'BFY', 'OMI', 'X6Z', '2AP', 'PW5',
       'ROP', 'ST7', 'XXJ', '197', '5LQ', 'JGD', 'HCB', 'NOR', 'X78',
       'YUC', 'ZDT', 'ENO', 'TSA', 'VA6', '18K', 'CXC', 'HKF', 'PBA',
       'U6J', 'W8A', '05H', '98D', 'G2L', 'P2A', 'XZM', '1VD', '21Q',
       '2BR', '3D0', '5JK', 'O6A', 'QTU', 'AJ1', 'J8J', 'RA5', '5SZ',
       '6U9', 'FS3', 'XJ8', '0N7', '3XJ', 'RK7', 'SUR', 'X2O', 'XZ4',
       '1SZ', '62L', 'NWZ', 'S98', '08N', '9AW', 'IPU', 'KXB', 'PGH',
       'XXE', '6C1', 'AE1', 'H7S', 'P8I', 'SSP', 'CD8', 'J6X', 'KR1',
       '341', 'D33', 'DFQ', 'GYM', 'IEM', '7BL', 'A93', 'TB5', 'YTB',
       '1UU', '4NN', 'V77', 'CII', 'Q0E', 'QA2', 'ZBH']

In [27]:
enc = OrdinalEncoder(categories=[classroom])

In [28]:
enc.fit_transform(df[['classroom']])

array([[ 0.],
       [ 0.],
       [ 0.],
       ...,
       [96.],
       [96.],
       [96.]])

In [29]:
df[['classroom']] = enc.fit_transform(df[['classroom']])

In [30]:
df['classroom'].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
       39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51.,
       52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64.,
       65., 66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77.,
       78., 79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90.,
       91., 92., 93., 94., 95., 96.])

In [31]:
df

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch,pretest,posttest
0,0.0,0.0,0.0,0.0,Standard,20.0,Female,Does not qualify,62.0,72.0
1,0.0,0.0,0.0,0.0,Standard,20.0,Female,Does not qualify,66.0,79.0
2,0.0,0.0,0.0,0.0,Standard,20.0,Male,Does not qualify,64.0,76.0
3,0.0,0.0,0.0,0.0,Standard,20.0,Female,Does not qualify,61.0,77.0
4,0.0,0.0,0.0,0.0,Standard,20.0,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...
2128,22.0,0.0,1.0,96.0,Standard,30.0,Female,Does not qualify,39.0,55.0
2129,22.0,0.0,1.0,96.0,Standard,30.0,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,22.0,0.0,1.0,96.0,Standard,30.0,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,22.0,0.0,1.0,96.0,Standard,30.0,Male,Qualifies for reduced/free lunch,46.0,53.0


# Encoding teaching method

In [32]:
df['teaching_method'].unique()

array(['Standard', 'Experimental'], dtype=object)

In [33]:
teach_method = ['Standard', 'Experimental']

In [34]:
enc = OrdinalEncoder(categories=[teach_method])

In [35]:
enc.fit_transform(df[['teaching_method']])

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [36]:
df[['teaching_method']] = enc.fit_transform(df[['teaching_method']])

In [37]:
df['teaching_method'].unique()

array([0., 1.])

# Encoding Gender

In [38]:
df['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [39]:
gender = ['Female', 'Male']

In [40]:
enc = OrdinalEncoder(categories=[gender])

In [41]:
df[['gender']] = enc.fit_transform(df[['gender']])

In [42]:
df['gender'].unique()

array([0., 1.])

# Encoding lunch

In [43]:
df['lunch'].unique()

array(['Does not qualify', 'Qualifies for reduced/free lunch'],
      dtype=object)

In [44]:
lunch = ['Does not qualify', 'Qualifies for reduced/free lunch']

In [45]:
enc =OrdinalEncoder(categories=[lunch])

In [46]:
df[['lunch']] = enc.fit_transform(df[['lunch']])

In [47]:
df['lunch'].unique()

array([0., 1.])

# Build X and Y Variable

Build X for pretest and X for school,lunch,etc.

In [48]:
X1 = df[['pretest']]
Y1 = df['posttest']
X2 = df.drop(['posttest'],axis=1)
Y2 = df['posttest']

In [49]:
X1

Unnamed: 0,pretest
0,62.0
1,66.0
2,64.0
3,61.0
4,64.0
...,...
2128,39.0
2129,38.0
2130,45.0
2131,46.0


In [50]:
Y1

0       72.0
1       79.0
2       76.0
3       77.0
4       76.0
        ... 
2128    55.0
2129    46.0
2130    51.0
2131    53.0
2132    48.0
Name: posttest, Length: 2133, dtype: float64

# Linear Regression (X1,Y1) and Prediction 

In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(X1, Y1, test_size=0.2, random_state = 0)

In [52]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [53]:
lin_reg.fit(X_train,Y_train)

In [54]:
y_pred = lin_reg.predict(X_test)

In [55]:
lin_reg.score(X_test,Y_test)

0.9001015446462145

In [56]:
y_pred2 = lin_reg.predict(X_train)

In [57]:
df_preds = pd.DataFrame({'Actual': Y_test.squeeze(), 'Predicted': y_pred.squeeze()})
df_preds

Unnamed: 0,Actual,Predicted
1968,79.0,74.062412
379,74.0,70.130995
689,56.0,62.268160
935,59.0,51.456763
252,57.0,65.216723
...,...,...
666,57.0,55.388180
459,78.0,77.010975
596,66.0,68.165286
376,59.0,56.371034


In [58]:
lin_reg.predict([[62]])



array([74.06241181])

In [59]:
lin_reg.score(X_train,Y_train)

0.9051434691866311

# Multiple Linear Regression (X2,Y2) and Prediction

In [60]:
X_train, X_test,Y_train, Y_test = train_test_split(X2, Y2, test_size=0.2, random_state = 0)

In [61]:
regressor = LinearRegression()

In [62]:
regressor.fit(X_train,Y_train)

In [63]:
y_pred = regressor.predict(X_test)

In [64]:
regressor.score(X_test,Y_test)

0.9410746411644441

In [65]:
regressor.predict([[0,0,0,0,0,20,0,0,62]])



array([72.34367214])

In [66]:
df_preds = pd.DataFrame({'Actual': Y_test.squeeze(), 'Predicted': y_pred.squeeze()})
df_preds

Unnamed: 0,Actual,Predicted
1968,79.0,77.740819
379,74.0,73.117243
689,56.0,59.363226
935,59.0,55.259665
252,57.0,63.513776
...,...,...
666,57.0,53.755254
459,78.0,81.565849
596,66.0,65.143512
376,59.0,60.858552


# Decision Tree Regression

In [67]:
X_train, X_test,Y_train, Y_test = train_test_split(X2, Y2, test_size=0.2, random_state = 0)

In [68]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X2,Y2)

In [69]:
y_pred = regressor.predict(X_test)

In [70]:
regressor.score(X_test,Y_test)

0.9866150368201834

In [71]:
regressor.predict([[0,0,0,0,0,20,0,0,62]])



array([72.])

In [72]:
df_preds = pd.DataFrame({'Actual': Y_test.squeeze(), 'Predicted': y_pred.squeeze()})
df_preds

Unnamed: 0,Actual,Predicted
1968,79.0,79.0
379,74.0,76.5
689,56.0,56.5
935,59.0,56.6
252,57.0,60.5
...,...,...
666,57.0,56.5
459,78.0,78.0
596,66.0,66.0
376,59.0,58.5


# Random Forest Regression

In [73]:
X_train, X_test,Y_train, Y_test = train_test_split(X2, Y2, test_size=0.2, random_state = 0)

In [74]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10,random_state=0)
regressor.fit(X2,Y2)

In [75]:
y_pred = regressor.predict(X_test)

In [76]:
regressor.score(X_test,Y_test)

0.9795581846580844

In [77]:
regressor.predict([[0,0,0,0,0,20,0,0,62]])



array([72.13333333])

In [78]:
df_preds = pd.DataFrame({'Actual': Y_test.squeeze(), 'Predicted': y_pred.squeeze()})
df_preds

Unnamed: 0,Actual,Predicted
1968,79.0,79.325000
379,74.0,76.500000
689,56.0,56.668333
935,59.0,56.462222
252,57.0,59.508333
...,...,...
666,57.0,54.983333
459,78.0,78.133333
596,66.0,65.650000
376,59.0,58.986667
