In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [4]:
data = pd.read_csv('StudentsPerformance.csv')

In [5]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Splitting Into Independent and Dependent Variables (x,y)

In [7]:
x = data.drop('math score', axis = 1)

In [8]:
x.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [9]:
y = data['math score']

In [10]:
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math score, dtype: int64

# Encode X(Independent Variables)

In [12]:
encoder = LabelEncoder()

In [13]:
x['gender'] = encoder.fit_transform(x['gender'])
x['race/ethnicity'] = encoder.fit_transform(x['race/ethnicity'])
x['parental level of education'] = encoder.fit_transform(x['parental level of education'])
x['lunch'] = encoder.fit_transform(x['lunch'])
x['test preparation course'] = encoder.fit_transform(x['test preparation course'])


In [14]:
x.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,0,1,1,1,1,72,74
1,0,2,4,1,0,90,88
2,0,1,3,1,1,95,93
3,1,0,0,0,1,57,44
4,1,2,4,1,1,78,75


# Splitting the Data Set for Training

In [16]:
x_train, x_test, y_train, y_test = train_test_split( x,y,  random_state = 0, test_size = 0.1)

# Training The Model

In [18]:
regression = LinearRegression()

In [19]:
regression.fit(x_train, y_train)

# Checking The Accuracy of Our Training

In [21]:
regression.score(x_train, y_train) * 100

86.98986785656815

# Training With Other Models To Acheive Better Accuracy

In [23]:
decision = DecisionTreeRegressor()
Rforest = RandomForestRegressor ()

In [24]:
Rforest.fit(x_train, y_train)

In [25]:
decision.fit(x_train, y_train)

In [26]:
Rforest.score(x_train, y_train)

0.9775736486996653

In [27]:
decision.score(x_train, y_train) # we have now acheived nearly 100 percent accuracy with this alorithm

0.999282359506258

# Making Predictions

In [29]:
pred = decision.predict(x_test)

In [30]:
pred

array([57., 77., 45., 80., 73., 76., 63., 50., 97., 53., 40., 56., 86.,
       90., 44., 29., 71., 86., 61., 81., 61., 46., 81., 71., 50., 70.,
       57., 52., 62., 70., 75., 98., 84., 42., 35., 88., 87., 67., 71.,
       77., 69., 61., 73., 74., 67., 53., 81., 52., 75., 53., 53., 61.,
       43., 71., 44., 61., 43., 76., 82., 49., 62., 73., 45., 85., 62.,
       66., 79., 41., 58., 54., 75., 90., 74., 58., 37., 69., 61., 63.,
       26., 73., 80., 77., 71., 94., 76., 69., 61., 79., 70., 54., 60.,
       52., 59., 53., 83., 74., 49., 57., 69., 76.])

In [31]:
pred1 = Rforest.predict(x_test)

In [32]:
pred1

array([58.6       , 74.99      , 46.78      , 70.04      , 71.03      ,
       74.17      , 63.1       , 51.07      , 92.16      , 38.34      ,
       44.42      , 56.14      , 82.11      , 83.84833333, 47.87      ,
       31.7       , 52.02      , 77.36      , 52.57      , 80.73      ,
       53.58      , 43.56      , 71.23      , 63.81      , 48.59      ,
       65.05      , 61.34      , 52.32      , 58.7       , 74.91      ,
       73.38      , 90.14      , 88.12      , 52.82      , 44.93      ,
       86.03333333, 89.8       , 61.09      , 67.83      , 67.37      ,
       71.5       , 58.78      , 74.67      , 69.75      , 71.11      ,
       53.47      , 78.51      , 53.57      , 78.1       , 54.21      ,
       43.31      , 60.75      , 45.52      , 65.2       , 42.815     ,
       62.04      , 47.76      , 67.21      , 81.23      , 54.53      ,
       71.45      , 70.3       , 46.06      , 81.56      , 68.74      ,
       69.75      , 75.95      , 46.4       , 62.68      , 54.76

In [33]:
pred2 = regression.predict(x_test)

# From the scores below it shows that the lower the training accuracy score is the higher the predictions accuracy score is. Low training accuracy score is not always a bad thing in data science.

In [35]:
regression.score(x_test,y_test) # Our Predictions are 83% accurate

0.8351059515942096

In [36]:
Rforest.score(x_test, y_test) # Our Predictions are 81% accurate

0.8227303437329561

In [37]:
decision.score(x_test, y_test) # Our Predictions are 72% accurate

0.714582957273816

In [38]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Building An Input Option For Users To Test The Model With New Data

In [40]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [None]:
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
numerical_features = ['reading score', 'writing score']

# === Build preprocessing pipeline ===
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) #cat is a random variable name, it can be anythingsc
    ],
    remainder='passthrough'  # Keep numerical features as they are
)


# === Combine preprocessor with a regression model ===
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())  # Or your model like DecisionTreeRegressor, etc.
])

# === Separate features and target from training data ===
X = data[categorical_features + numerical_features]
y = data['math score']

# === Train the pipeline ===
pipeline.fit(X, y)

# === Now collect new user input for prediction ===
Gender = input('Enter Your Gender: ')
Race = input('Enter your race/ethnicity: ')
ParentalEducation = input('Enter Your Parental Level of Education: ')
Lunch = input('Enter Your Lunch: ')
TestPrep = input('Enter Your Test Preparation Course: ')
ReadingScore = int(input('Enter Your Reading Score: '))
WritingScore = int(input('Enter Your Writing Score: '))

# === Organize input into a single-row DataFrame ===
new_input = pd.DataFrame([{
    'gender': Gender,
    'race/ethnicity': Race,
    'parental level of education': ParentalEducation,
    'lunch': Lunch,
    'test preparation course': TestPrep,
    'reading score': ReadingScore,
    'writing score': WritingScore
}])

# === Make prediction ===
prediction = pipeline.predict(new_input)
print("Approximated Predicted Math Score:", round(prediction[0]))

if prediction >= 60:
    print('You Passed The Maths test')
if prediction < 60:
    print('You Failed The Maths Test')



In [42]:
new = [[]]

Gender = ('Enter Your Gender: ')
race = ('Enter your race/ethnicity: ')
PLE = ('Enter Your Parental Level of education: ')
Lunch = ('Enter Your Lunch: ')
TPC = ('Enter Your Test Preparation Course: ')
RS = int(('Enter Your Reading Score: '))
WS = int(('Enter Your Writing Score: '))

categorical_data = [[Gender, race, PLE, Lunch, TPC]]
numerical_data = [[RS, WS]]

encoded_categorical = encoder.transform(categorical_data)
final_input = np.hstack((encoded_categorical, numerical_data))




prediction = regression.predict(final_input)

print(f"Predicted Result: {prediction[0]}")

ValueError: invalid literal for int() with base 10: 'Enter Your Reading Score: '

In [None]:
data.head()

In [None]:
categorical_features = ['Gender', 'Race/Ethnicity', 'Parental Level of Education', 'Lunch', 'Test Preparation course']
numerical_features = ['Reading Score', 'Writing Score']
