In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn import preprocessing, svm 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 

In [2]:
df=pd.read_csv('Student_Performance.csv')
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [4]:
df.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [6]:
#train our model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#define X (features) and Y (target)
X=df.drop(columns=["Performance Index"])
Y=df["Performance Index"]
X=pd.get_dummies(X,columns=["Extracurricular Activities"],drop_first=True)
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

print("Training set shape:",X_train.shape,Y_train.shape)
print("Testing set shape:",X_test.shape,Y_test.shape)


<IPython.core.display.Javascript object>

Training set shape: (8000, 5) (8000,)
Testing set shape: (2000, 5) (2000,)


In [7]:
regr=LinearRegression()

<IPython.core.display.Javascript object>

In [9]:
regr.fit(X_train,Y_train)
print(regr.score(X_test,Y_test))

0.9889832909573145


In [10]:
train_score = regr.score(X_train, Y_train)
test_score = regr.score(X_test, Y_test)

print(f"Training Score: {train_score}")
print(f"Testing Score: {test_score}")


Training Score: 0.9886898790682355
Testing Score: 0.9889832909573145


In [11]:
# feature importance analysis
feature_importance = pd.Series(regr.coef_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False))


<IPython.core.display.Javascript object>

Hours Studied                       2.852484
Previous Scores                     1.016988
Extracurricular Activities_Yes      0.608617
Sleep Hours                         0.476941
Sample Question Papers Practiced    0.191831
dtype: float64


In [12]:
#so sample  question and sleep hours are less participating we are dropping them
X_train = X_train.drop(columns=["Sample Question Papers Practiced"])
X_test = X_test.drop(columns=["Sample Question Papers Practiced"])


In [13]:
# Retrain the model
regr.fit(X_train, Y_train)

# Check new model performance
train_score = regr.score(X_train, Y_train)
test_score = regr.score(X_test, Y_test)

print(f"New Training Score: {train_score}")
print(f"New Testing Score: {test_score}")


New Training Score: 0.9878730826973215
New Testing Score: 0.9880663370556699


In [14]:
#hypertuning model 
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Ridge regression
ridge = Ridge()

# Perform Grid Search
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, Y_train)

# Print best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'alpha': 10}
Best Score: 0.9878521734145658


In [15]:
from sklearn.linear_model import Ridge

# Train final model with best alpha
ridge_final = Ridge(alpha=10)
ridge_final.fit(X_train, Y_train)

# Evaluate performance
train_score = ridge_final.score(X_train, Y_train)
test_score = ridge_final.score(X_test, Y_test)

print(f"Final Training Score: {train_score}")
print(f"Final Testing Score: {test_score}")


Final Training Score: 0.9878730710140274
Final Testing Score: 0.9880656754760474


In [17]:
def predict_performance():
    # Get user input
    hours_studied = float(input("Enter Hours Studied: "))
    previous_scores = float(input("Enter Previous Scores: "))
    extracurricular = input("Did the student participate in Extracurricular Activities? (Yes/No): ").strip().lower()
    sleep_hours = float(input("Enter Sleep Hours: "))
    
    # One-hot encode the categorical variable
    extracurricular_yes = 1 if extracurricular == "yes" else 0

    # Create input array (ensure it's in the right format)
    user_input = [[hours_studied, previous_scores, extracurricular_yes, sleep_hours]]

    # Make prediction
    predicted_performance = ridge_final.predict(user_input)

    print(f"\nPredicted Performance Index: {predicted_performance[0]:.2f}")

# Call function to test
predict_performance()


Enter Hours Studied:  5
Enter Previous Scores:  56
Did the student participate in Extracurricular Activities? (Yes/No):  Yes
Enter Sleep Hours:  10



Predicted Performance Index: 44.70




In [1]:
import os
print(os.getcwd())  # This prints the folder path where your file is saved


C:\Users\Khushi Rajora


In [3]:
import os
print(os.getcwd())  # This shows the exact folder where your file is saved


C:\Users\Khushi Rajora
