In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [41]:
# Load the raw CSV, then drop every row that has a missing value and every
# exact duplicate row. The surviving rows keep their original index labels
# (the index is not reset).
df = (
    pd.read_csv('data.csv')
    .dropna()
    .drop_duplicates()
)
df

Unnamed: 0,StudyHours,ExtraParticipation,PapersPracticed,PreviousMarks,SleepingHours,PerformancePapersPracticedIndex
0,5.0,No,2.0,69.0,8,5.0
1,2.0,Yes,8.0,46.0,4,20.0
2,7.0,Yes,5.0,56.0,7,46.0
3,6.0,Yes,5.0,42.0,8,28.0
4,7.0,No,6.0,53.0,4,4.0
...,...,...,...,...,...,...
9993,5.0,Yes,3.0,90.0,8,78.0
9994,3.0,No,4.0,5.0,4,29.0
9996,4.0,No,3.0,68.0,9,5.0
9997,9.0,No,6.0,48.0,7,44.0


In [42]:
# Show column dtypes and non-null counts for the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7648 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   StudyHours                       7648 non-null   float64
 1   ExtraParticipation               7648 non-null   object 
 2   PapersPracticed                  7648 non-null   float64
 3   PreviousMarks                    7648 non-null   float64
 4   SleepingHours                    7648 non-null   int64  
 5   PerformancePapersPracticedIndex  7648 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 418.2+ KB


In [43]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,StudyHours,PapersPracticed,PreviousMarks,SleepingHours,PerformancePapersPracticedIndex
count,7648.0,7648.0,7648.0,7648.0,7648.0
mean,5.519744,4.981956,64.049555,6.523536,51.868593
std,2.284837,2.750073,24.906894,1.700536,23.908146
min,2.0,0.0,4.0,4.0,0.0
25%,4.0,3.0,49.0,5.0,37.0
50%,6.0,5.0,67.0,7.0,54.0
75%,7.0,7.0,84.0,8.0,70.0
max,9.0,9.0,99.0,9.0,99.0


In [44]:
import plotly.express as ex
import plotly.io as pio

# Route every Plotly figure in this session to the 'browser' renderer.
pio.renderers.default = 'browser'

# 3-D scatter of the two columns later used as features (x, y axes)
# against the target column (z axis).
ex.scatter_3d(
    df,
    x='StudyHours',
    y="PapersPracticed",
    z="PerformancePapersPracticedIndex",
)

In [45]:
# Select the features and the target by column name instead of by position.
# The previous positional slice (columns 0 and 2, plus the last column) picked
# exactly these columns, but would silently pick different ones if the CSV
# column order ever changed.
FEATURE_COLUMNS = ['StudyHours', 'PapersPracticed']
TARGET_COLUMN = 'PerformancePapersPracticedIndex'

x = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]
y

0        5.0
1       20.0
2       46.0
3       28.0
4        4.0
        ... 
9993    78.0
9994    29.0
9996     5.0
9997    44.0
9999    24.0
Name: PerformancePapersPracticedIndex, Length: 7648, dtype: float64

In [46]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Inspect the training features: column names, dtypes and non-null counts.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6118 entries, 1983 to 9512
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   StudyHours       6118 non-null   float64
 1   PapersPracticed  6118 non-null   float64
dtypes: float64(2)
memory usage: 143.4 KB


In [48]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so it can be bound directly; the bare name on the last line shows
# the fitted model as the cell output.
model = LinearRegression().fit(x_train, y_train)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [49]:
# Predict the target for the held-out test features.
# NOTE(review): y_pred is not read by any later cell visible here, so the model
# is never scored against y_test -- consider adding an evaluation metric.
y_pred = model.predict(x_test)

In [50]:
# Peek at the first four rows of the cleaned frame.
# NOTE(review): presumably used to pick sample inputs for a hand-computed
# prediction (e.g. StudyHours=7, PapersPracticed=5) -- confirm.
df.head(4)

Unnamed: 0,StudyHours,ExtraParticipation,PapersPracticed,PreviousMarks,SleepingHours,PerformancePapersPracticedIndex
0,5.0,No,2.0,69.0,8,5.0
1,2.0,Yes,8.0,46.0,4,20.0
2,7.0,Yes,5.0,56.0,7,46.0
3,6.0,Yes,5.0,42.0,8,28.0


In [51]:
b0 = model.intercept_
b = model.coef_
b1 = b0
y = (b[0] * 7) + (b[1] * 5) + b0

In [52]:
y

np.float64(56.03323782024506)

In [53]:
import plotly.graph_objects as go

# Number of sample points along each feature axis of the prediction grid.
GRID_POINTS = 10

# Evenly spaced values spanning the observed range of each feature.
study = df['StudyHours']
papers = df['PapersPracticed']
x_values = np.linspace(study.min(), study.max(), GRID_POINTS)
y_values = np.linspace(papers.min(), papers.max(), GRID_POINTS)

# With meshgrid's default indexing, element [i, j] of each grid corresponds
# to (x_values[j], y_values[i]).
xGrid, yGrid = np.meshgrid(x_values, y_values)

# Flatten the grid into one row per (StudyHours, PapersPracticed) pair, using
# the same column names and order the model was fitted with.
final = pd.DataFrame({
    'StudyHours': xGrid.ravel(),
    'PapersPracticed': yGrid.ravel(),
})

# Predict on every grid point and fold the result back into the grid shape,
# so z_final[i, j] is the prediction at (x_values[j], y_values[i]).
z_final = model.predict(final).reshape(GRID_POINTS, GRID_POINTS)

# Overlay the fitted surface on the scatter of the observed data.
fig = ex.scatter_3d(
    df,
    x='StudyHours',
    y='PapersPracticed',
    z='PerformancePapersPracticedIndex',
)
plane = go.Surface(
    x=x_values,
    y=y_values,
    z=z_final,
    opacity=0.5,
    colorscale='Viridis',
)
fig.add_trace(plane)
fig.show()

In [54]:
from joblib import dump

# Persist the fitted model to 'model.joblib'. The path is relative, so the
# file is written to the current working directory.
dump(model, 'model.joblib')

['model.joblib']