In [None]:
import pandas as pd
# Location of the comma-separated dataset on the local machine.
path = "D:/StudentPerformance/datasets.csv"

# Read the whole file into a DataFrame and display it as the cell output.
RawData = pd.read_csv(path, sep=",")
RawData

In [None]:
# Features kept for modelling; 'Class' (the target) is deliberately last so
# later cells can split features/target with iloc[:, :-1] / iloc[:, -1].
selectedColumns = [
    'raisedhands',
    'VisITedResources',
    'AnnouncementsView',
    'Discussion',
    'ParentAnsweringSurvey',
    'StudentAbsenceDays',
    'ParentschoolSatisfaction',
    'Class',
]

# Take an explicit copy: `data` is modified later, and mutating a bare column
# selection of RawData triggers pandas' SettingWithCopyWarning.
data = RawData[selectedColumns].copy()
data

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook.
# It is kept so downstream cells behave as before, but consider narrowing it.
warnings.filterwarnings("ignore")

# String label -> integer code, applied to every column of `data`.
# The replacement values are ints and the keys are strings, so one combined
# mapping gives the same result as applying the mappings one after another.
label_codes = {
    'Under-7': 0, 'Above-7': 1,
    'No': 0, 'Yes': 1,
    'L': 0, 'M': 1, 'H': 2,
    'Bad': 0, 'Good': 1,
    'Father': 0, 'Mum': 1,
}

# Build a new frame instead of mutating in place (keeps the cell idempotent and
# avoids writing through a slice of RawData). infer_objects() converts the
# now all-integer object columns to a numeric dtype explicitly rather than
# relying on replace() downcasting implicitly.
data = data.replace(label_codes).infer_objects()
data

In [None]:
import numpy as np
# Draw one demo row with a fixed seed so that the same row is selected on
# every Restart & Run All (an unseeded sample changes on each run).
sampled = data.sample(n=1, random_state=52)

# Index *label* of the sampled row; use it with .loc, not .iloc.
random_index = sampled.index[0]

# Feature values only (last column, the target, is dropped), shaped (1, n_features)
# so it can be passed straight to model.predict().
random_row = np.array(sampled.iloc[0, :-1]).reshape(1, -1)
print(random_index)

## DATA ANALYSIS

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
RawData.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Categorical columns to plot against the target
categorical_columns = ['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 'GradeID', 'SectionID', 'Topic', 'Semester', 'Relation']

# One colour per target class, shared by every panel
class_palette = {'H': 'lime', 'M': 'grey', 'L': 'red'}

# Two panels per row; round up so an odd number of columns still fits
num_columns = len(categorical_columns)
num_rows = (num_columns + 1) // 2
fig, axes = plt.subplots(num_rows, 2, figsize=(10, 3 * num_rows))

# Flatten the 2D grid so panels can be addressed with a single index
axes = axes.flatten()

for i, column in enumerate(categorical_columns):
    sns.countplot(x=column, hue='Class', data=RawData, palette=class_palette, ax=axes[i])
    axes[i].set_title(f'Relationship between {column} and Class')
    axes[i].tick_params(axis='x', rotation=45)  # Rotate x-axis labels for better visibility

# With an odd number of columns the last grid cell has nothing drawn in it;
# hide it instead of showing an empty set of axes.
for unused_ax in axes[num_columns:]:
    unused_ax.set_visible(False)

plt.tight_layout(h_pad=2, w_pad=2)

plt.show()


For the columns plotted above, we can't establish a clear relationship with the target variable, so we will remove all of these features, as they may hurt the accuracy. Now let's move on to the numerical columns.

In [None]:
# Reshape the four numeric activity columns to long format:
# one row per (student, variable) pair, keeping 'Class' as the identifier.
numeric_columns = ['raisedhands', 'VisITedResources', 'AnnouncementsView', 'Discussion']
melt = RawData.melt(id_vars='Class', value_vars=numeric_columns)
melt

In [None]:
# Swarm plot of each numeric variable, coloured by target class.
swarm_ax = sns.swarmplot(
    data=melt,
    x='variable',
    y='value',
    hue='Class',
    palette={'H': 'lime', 'M': 'grey', 'L': 'red'},
)
swarm_ax.set_ylabel('Values from zero to 100')
swarm_ax.set_title('High, middle and low level students')


As you can see, there is a clear relationship between raisedhands, VisITedResources, AnnouncementsView and Discussion and the target Class.

In [None]:
# One panel per remaining categorical feature, each split by target class.
fig, axes = plt.subplots(1, 3, figsize=(12, 5))

panels = [
    ('StudentAbsenceDays', 'Relationship between StudentAbsent and Class'),
    ('ParentschoolSatisfaction', 'Relationship between Parent Satisfaction and Class'),
    ('ParentAnsweringSurvey', 'Relationship between Parent Survey and Class'),
]

for ax, (column, title) in zip(axes, panels):
    sns.countplot(
        x=column,
        hue='Class',
        data=RawData,
        palette={'H': 'lime', 'M': 'grey', 'L': 'red'},
        ax=ax,
    )
    ax.set_title(title)

plt.tight_layout()

plt.show()

As you can see, StudentAbsenceDays is an important feature to select because it directly affects the Class variable.

## LOGISTIC REGRESSION

In [5]:
# Work on a private copy of the encoded data for this section.
logCopy = data.copy()

# Every column but the last is a feature; the last column is the target.
x, y = logCopy.iloc[:, :-1], logCopy.iloc[:, -1]

In [None]:
# Feature names: every column of logCopy except the target.
Features = list(logCopy.columns)
Features.remove('Class')

# Target column, printed for inspection.
Target = logCopy['Class']
print(Target)

In [7]:
from sklearn.model_selection  import train_test_split 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=52
)

# Fit the classifier and predict the held-out rows.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy, per-class report, and RMSE computed on the integer class codes.
logiScore = accuracy_score(y_test, y_pred)
logiReport = classification_report(y_test, y_pred)
Logirmse = mean_squared_error(y_test, y_pred) ** 0.5

In [None]:
# Show the class predicted by the fitted model for each row of X_test.
print(y_pred)

In [None]:
# Summary of the logistic-regression metrics, one item per line.
print(
    f'score={logiScore}',
    f'rmse={Logirmse}',
    logiReport,
    sep='\n',
)

In [None]:
# Predict the class of the demo row drawn earlier.
prediction = model.predict(random_row)
print(f"prediction for randomly selected row: {prediction[0]}")

# random_index is the row's index *label* (taken from the sampled row), so look
# it up by label with .loc; positional .iloc only matches while the index
# happens to be 0..n-1.
actualValue = logCopy.loc[random_index, 'Class']
print(f"actual value for randomly selected row: {actualValue}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Overlay actual and predicted classes, one marker per test sample.
actual_positions = np.arange(len(y_test))
predicted_positions = np.arange(len(y_pred))

plt.scatter(actual_positions, y_test, label='Actual', marker='o', c='r', s=40)
plt.scatter(predicted_positions, y_pred, label='Predicted', marker='x', c='g', s=30)

plt.title('Actual vs. Predicted for LOGISTIC REGRESSION')
plt.xlabel('Sample Index')
plt.ylabel('Final Grade Class')
plt.legend()
plt.show()

## MULTIPLE LINEAR REGRESSION


In [12]:
# Private copy of the encoded data for the linear-regression section.
MLRcopy = data.copy()

# Features are all columns but the last; the last column is the target.
x, y = MLRcopy.iloc[:, :-1], MLRcopy.iloc[:, -1]

In [None]:
from sklearn.linear_model import LinearRegression
# Same 80/20 split (and seed) as the logistic-regression section.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=52)

model = LinearRegression()
model.fit(X_train, y_train)
intercept = model.intercept_
coeffs = model.coef_

# The regressor outputs continuous values; round them to the nearest integer
# so they can be compared with the integer class codes.
# NOTE(review): rounding can produce values outside the encoded class range;
# consider clipping if that shows up in the report.
predictions = model.predict(X_test)
y_pred_mlr = np.round(predictions)

mse = mean_squared_error(y_test, y_pred_mlr)
MLRrmse = pow(mse, 0.5)

# Score this model's own predictions. The previous version passed `y_pred`
# (the logistic-regression predictions), so it reported the wrong model.
MLRScore = accuracy_score(y_test, y_pred_mlr)
MLRReport = classification_report(y_test, y_pred_mlr)
print(y_pred_mlr)

In [None]:
# Summary of the linear-regression metrics, one item per line.
print(
    f'score={MLRScore}',
    f'rmse={MLRrmse}',
    MLRReport,
    sep='\n',
)

In [None]:
# Raw (unrounded) regression output for the demo row drawn earlier.
prediction = model.predict(random_row)
print(f"prediction for randomly selected row: {prediction[0]}")

# Look the true class up by index label in this section's own frame;
# random_index is a label, so .loc is the matching accessor.
actualValue = MLRcopy.loc[random_index, 'Class']
print(f"actual value for randomly selected row: {actualValue}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Overlay actual classes and rounded regression predictions per test sample.
actual_positions = np.arange(len(y_test))
predicted_positions = np.arange(len(y_pred_mlr))

plt.scatter(actual_positions, y_test, label='Actual', marker='o', c='r', s=40)
plt.scatter(predicted_positions, y_pred_mlr, label='Predicted', marker='x', c='g', s=30)

plt.title('Actual vs. Predicted for MULTIPLE LINEAR REGRESSION')
plt.xlabel('Sample Index')
plt.ylabel('Final Grade Class')
plt.legend()
plt.show()

## POLYNOMIAL REGRESSION

In [17]:
# Private copy of the encoded data for the polynomial-regression section.
polyCopy = data.copy()

# Split features/target from this section's own copy. The previous version
# read from MLRcopy, leaving polyCopy unused and tying this section to the
# linear-regression cells.
x = polyCopy.iloc[:, :-1]
y = polyCopy.iloc[:, -1]

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
def PolynomialRegression(degree,x,y,ipf):
    """Fit a polynomial regression of the given degree and print its metrics.

    The data is split 80/20 (random_state=52), the features are expanded with
    PolynomialFeatures(degree), a LinearRegression is fitted on the expanded
    training features, and the rounded test predictions are scored.

    Parameters
    ----------
    degree : polynomial degree passed to PolynomialFeatures.
    x : feature table.
    y : target values aligned with ``x``.
    ipf : feature row(s) to predict as a demo; only the first prediction
        is printed.

    Returns
    -------
    The test-set predictions rounded to the nearest integer.

    Notes
    -----
    Reads the notebook globals ``logCopy`` and ``random_index`` to print the
    true class of the demo row.
    """
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=52)

    # Fit the feature expansion on the training split only, then reuse the
    # fitted transformer for the test split and the demo row.
    polyFeatures = PolynomialFeatures(degree)
    x_train_poly = polyFeatures.fit_transform(x_train)
    x_test_poly = polyFeatures.transform(x_test)
    ipf_poly = polyFeatures.transform(ipf)

    model = LinearRegression()
    model.fit(x_train_poly, y_train)

    # Continuous outputs are rounded so they can be scored as class codes.
    polyPredictions = model.predict(x_test_poly)
    y_pred_poly = np.round(polyPredictions)

    Polymse = mean_squared_error(y_test, y_pred_poly)
    Polyrmse = pow(Polymse, 0.5)
    PolyScore = accuracy_score(y_test, y_pred_poly)
    PolyReport = classification_report(y_test, y_pred_poly)

    pred = model.predict(ipf_poly)
    print(f"prediction for randomly selected row: {pred[0]}")
    # random_index is an index label, so look it up with .loc rather than .iloc.
    actualValue = logCopy.loc[random_index, 'Class']
    print(f"actual value for randomly selected row: {actualValue}")
    print(f'rmse={Polyrmse}')
    print(f'score={PolyScore}')
    print(PolyReport)
    return y_pred_poly

In [None]:
# Run the polynomial regression for degrees 2, 3 and 4, keeping each set of
# rounded test predictions for the plots that follow.
poly_runs = {}
for degree_word, degree in (("TWO", 2), ("THREE", 3), ("FOUR", 4)):
    print(f"USING DEGREE {degree_word}")
    poly_runs[degree] = PolynomialRegression(degree, x, y, random_row)
    print("-------------------------------")

deg2, deg3, deg4 = poly_runs[2], poly_runs[3], poly_runs[4]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# One panel per polynomial degree: actual classes vs. rounded predictions.
plt.figure(figsize=(20,5))

actual_positions = np.arange(len(y_test))

# Each panel plots the predictions that match its title. The previous version
# drew deg3 a second time under the "deg4" title.
poly_panels = (('deg2', deg2), ('deg3', deg3), ('deg4', deg4))
for panel_number, (degree_label, degree_predictions) in enumerate(poly_panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.scatter(actual_positions, y_test, label='Actual', marker='o', c='r', s=40)
    plt.scatter(np.arange(len(degree_predictions)), degree_predictions, label='Predicted', marker='x', c='g', s=30)
    plt.title(f'Actual vs. Predicted for poly regression {degree_label}')
    plt.xlabel('Sample Index')
    plt.ylabel('Final Grade Class')
    plt.legend()

plt.show()

## SVM

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [22]:
# Fit the scaler on the training rows only, so that the test rows do not leak
# into the mean/variance used for scaling. The same test_size and random_state
# as the split applied to `scaled` afterwards are used here, so the same rows
# end up in the training part.
train_rows, holdout_rows = train_test_split(x, test_size=0.2, random_state=52)

scaler = StandardScaler()
scaler.fit(train_rows)

# Scale every row with the statistics learned from the training rows.
scaled = scaler.transform(x)

In [23]:
# 80/20 split of the scaled features, seeded for repeatability.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    scaled,
    y,
    test_size=0.2,
    random_state=52,
)


In [24]:
# Linear-kernel support vector classifier on the scaled split.
# NOTE(review): gamma presumably has no effect with a linear kernel — confirm
# against the SVC documentation before removing it.
svc = SVC(
    kernel="linear",
    C=100,
    random_state=42,
    gamma=1,
)
svc.fit(X_train1, y_train1)
svcpred = svc.predict(X_test1)

In [None]:
# Compare against y_test1, the labels that belong to X_test1. The previous
# version used y_test left over from the regression section, which only
# matches while both splits use identical settings.
plt.scatter(np.arange(len(y_test1)), y_test1, label='Actual', marker='o', c='r', s=40)
plt.scatter(np.arange(len(svcpred)), svcpred, label='Predicted', marker='x', c='g', s=30)

plt.title('Actual vs. Predicted for Support Vector (Linear)')
plt.xlabel('Sample Index')
plt.ylabel('Final Grade Class')
plt.legend()
plt.show()

In [26]:
# Sigmoid-kernel support vector classifier on the scaled split.
svc1 = SVC(
    kernel="sigmoid",
    C=100,
    random_state=42,
    gamma=1,
)
svc1.fit(X_train1, y_train1)
svcpred1 = svc1.predict(X_test1)

In [None]:
# Compare against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
plt.scatter(np.arange(len(y_test1)), y_test1, label='Actual', marker='o', c='r', s=40)
plt.scatter(np.arange(len(svcpred1)), svcpred1, label='Predicted', marker='x', c='g', s=30)

plt.title('Actual vs. Predicted for Support Vector (Sigmoid)')
plt.xlabel('Sample Index')
plt.ylabel('Final Grade Class')
plt.legend()
plt.show()

In [28]:
# RBF-kernel support vector classifier on the scaled split.
svc2 = SVC(
    kernel="rbf",
    C=100,
    random_state=42,
    gamma=1,
)
svc2.fit(X_train1, y_train1)
svcpred2 = svc2.predict(X_test1)

In [None]:
# Compare against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
plt.scatter(np.arange(len(y_test1)), y_test1, label='Actual', marker='o', c='r', s=40)
plt.scatter(np.arange(len(svcpred2)), svcpred2, label='Predicted', marker='x', c='g', s=30)

plt.title('Actual vs. Predicted for Support Vector (rbf)')
plt.xlabel('Sample Index')
plt.ylabel('Final Grade Class')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of each SVM variant (linear, sigmoid, rbf) on the
# integer class codes, printed one per line in that order.
mse, mse1, mse2 = (
    mean_squared_error(y_test1, svm_predictions)
    for svm_predictions in (svcpred, svcpred1, svcpred2)
)
print(mse, mse1, mse2, sep='\n')


In [None]:
# Accuracy of each SVM variant (linear, sigmoid, rbf), printed one per line
# in that order.
Score_svm, Score_svm1, Score_svm2 = (
    accuracy_score(y_test1, svm_predictions)
    for svm_predictions in (svcpred, svcpred1, svcpred2)
)
print(Score_svm, Score_svm1, Score_svm2, sep='\n')


In [None]:
print('Support Vector Classifier Linear' + '\n')
# Score against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
print(classification_report(y_test1,svcpred))
print('\n')

In [None]:
print('Support Vector Classifier Sigmoid' + '\n')
# Score against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
print(classification_report(y_test1,svcpred1))

print('\n')

In [None]:
print('Support Vector Classifier Radial' + '\n')
# Score against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
print(classification_report(y_test1,svcpred2))

print('\n')

In [35]:
# Decision tree classifier on the scaled split; the seed makes the fitted
# tree repeatable.
dt = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=1,
)
dt.fit(X_train1, y_train1)
dtpred = dt.predict(X_test1)


In [None]:
# Compare against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
plt.scatter(np.arange(len(y_test1)), y_test1, label='Actual', marker='o', c='r', s=40)
plt.scatter(np.arange(len(dtpred)), dtpred, label='Predicted', marker='x', c='g', s=30)

plt.title('Actual vs. Predicted for Decision Tree Classifier ')
plt.xlabel('Sample Index')
plt.ylabel('Final Grade Class')
plt.legend()
plt.show()

In [None]:
# Decision-tree error (on integer class codes) and accuracy, one per line.
mse_dt = mean_squared_error(y_test1, dtpred)
acc_dt = accuracy_score(y_test1, dtpred)
print(mse_dt, acc_dt, sep='\n')

In [None]:
# Heading spelling corrected ("Desicion" -> "Decision").
print('Decision Tree Classifier' + '\n')
# Score against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
print(classification_report(y_test1,dtpred))

print('\n')

print('Confusion matrix')
sns.heatmap(confusion_matrix(y_test1,dtpred),cmap='Greens_r',annot=True,fmt='g')

In [39]:
# 3-nearest-neighbours classifier on the scaled split.
# NOTE(review): p=10 is an unusual choice for the distance parameter — confirm
# it is intentional.
knn = KNeighborsClassifier(
    n_neighbors=3,
    p=10,
)
knn.fit(X_train1, y_train1)
knnpred = knn.predict(X_test1)

In [None]:
# Compare against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
plt.scatter(np.arange(len(y_test1)), y_test1, label='Actual', marker='o', c='r', s=40)
plt.scatter(np.arange(len(knnpred)), knnpred, label='Predicted', marker='x', c='g', s=30)

plt.title('Actual vs. Predicted for KNN ')
plt.xlabel('Sample Index')
plt.ylabel('Final Grade Class')
plt.legend()
plt.show()

In [None]:
# KNN error (on integer class codes) and accuracy, one per line.
mse_knn = mean_squared_error(y_test1, knnpred)
accuracy_knn = accuracy_score(y_test1, knnpred)
print(mse_knn, accuracy_knn, sep='\n')

In [None]:
print('K Nearest Neighbours' + '\n')
# Score against y_test1, the labels that belong to X_test1, rather than
# y_test left over from the regression section.
print(classification_report(y_test1,knnpred))

print('\n')

print('Confusion matrix')
sns.heatmap(confusion_matrix(y_test1,knnpred),cmap='Oranges_r',annot=True,fmt='g')