In [None]:
%pip install numpy pandas matplotlib scikit-learn seaborn scipy statsmodels

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [None]:
# Location of the student dataset (raw CSV served from GitHub).
DATA_PATH = 'https://raw.githubusercontent.com/RFUNN/Lab/refs/heads/main/student_data.csv'

# Read the CSV into a DataFrame; later cells transform `data` step by step.
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first 20 rows of the freshly loaded data.
data.head(20)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the data.
data.describe()

In [None]:
# Count missing values in every column.
data.isnull().sum()

In [None]:
# fillna returns a new Series and leaves `data` untouched, so the result has
# to be assigned back; otherwise the missing values stay in the frame.
data["parental_education_level"] = data["parental_education_level"].fillna('No Data')

In [None]:
# Recount missing values per column to check the effect of the fill step.
data.isnull().sum()

In [None]:
# Look at the first 20 rows again after the fill step.
data.head(20)

In [None]:
# Columns that are not used in the analysis below.
unused_columns = [
    'student_id',
    'age',
    'online_movie_view_hours',
    'attendance_percentage',
    'sleep_hours',
    'diet_quality',
    'exercise_frequency',
    'internet_quality',
    'extracurricular_participation',
]

# Keep only the remaining columns; raises KeyError if any name is missing.
data = data.drop(columns=unused_columns)

In [None]:
# Check which columns are left after the drop.
data.head(5)

In [None]:


# One encoder instance is refitted for each column in turn, so after the loop
# `le` only holds the classes of the last column that was encoded.
le = LabelEncoder()

# Replace the values of each listed column with integer codes.
# NOTE(review): if mental_health_rating is already numeric, the codes produced
# here may differ from the raw ratings - confirm against the CSV.
for column in ('gender', 'mental_health_rating', 'part_time_job', 'parental_education_level'):
    data[column] = le.fit_transform(data[column])

In [None]:
# Inspect the first 20 rows after label encoding.
data.head(20)

In [None]:
# Class codes for the three exam-score bands.
A = 0
B = 1
C = 2

# Bin the score in a single pass computed from the raw values:
#   score <= 65       -> A
#   65 < score <= 80  -> B
#   score > 80        -> C
# Every condition is evaluated on the original scores, so the result does not
# depend on the class codes happening to lie below the thresholds (the previous
# sequence of in-place relabelling steps did, and repeated the first step twice).
# NOTE: the cell overwrites the raw scores; reload `data` before re-running it.
raw_score = data['exam_score']
data['exam_score'] = np.select(
    [raw_score <= 65, (raw_score > 65) & (raw_score <= 80), raw_score > 80],
    [A, B, C],
    default=np.nan,  # a missing score matches no band and stays missing
).astype(float)

In [None]:
# Check the exam_score column after it was replaced by class codes.
data.head(5)

In [None]:
# Pairwise correlations between all remaining columns, drawn as a heatmap.
correlation_matrix = data.corr()
heatmap_options = dict(annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()


По таблице корреляции видно, что характеристика *exam_score* не коррелирует с *parental_education_level*, но коррелирует с *study_hours_per_day*

In [None]:
# The plotting context only affects figures created after it is set, so it has
# to come before the pairplot; set afterwards, the large axis labels would only
# show up when the cell is run a second time.
sns.set_context("paper", rc={"axes.labelsize": 26})

# Scatter plots of exam_score against each selected feature.
g = sns.pairplot(
    data,
    x_vars=["study_hours_per_day", "social_media_hours", 'parental_education_level', 'gender', 'mental_health_rating'],
    y_vars=["exam_score"],
)

# Show tick labels on every panel, not only on the outer ones.
for ax in g.axes.flat:
    ax.tick_params(axis='both', labelleft=True, labelbottom=True)

g.fig.set_size_inches(18, 7)
plt.subplots_adjust(wspace=0.5, hspace=0.3)

plt.show()

In [None]:
# Box plot with one box per column of `data`.
fig, ax = plt.subplots()
ax.boxplot(data)
# Name the boxes after their columns; by default they are only numbered 1..n.
ax.set_xticklabels(data.columns, rotation=45, ha='right')
ax.set_ylabel('Значения')
ax.set_title('Ящик с усами')
plt.show()

In [None]:
# Box plots of the two hour-based features before outlier removal.
boxplot_columns = ['study_hours_per_day', 'social_media_hours']

fig, ax = plt.subplots()
ax.boxplot([data[column] for column in boxplot_columns])
# Name the boxes after their columns; by default they are only numbered 1..n.
ax.set_xticklabels(boxplot_columns)
ax.set_ylabel('Значения')
ax.set_title('Ящик с усами')
plt.show()

In [None]:
def detect_outliers_iqr(data):
    """Flag values outside the 1.5 * IQR fences.

    Parameters
    ----------
    data : object with a ``quantile`` method and element-wise comparisons
        (in this notebook a pandas Series).

    Returns
    -------
    Boolean mask, True where a value is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``.
    """
    first_quartile, third_quartile = data.quantile(0.25), data.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = data < first_quartile - 1.5 * spread
    too_high = data > third_quartile + 1.5 * spread
    return too_low | too_high

# Boolean mask of rows whose study_hours_per_day lies outside the IQR fences.
outliers = detect_outliers_iqr(data['study_hours_per_day'])

In [None]:
# Keep only the rows that are not flagged in `outliers`.
data_cleaned = data[~outliers]

In [None]:
# Second pass: flag social_media_hours outliers among the rows that are left
# and drop them as well. Note that `outliers` now holds this second mask.
outliers = detect_outliers_iqr(data_cleaned.loc[:, 'social_media_hours'])
data_cleaned = data_cleaned.loc[~outliers]

In [None]:
# A bare expression is only rendered when it is the last statement of a cell,
# so the length check is printed first and the preview comes last.
print(len(data_cleaned['gender']), len(data_cleaned['study_hours_per_day']))
data_cleaned.head(10)

In [None]:
# Box plots of the two hour-based features after outlier removal.
boxplot_columns = ['study_hours_per_day', 'social_media_hours']

fig, ax = plt.subplots()
ax.boxplot([data_cleaned[column] for column in boxplot_columns])
# Name the boxes after their columns; by default they are only numbered 1..n.
ax.set_xticklabels(boxplot_columns)
ax.set_ylabel('Значения')
ax.set_title('Ящик с усами')
plt.show()

In [None]:
# Correlation of every column with exam_score on the outlier-free data.
correlation_matrix = data_cleaned.corr()
exam_score_correlations = correlation_matrix['exam_score']
print(exam_score_correlations)

In [None]:
def cost_function(X, y, theta):
    """Half mean squared error of the linear model ``X . theta`` against ``y``.

    Returns a tuple ``(cost, error)`` where ``error`` is the residual vector
    ``X . theta - y`` and ``cost`` is ``error . error / (2 * m)`` with
    ``m = y.size``.
    """
    sample_count = y.size
    residual = np.dot(X, theta.T) - y
    half_mean_factor = 1/(2*sample_count)
    return half_mean_factor * np.dot(residual.T, residual), residual

In [None]:
def gradient_descent(X, y, theta, alpha, iters):
    """Run ``iters`` steps of batch gradient descent starting from ``theta``.

    Each step evaluates ``cost_function`` at the current ``theta`` and moves
    ``theta`` against the gradient ``X.T . error / m`` scaled by ``alpha``.

    Returns a tuple ``(theta, cost_array)``; ``cost_array[i]`` is the cost
    measured before the i-th update.
    """
    sample_count = y.size
    cost_history = np.zeros(iters)
    for step in range(iters):
        cost_history[step], residual = cost_function(X, y, theta)
        theta = theta - (alpha * (1/sample_count) * np.dot(X.T, residual))
    return theta, cost_history


In [None]:
def plotChart(iterations, cost_num):
    """Plot the cost recorded at each iteration as a red line.

    Parameters
    ----------
    iterations : int
        Number of iterations; the x axis runs over ``0 .. iterations - 1``.
    cost_num : sequence of float
        Cost values, one per iteration.
    """
    # The style has to be active while the figure is built. Calling
    # plt.style.use after plotting does not restyle the existing figure and
    # changes the style of every later plot in the notebook instead.
    with plt.style.context('fivethirtyeight'):
        fig, ax = plt.subplots()
        ax.plot(np.arange(iterations), cost_num, 'r')
        ax.set_xlabel('Iterations')
        ax.set_ylabel('Cost')
        ax.set_title('Error vs Iterations')
        plt.show()

# Standardise every column of `data` to zero mean and unit standard deviation.
# NOTE(review): the cells below build X and y from `data_cleaned`, not from
# `data`, so this normalised frame does not seem to be used - confirm intent.
column_means = data.mean()
column_stds = data.std()
data = data.sub(column_means).div(column_stds)
data.head()

In [None]:
# Split the outlier-free data into the feature matrix X and the target y.
feature_columns = ['study_hours_per_day', 'social_media_hours', 'mental_health_rating']
X = data_cleaned.loc[:, feature_columns]
y = data_cleaned.loc[:, 'exam_score']
y

In [None]:
# Fit an ordinary least-squares model; fit() returns the estimator itself,
# which is shown as the cell output.
model = LinearRegression().fit(X, y)
model

In [None]:
# Fitted feature weights and intercept; the gradient-descent cells below use
# them as the reference solution.
model.coef_, model.intercept_

In [None]:
# Manual prediction for one student: weighted sum of the features plus the
# intercept. Values are given in the same order as the columns of X.
student_data = [2.9, 1.0, 6]
weighted_features = model.coef_ * student_data
weighted_features.sum() + model.intercept_

In [None]:
# Work on an explicit copy: pd.DataFrame(X) may share its data with X, so
# adding columns to it can trigger SettingWithCopyWarning or, on older pandas,
# change X itself before X.dot(...) below is evaluated.
df = X.copy()
df['y'] = y
# Model prediction rounded to the nearest class code.
df['pred_fit'] = [round(el) for el in model.predict(X)]
# The same prediction computed by hand from the coefficients, left unrounded.
df['pred_dot'] = X.dot(model.coef_) + model.intercept_

df.head()

In [None]:
# Residual of the rounded prediction: predicted class code minus actual one.
df['residural'] = df['pred_fit'].sub(df['y'])
df.head(20)

In [None]:
# MSE of the rounded predictions (mean of the squared residuals).
df['residural'].pow(2).mean()

In [None]:
# Append a column of ones so the intercept is learned as the last weight.
# Guarded so that re-running the cell does not append a second ones column:
# after the first run X is already a NumPy array, not a DataFrame.
if isinstance(X, pd.DataFrame):
    X = np.column_stack([X, np.ones(len(X))])
X

$$MSE = \frac1n \sum_{i = 1}^{n} (y_i - X_i w)^2 $$

$$ \frac{\partial MSE}{\partial w} = -\frac{2}{n} X^{\top} (y - Xw) $$

In [None]:
def ErrorMSE(X, w, y):
    """Mean squared error of the linear prediction ``X @ w`` against ``y``."""
    predictions = X @ w
    squared_residuals = (y - predictions)**2
    return np.sum(squared_residuals) / len(predictions)

def GradientErrorMSE(X, w, y):
    """Gradient of the mean squared error with respect to the weights ``w``.

    Computed as ``(2 / n) * (y - X @ w) @ (-X)`` with ``n = len(X)``.
    """
    residual = y - X @ w
    scaled_residual = 2 / len(X) * residual
    return scaled_residual @ (-X)

In [None]:
# Start gradient descent from an all-zero weight vector, one weight per column of X.
weights = np.zeros(X.shape[1])
weights # 3 for the features, 1 for the intercept

In [None]:
# Stop once a step moves the weights by no more than this (L2 norm).
dWeights = 0.00001

learning_rate = 0.005

next_weights = weights

# Upper bound on the number of iterations.
N = 100000

# Progress is printed only every LOG_EVERY iterations (plus the final one);
# printing all of up to N iterations floods the notebook output.
LOG_EVERY = 1000

for i in range(N):
    cur_weights = next_weights
    # Evaluate the gradient once and reuse it for both the step and the log.
    gradient = GradientErrorMSE(X, cur_weights, y)
    next_weights = cur_weights - learning_rate * gradient

    converged = np.linalg.norm(cur_weights - next_weights, ord=2) <= dWeights

    if i % LOG_EVERY == 0 or converged or i == N - 1:
        print(gradient)
        print(f"Iter: {i}, Текущая точка: {cur_weights}, Следующая точка: {next_weights}")
        print(f"Целевая точка:{model.coef_} {model.intercept_}")
        print(f"MSE: {ErrorMSE(X, cur_weights, y)}\n")

    if converged:
        break

In [None]:
# Predict the class code for one student with the weights found by gradient
# descent: the last weight is the intercept, the others multiply the features.
student_data = [2.9, 1.0, 6]
feature_weights, bias = next_weights[:-1], next_weights[-1]
score_pred = np.sum(feature_weights * student_data) + bias

EXAM_SCORES = [A, B, C]
# Class code this student is expected to get.
EXAM_STUD = B

report_lines = (
    f"Ожидание: {EXAM_STUD}",
    f"Предсказание: {score_pred}",
    f"Модель: {next_weights}",
)
for report_line in report_lines:
    print(report_line)

https://www.geeksforgeeks.org/machine-learning/interpreting-the-results-of-linear-regression-using-ols-summary/

https://ru.python-3.com/?p=3622

https://colab.research.google.com/drive/1Sh7SStUHCLS2Fiz0nrHJ3icGUIofPTH-?invite=CJDm5osD#scrollTo=qfUy8CGd2VVF

https://habr.com/ru/companies/otus/articles/752434/

https://habr.com/ru/articles/684580/

https://madewithml.com/courses/foundations/pandas/

https://www.geeksforgeeks.org/python/python-pandas-dataframe-fillna-to-replace-null-values-in-dataframe/

https://www.geeksforgeeks.org/python/how-to-delete-only-one-row-in-csv-with-python/

https://www.geeksforgeeks.org/data-analysis/working-with-missing-data-in-pandas/

https://www.geeksforgeeks.org/data-analysis/how-to-replace-values-in-column-based-on-condition-in-pandas/

https://youtu.be/KJA9A1q9l7E?si=1c54m7-qchea48O0