# Linear Regression

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import scipy.stats as stats


In [None]:
import kagglehub

# Location of the Student Performance CSV. This is a fixed Kaggle input path;
# nothing is downloaded in this cell.
# NOTE(review): kagglehub is imported but never called here - confirm it is still needed.
path = '/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv'

print("Path to dataset files:", path)

## Data Exploration and Preprocessing

In [None]:
import pandas as pd
# Load the CSV found at `path` into a DataFrame and preview its first 10 rows.
df=pd.read_csv(path)
df.head(10)

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape


**A description of the data, to check whether there are outliers or any unusual tendencies in the data**

In [None]:
# Summary statistics of the numeric columns, used to spot suspicious ranges.
df.describe()


In [None]:
# Column names, dtypes and non-null counts.
df.info()


In [None]:
# Check for missing values and duplicated rows, then remove the duplicates.
# The first count is the total number of null cells across the whole frame.
print(f"Number of nulls: {df.isnull().sum().sum()}")

print(f"Number of duplicates: {df.duplicated().sum()}")

# drop_duplicates() returns a new frame; the surviving rows keep their original index labels.
df = df.drop_duplicates()
print(f"Number of duplicates after dropping: {df.duplicated().sum()}")


In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Encode the categorical column, because linear regression needs numerical features.
# LabelEncoder assigns an integer index to each unique category of the column.
le = LabelEncoder()
df['Extracurricular Activities'] = le.fit_transform(df['Extracurricular Activities'])

# `le` keeps the fitted category-to-integer mapping for this column.


In [None]:
# Correlation matrix of the numeric columns, to inspect the correlations and
# dependencies between features. If two features are highly correlated with each
# other, they are redundant and one of them can be removed.
correlation_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix for Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Sleep Hours, Sample Question Papers Practiced and Extracurricular Activities are not well correlated with the Performance Index, so they are candidates for removal. The drop below is left commented out, so all features are currently kept.
#df.drop(columns=['Extracurricular Activities','Sleep Hours','Sample Question Papers Practiced'], inplace=True) 


## Split the data and Normalization

In [None]:
# Features and target
y = df['Performance Index']
X = df.drop('Performance Index', axis=1)

# Split the data FIRST, so that the held-out rows never influence the scaler.
# The same random_state and row count select the same rows as splitting after scaling.
from sklearn.model_selection import train_test_split as tts
X_train_raw, X_test_raw, y_train, y_test = tts(X, y, test_size=0.1, random_state=42)

# Standardize the features so the model learns more easily and no feature
# overpowers the others purely because of its scale. The mean and standard
# deviation are learned from the training rows only and then re-used for the
# test rows; fitting on the full dataset would leak test information.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,  # keep the row labels aligned with y_train
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,  # keep the row labels aligned with y_test
)

# Kept for backward compatibility: every row scaled with the training statistics.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


## Linear Regression Models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Simple linear regression: predict the Performance Index from one feature only.
feature = 'Previous Scores'
# Double brackets keep the selection two-dimensional (a one-column DataFrame).
X_single_train = X_train[[feature]]
X_single_test = X_test[[feature]]

lr_single = LinearRegression()  # create the model
lr_single.fit(X_single_train, y_train)  # fit/train the model on the training data

y_pred_single = lr_single.predict(X_single_test)  # predict on the unseen test set


# Plot the actual test targets (points) against the fitted line.
# The x-axis shows the scaled feature values, not the raw scores.
plt.scatter(X_single_test, y_test, color='blue', label='Actual')
plt.plot(X_single_test, y_pred_single, color='red', label='Predicted')
plt.title('Single Linear Regression')
plt.xlabel(feature)
plt.ylabel('Performance Index')
plt.legend()
plt.grid(True)
plt.show()

# Metrics computed on the test set
r2_single = r2_score(y_test, y_pred_single)
mse_single = mean_squared_error(y_test, y_pred_single)
print(f"Single Linear Regression - R² Score: {r2_single:.4f}")
print(f"Single Linear Regression - Mean Squared Error: {mse_single:.4f}")


In [None]:
# Multiple linear regression: fit on all features at once.
lr_multi = LinearRegression()
lr_multi.fit(X_train, y_train)

y_pred_multi = lr_multi.predict(X_test)

# Plot predicted against actual values; the red diagonal marks perfect
# predictions (predicted == actual), so points close to it are good predictions.
plt.scatter(y_test, y_pred_multi, alpha=0.5, color='green')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
plt.title('Multiple Linear Regression')
plt.xlabel('Actual Performance Index')
plt.ylabel('Predicted Performance Index')
plt.grid(True)
plt.show()

# Metrics computed on the test set
r2_multi = r2_score(y_test, y_pred_multi)
mse_multi = mean_squared_error(y_test, y_pred_multi)
print(f"Multiple Linear Regression - R² Score: {r2_multi:.4f}")
print(f"Multiple Linear Regression - Mean Squared Error: {mse_multi:.4f}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

features = X_train.columns

# Pair every feature with the weight the multiple regression learned for it,
# then order the table from the largest coefficient to the smallest.
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': lr_multi.coef_})
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

# Horizontal bar chart of the coefficients. The features were standardized
# before fitting, so the coefficient values can be compared with each other.
plt.figure(figsize=(10, 6))
sns.barplot(data=coefficients, x='Coefficient', y='Feature', palette='viridis')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature")
plt.title("Feature Importance in Multiple Linear Regression")
plt.grid(True)
plt.tight_layout()
plt.show()

# This visualizes the feature importance, to be compared with what the
# correlation matrix showed earlier.

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Polynomial regression on all features for several degrees.
# `results` maps each degree to its test predictions and scores.
results = {}
degrees = [2, 3, 10]

# Loop through degrees
for degree in degrees:
    # Polynomial transformation: the expansion is fitted on the training
    # features and then applied unchanged to the test features.
    poly = PolynomialFeatures(degree=degree)
    X_poly_train = poly.fit_transform(X_train)
    X_poly_test = poly.transform(X_test)

    # Train model
    model = LinearRegression()
    model.fit(X_poly_train, y_train)

    # Predict on the unseen test set
    y_pred = model.predict(X_poly_test)

    # Evaluate on the test set
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Also score the training set: a train score far above the test score is
    # the signature of overfitting.
    train_r2 = r2_score(y_train, model.predict(X_poly_train))

    # Save results
    results[degree] = {
        'y_pred': y_pred,
        'mse': mse,
        'r2': r2,
        'train_r2': train_r2,
    }

# Plot one predicted-vs-actual panel per degree. The number of panels follows
# len(degrees), so editing the list above cannot break the indexing below.
# squeeze=False guarantees an array of axes even for a single degree.
fig, axes = plt.subplots(1, len(degrees), figsize=(6 * len(degrees), 5),
                         sharey=True, squeeze=False)
axes = axes.ravel()
for idx, degree in enumerate(degrees):
    ax = axes[idx]
    y_pred = results[degree]['y_pred']

    ax.scatter(y_test, y_pred, alpha=0.6, label=f'Degree {degree}', color=f'C{idx}')
    # Dashed diagonal = perfect prediction (predicted == actual)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
    ax.set_title(f'Polynomial Degree {degree}')
    ax.set_xlabel('Actual Performance Index')
    if idx == 0:
        # The y-axis is shared, so only the first panel needs the label
        ax.set_ylabel('Predicted Performance Index')
    ax.legend()
    ax.grid(True)

plt.suptitle('Polynomial Regression with All Features', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave room for the suptitle
plt.show()

# Print scores
for degree in degrees:
    print(f"Degree {degree}:")
    print(f"  R² Score: {results[degree]['r2']:.4f}")
    print(f"  Mean Squared Error: {results[degree]['mse']:.4f}")
    print(f"  Train R² Score: {results[degree]['train_r2']:.4f}")
    print()


* At first, increasing the model complexity (e.g., increasing the degree of polynomial features, using a more complex model like a higher-order polynomial regression or a more powerful machine learning algorithm) can improve the performance because the model is better able to capture the underlying patterns in the data.
* However, as the model becomes more complex, it may start to capture not just the true underlying patterns but also the noise in the training data. This means that the model fits the specific characteristics of the training data too closely and doesn't generalize well to unseen data. This leads to overfitting.

* Overfitting occurs when a model performs well on training data but poorly on test data (or new, unseen data). This is because the model has learned to model the training data too well, including noise and outliers, which don't generalize to new data.

# Logistic Regression

## imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# Load the Student Depression dataset from its Kaggle input path and preview the first rows.
data = pd.read_csv('/kaggle/input/student-depression-dataset/Student Depression Dataset.csv')
data.head()

## Exploring and preprocessing of the data

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
data.info()
# Summary statistics of the numeric columns.
print(data.describe())

In [None]:
# Report duplicated rows and the number of missing values per column.
print("Count Duplicates :" , data.duplicated().sum())
print("Count null values:")
print(data.isnull().sum())

# Visualize where the missing values are BEFORE removing them; drawn after
# dropna() the heatmap could only ever show an empty (all-False) mask.
sns.heatmap(data.isnull())
plt.show()

# Remove every row that contains a missing value, then confirm none remain.
data.dropna(inplace=True)
print(data.isnull().sum())

In [None]:
data.drop(["id"], axis=1, inplace=True)  # Remove irrelevant column
data.dropna(inplace=True)  # drop rows that contain missing values
# Duplicates are removed after dropping "id", so rows that differed only by
# their id are treated as duplicates as well.
data.drop_duplicates(inplace=True)
data

In [None]:
data['Depression'].value_counts()   # the classes are not balanced, so we should either downsample or upsample
# Author's notes (translated from the original transliterated Arabic):
# - downsampling was tried and gave about 83% accuracy;
# - upsampling gave about 85% accuracy with logistic regression, and it made a
#   difference for KNN, which reached about 77% accuracy - so a win is a win.



In [None]:
# Treat the target as a categorical column and plot how many rows each class has.
data['Depression'] = data['Depression'].astype('category',copy=False)
colors = ['purple', 'red']
# NOTE(review): the colors are applied in bar order, which follows value_counts()
# rather than the 0/1 order written in the title - confirm the intended mapping.
plot = data['Depression'].value_counts().plot(kind='bar', title="Class distributions \n(0: Not Depressed | 1: Depressed)", color=colors)
fig = plot.get_figure()

In [None]:
from sklearn.utils import resample

# Balance the classes by upsampling.
# NOTE(review): this runs before the train/test split, so copies of the same
# minority row can land in both the training and the test set and inflate the
# test scores - consider resampling the training portion only.

# Separate classes (class 1 is treated as the majority, class 0 as the minority)
majority = data[data['Depression'] == 1]
minority = data[data['Depression'] == 0]

# Upsample minority class: draw rows with replacement until it has as many
# rows as the majority class
minority_upsampled = resample(minority,
                              replace=True,
                              n_samples=len(majority),
                              random_state=42)

# Combine with majority class
data_balanced = pd.concat([majority, minority_upsampled])

# Shuffle the result and rebuild a clean 0..n-1 index
data = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check balance
print(data['Depression'].value_counts())


In [None]:
# Re-plot the class distribution, this time on the resampled data.
data['Depression'] = data['Depression'].astype('category',copy=False)
colors = ['purple', 'red']
plot = data['Depression'].value_counts().plot(kind='bar', title="Class distributions \n(0: Not Depressed | 1: Depressed)", color=colors)
fig = plot.get_figure()

In [None]:
# Correlation matrix restricted to the numeric columns of the dataset
numeric_data = data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()

# Render the matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Author's note (translated): either Work Pressure or Job Satisfaction is to be
# removed, as the two are highly correlated and therefore redundant.
# Work Pressure is the one dropped here.
data.drop(columns=['Work Pressure'], inplace=True)


In [None]:
# One box plot per numerical feature, split by Depression status.
plt.figure(figsize=(15, 10))
numerical_features = ['Academic Pressure', 'Age', 'CGPA', 'Financial Stress', 'Work/Study Hours','Study Satisfaction']
# enumerate starts at 1 because plt.subplot positions are 1-based;
# the six features fill the first slots of a 3x3 grid.
for i, feature in enumerate(numerical_features, 1):
    plt.subplot(3, 3, i)
    sns.boxplot(x='Depression', y=feature, data=data, palette=colors)
    plt.title(f'{feature} Distribution by Depression Status')
plt.tight_layout()
plt.show()

In [None]:
# Count the outliers of each numerical feature using the 1.5 * IQR rule.
numerical_features = ['Academic Pressure', 'Age', 'CGPA', 'Financial Stress', 'Work/Study Hours','Study Satisfaction']

for feature in numerical_features:
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3 - Q1
    # Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are counted as outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = data[(data[feature] < lower_bound) | (data[feature] > upper_bound)]
    print(f"{feature}: {len(outliers)} outliers")
# Author's note (translated): there are outliers, but not many, and they also
# carry information, so they are not removed.
# Extreme values (e.g. very high academic pressure or very low CGPA) could be early indicators or strong correlates of depression.
# Removing these points might bias the model toward average cases and reduce its ability to detect serious situations.


In [None]:
from sklearn.preprocessing import LabelEncoder

# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that only adds a stray "None" to the output).
data.info()

# Drop the rows whose value is the catch-all 'Other' in any of these columns,
# as they provide no information.
other_columns = ["City", "Sleep Duration", "Dietary Habits", "Degree"]
for column in other_columns:
    data.drop(data[data[column] == 'Other'].index, inplace=True)

# Encode every categorical column as integers. Each column gets its OWN
# encoder, stored in `label_encoders`, so the category <-> integer mapping of
# every column stays available (e.g. for inverse_transform); a single shared
# encoder would only remember the last column it was fitted on.
categorical_columns = [
    "City",
    "Depression",
    "Gender",
    "Sleep Duration",
    "Dietary Habits",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
    "Degree",
]
label_encoders = {}
for column in categorical_columns:
    # After the loop `labelencoder` is the encoder fitted on "Degree", the
    # last column in the list.
    labelencoder = LabelEncoder()
    data[column] = labelencoder.fit_transform(data[column])
    label_encoders[column] = labelencoder

data.drop(['Profession'], axis=1, inplace=True)

# Preview the encoded frame; as the last expression of the cell it is
# actually displayed.
data.head()


## Splitting and normalization

In [None]:
# Prepare the model
y = data["Depression"] # our target variable
X = data.drop(["Depression"], axis=1) # our predictors

In [None]:
# Create a scaler object
scaler = StandardScaler()

# Fit the scaler to the data and transform the data
X_scaled = data[['Academic Pressure','Age','CGPA','Financial Stress','Work/Study Hours','Job Satisfaction','Study Satisfaction']]
X_scaled = scaler.fit_transform(data[['Academic Pressure','Age','CGPA','Financial Stress','Work/Study Hours','Job Satisfaction','Study Satisfaction']])
# X_scaled is now a numpy array with normalized data

In [None]:
# Re-check column dtypes and non-null counts after the preprocessing steps.
data.info()

In [None]:
# Hold out 10% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Standardize the continuous columns so that distance- and gradient-based
# models are not dominated by the features with the largest ranges. The scaler
# is fitted on the training rows only and re-used for the test rows, so no
# test-set statistics leak into training.
scaled_columns = ['Academic Pressure', 'Age', 'CGPA', 'Financial Stress',
                  'Work/Study Hours', 'Job Satisfaction', 'Study Satisfaction']
scaler = StandardScaler()
# Work on explicit copies so the assignments below never write into X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

## Classification models and Comparing between them

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


# The classifiers to compare, keyed by the display name used in the reports.
models = {
    # max_iter is raised above the default of 100 so the solver has enough
    # iterations to converge instead of stopping early with a ConvergenceWarning.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier()}


# Fit every model on the training set and report precision, recall and
# F1-score on the held-out test set.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"--- {name} ---")
    print(classification_report(y_test, y_pred))
    print("\n")


In [None]:
#confusion matrices for each model
for name, model in models.items():
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
    disp.plot()
    plt.title(f"Confusion Matrix - {name}")
    plt.show()
