In [46]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib

In [47]:
import os
# Sanity check of the working directory: the relative paths used when reading
# the CSV and when saving the model are resolved against it.
os.getcwd()

'/Users/mschonhofer/PycharmProjects/RP2/backend/app'

In [48]:
# Read the raw dataset; the location is relative to the current working directory.
data_path = "../../data/Data.csv"
df = pd.read_csv(data_path)

# Preview the first rows.
df.head()

Unnamed: 0,Department,Gender,HSC,SSC,Income,Hometown,Computer,Preparation,Gaming,Attendance,Job,English,Extra,Semester,Last,Overall
0,Business Administration,Male,4.17,4.84,"Low (Below 15,000)",Village,3,More than 3 Hours,0-1 Hour,80%-100%,No,3,Yes,6th,3.22,3.35
1,Business Administration,Female,4.92,5.0,"Upper middle (30,000-50,000)",City,3,0-1 Hour,0-1 Hour,80%-100%,No,3,Yes,7th,3.467,3.467
2,Business Administration,Male,5.0,4.83,"Lower middle (15,000-30,000)",Village,3,0-1 Hour,More than 3 Hours,80%-100%,No,4,Yes,3rd,4.0,3.72
3,Business Administration,Male,4.0,4.5,"High (Above 50,000)",City,5,More than 3 Hours,More than 3 Hours,80%-100%,No,5,Yes,4th,3.8,3.75
4,Business Administration,Female,2.19,3.17,"Lower middle (15,000-30,000)",Village,3,0-1 Hour,2-3 Hours,80%-100%,No,3,Yes,4th,3.94,3.94


In [49]:
# Column names of the loaded frame as a plain Python list.
list(df.columns)

['Department',
 'Gender',
 'HSC',
 'SSC',
 'Income',
 'Hometown',
 'Computer',
 'Preparation',
 'Gaming',
 'Attendance',
 'Job',
 'English',
 'Extra',
 'Semester',
 'Last',
 'Overall']

In [64]:
# Missing values per column.
# NOTE(review): the saved output lists derived columns (target_stem, ...,
# gender_norm), so this cell was last executed after the feature-engineering
# cells. On a fresh top-to-bottom run it only reports the raw columns.
df.isnull().sum()

Department         0
Gender             0
HSC                0
SSC                0
Income             0
Hometown           0
Computer           0
Preparation        0
Gaming             0
Attendance         0
Job                0
English            0
Extra              0
Semester           0
Last               0
Overall            0
target_stem        0
Income_clean       0
semester_num       0
semester_norm      0
academic_score     0
self_discipline    0
income_norm        0
hometown_norm      0
gender_norm        0
dtype: int64

In [51]:
# Binary target: 1 when the student's department is in the list below, else 0.
STEM_DEPARTMENTS = ["Computer Science and Engineering"]

is_stem = df["Department"].isin(STEM_DEPARTMENTS)
df["target_stem"] = is_stem.astype(int)

# Class balance of the target.
df["target_stem"].value_counts()

target_stem
1    443
0     50
Name: count, dtype: int64

In [52]:
# Normalise the income labels so they match the lookup keys exactly:
# trim the ends, turn non-breaking spaces into plain spaces, then collapse
# any run of whitespace into a single space.
income_text = df["Income"].astype(str).str.strip()
income_text = income_text.str.replace("\u00a0", " ", regex=False)
df["Income_clean"] = income_text.str.replace(r"\s+", " ", regex=True)

In [53]:
# Lookup tables that turn the categorical survey answers into numeric scores.

# Preparation time and gaming time use the same three answer buckets.
_HOUR_BUCKETS = ("0-1 Hour", "2-3 Hours", "More than 3 Hours")

# Longer preparation -> higher score.
prep_map = dict(zip(_HOUR_BUCKETS, (0.2, 0.6, 0.9)))

# Longer gaming -> lower score.
gaming_map = dict(zip(_HOUR_BUCKETS, (0.85, 0.55, 0.25)))

# Higher attendance bracket -> higher score.
attendance_map = dict(
    [
        ("Below 40%", 0.15),
        ("40%-59%", 0.4),
        ("60%-79%", 0.7),
        ("80%-100%", 0.95),
    ]
)

yesno_map = dict(Yes=1.0, No=0.0)

# Keys must match the cleaned income labels exactly (single spaces, commas kept).
income_map = dict(
    [
        ("Low (Below 15,000)", 0.2),
        ("Lower middle (15,000-30,000)", 0.45),
        ("Upper middle (30,000-50,000)", 0.7),
        ("High (Above 50,000)", 0.9),
    ]
)

hometown_map = dict(Village=0.3, Town=0.5, City=0.7, Other=0.6)

gender_map = dict(Male=1.0, Female=0.0, Other=0.5)

In [54]:
# Normalise the answer labels in place so they match the lookup keys:
# trim, replace non-breaking spaces, collapse repeated whitespace.
for col in ("Preparation", "Attendance", "Gaming", "Job", "Extra"):
    cleaned = df[col].astype(str).str.strip()
    cleaned = cleaned.str.replace("\u00a0", " ", regex=False)
    df[col] = cleaned.str.replace(r"\s+", " ", regex=True)

In [55]:
# Parse the ordinal semester label ("3rd", "6th", "10th", ...) into a number.
# The pattern captures the whole run of digits: a single `\d` would keep only
# the first digit and read "10th" as 1. `expand=False` makes `extract` return
# a Series (one capture group) instead of a one-column DataFrame.
# Labels without any digit become NaN.
df["semester_num"] = (
    df["Semester"]
    .str.extract(r"(\d+)", expand=False)
    .astype(float)
)

# Scale by the largest semester present in the data (the maximum maps to 1.0).
df["semester_norm"] = df["semester_num"] / df["semester_num"].max()

In [56]:
# Equal-weight blend of the four grade columns, rescaled so the best
# student in the data gets 1.0.
GRADE_WEIGHT = 0.25

ssc_part = GRADE_WEIGHT * df["SSC"]
hsc_part = GRADE_WEIGHT * df["HSC"]
last_part = GRADE_WEIGHT * df["Last"]
overall_part = GRADE_WEIGHT * df["Overall"]

raw_academic = ssc_part + hsc_part + last_part + overall_part
df["academic_score"] = raw_academic / raw_academic.max()

In [57]:
# Component scores looked up from the answer labels.
prep_score = df["Preparation"].map(prep_map)
attendance_score = df["Attendance"].map(attendance_map)
gaming_score = df["Gaming"].map(gaming_map)
no_job_score = 1 - df["Job"].map(yesno_map)  # having a job lowers the score
extra_score = df["Extra"].map(yesno_map)

# Weighted blend; the weights sum to 1.0.
df["self_discipline"] = (
    0.30 * prep_score
    + 0.25 * attendance_score
    + 0.20 * gaming_score
    + 0.10 * no_job_score
    + 0.15 * extra_score
)

In [58]:
# Income: labels were already cleaned into Income_clean.
df["income_norm"] = df["Income_clean"].map(income_map)

# Hometown: trim and replace non-breaking spaces, then look up the score.
hometown_text = df["Hometown"].astype(str).str.strip()
hometown_text = hometown_text.str.replace("\u00a0", " ", regex=False)
df["hometown_norm"] = hometown_text.map(hometown_map)

# Gender: same cleaning, then look up the numeric code.
gender_text = df["Gender"].astype(str).str.strip()
gender_text = gender_text.str.replace("\u00a0", " ", regex=False)
df["gender_norm"] = gender_text.map(gender_map)

In [59]:
# A non-zero count here would mean some label had no entry in its lookup table.
mapped_columns = ["income_norm", "self_discipline", "hometown_norm", "gender_norm"]
df[mapped_columns].isnull().sum()

income_norm        0
self_discipline    0
hometown_norm      0
gender_norm        0
dtype: int64

In [65]:
# Skill ratings are divided by 5 to rescale them.
computer_rating = df["Computer"].astype(float)
english_rating = df["English"].astype(float)
df["computer_skill"] = computer_rating / 5
df["english_skill"] = english_rating / 5

# If SSC/HSC/Last/Overall are on a 0-5 scale, academic_score is already fine;
# if they are on a 0-100 scale it is still fine because it is divided by its max.

# Model inputs, in the order the classifier will see them.
FEATURES = [
    # behaviour and performance
    "self_discipline",
    "academic_score",
    # self-rated skills
    "computer_skill",
    "english_skill",
    # background
    "income_norm",
    "hometown_norm",
    "gender_norm",
    # study progress
    "semester_norm",
]

y = df["target_stem"]
X = df[FEATURES]

In [66]:
# 80/20 split with a fixed seed; stratifying on y keeps the class ratio
# the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [73]:
# Standardise the features, then fit a logistic regression.
# class_weight="balanced" re-weights the classes inversely to their frequency.
scaler = StandardScaler()
classifier = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=2000,
)

pipe = Pipeline(steps=[("scaler", scaler), ("clf", classifier)])

pipe.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...), ('clf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [72]:
from sklearn.metrics import confusion_matrix

# NOTE(review): this cell's execution count (72) is lower than the fitting
# cell's (73), so the saved output may come from an earlier version of `pipe`.
# Re-run after fitting before trusting the numbers.
pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:, 1]  # second column: probability of class 1

accuracy = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
matrix = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred, digits=3)

print("Accuracy:", accuracy)
print("ROC-AUC:", auc)
print(matrix)
print(report)

Accuracy: 0.8888888888888888
ROC-AUC: 0.8325842696629212
[[ 0 10]
 [ 1 88]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000        10
           1      0.898     0.989     0.941        89

    accuracy                          0.889        99
   macro avg      0.449     0.494     0.471        99
weighted avg      0.807     0.889     0.846        99



In [70]:
# Coefficients of the fitted classifier (on standardised features),
# labelled by feature name and sorted from most positive to most negative.
fitted_clf = pipe.named_steps["clf"]
coefs = pd.Series(fitted_clf.coef_[0], index=FEATURES)
coefs = coefs.sort_values(ascending=False)

coefs

income_norm        0.480187
computer_skill     0.395016
english_skill      0.230360
gender_norm        0.117256
semester_norm      0.094502
academic_score     0.091826
hometown_norm     -0.401480
self_discipline   -0.941031
dtype: float64

In [76]:
# Persist the fitted pipeline to disk.
# NOTE(review): the saved output shows '../app/models/stem_model.joblib', which
# differs from the path below. Confirm which directory is intended.
model_path = "../models/stem_model.joblib"
joblib.dump(pipe, model_path)

['../app/models/stem_model.joblib']