# **Prediction with Regression**

## Objectives

* Fit and evaluate a regression model to predict Students Exam Score.

## Inputs

* outputs/datasets/collection/StudentPerformance.csv
* Instructions on which variables to use for data cleaning and feature engineering. They are found in their respective notebooks.

## Outputs

* Train set (features and target)
* Test set (features and target)
* ML pipeline to predict Exam Score
<!-- * labels map
* Feature Importance Plot -->


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
current_dir = os.getcwd()
current_dir

-----

# Load data

In [None]:
import numpy as np
import pandas as pd
# Load the collected student-performance dataset.
df = pd.read_csv("outputs/datasets/collection/StudentPerformance.csv")

# Print the dimensions, then preview the first three rows.
print(df.shape)
df.head(3)

---

# ML Pipeline: Regressor

## Create ML pipeline 

In [9]:
from sklearn.pipeline import Pipeline

# Cleaning
from feature_engine.imputation import CategoricalImputer

# Feature Engineering
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.discretisation import EqualFrequencyDiscretiser 

# Feat Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor


In [6]:
def PipelineOptimization(model):
    """Build the base regression pipeline around ``model``.

    The steps run in this order: label missing categories as 'Missing',
    ordinal-encode the categorical columns, apply Yeo-Johnson to two numeric
    columns, scale with ``StandardScaler``, select features with
    ``SelectFromModel`` and finally fit the estimator.

    Args:
        model: Estimator instance, used both inside ``SelectFromModel`` and
            as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    imputed_vars = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
    encoded_vars = [
        'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality',
        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
        'Distance_from_Home', 'Gender',
    ]
    yeo_johnson_vars = ['Attendance', 'Tutoring_Sessions']

    imputer = CategoricalImputer(imputation_method='missing',
                                 fill_value='Missing',
                                 variables=imputed_vars)
    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=encoded_vars)
    transformer = YeoJohnsonTransformer(variables=yeo_johnson_vars)

    return Pipeline([
        ('categorical_imputer', imputer),
        ("Ordinal_Encoder", encoder),
        ("YeoJohnson", transformer),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ])

Custom Class for hyperparameter optimisation

In [5]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    """Run one ``GridSearchCV`` per candidate estimator and tabulate the scores.

    Every estimator in ``models`` is wrapped with ``PipelineOptimization`` and
    searched over the parameter grid stored under the same key in ``params``.
    """

    def __init__(self, models, params):
        """Store the candidates.

        Args:
            models: Mapping of a display name to an estimator instance.
            params: Mapping of the same names to GridSearchCV parameter grids.
        """
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every candidate and keep it in ``grid_searches``.

        Args:
            X: Training features.
            y: Training target.
            cv: Cross-validation setting forwarded to ``GridSearchCV``.
            n_jobs: Number of parallel jobs forwarded to ``GridSearchCV``.
            verbose: Verbosity forwarded to ``GridSearchCV``.
            scoring: Scoring forwarded to ``GridSearchCV``.
            refit: Accepted for backward compatibility but NOT forwarded, so
                GridSearchCV's own default for ``refit`` applies.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            model = PipelineOptimization(self.models[key])

            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise the per-split test scores of every fitted grid search.

        Args:
            sort_by: Column used to sort the summary in descending order.

        Returns:
            A tuple ``(summary, grid_searches)``: the summary DataFrame has one
            row per parameter combination (estimator name, min/mean/max/std of
            the split scores, then the parameter columns); ``grid_searches``
            is the dict of fitted searches.

        Raises:
            ValueError: If no grid search has been fitted yet.
        """
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        if not self.grid_searches:
            raise ValueError("No grid searches to summarise; call fit() first.")

        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            params = results['params']
            # n_splits_ is the number of folds actually used by the fitted
            # search, so this also works when cv was a splitter object or an
            # iterable rather than a plain integer.
            per_split = [results[f"split{i}_test_score"]
                         for i in range(gs.n_splits_)]

            # Shape (n_param_combinations, n_splits): one row per combination.
            all_scores = np.column_stack(per_split)
            for p, s in zip(params, all_scores):
                rows.append(row(name, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches


---

## Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is the exam score.
X = df.drop(columns=['Exam_Score'])
y = df['Exam_Score']

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

print("* Train set:", X_train.shape, y_train.shape,
      "\n* Test set:", X_test.shape, y_test.shape)

## Grid Search CV - Sklearn

In [20]:
# Candidate regressors for the quick search, all left at their defaults
# apart from a fixed seed where the estimator accepts one.
models_quick_search = dict(
    LinearRegression=LinearRegression(),
    DecisionTreeRegressor=DecisionTreeRegressor(random_state=0),
    RandomForestRegressor=RandomForestRegressor(random_state=0),
    ExtraTreesRegressor=ExtraTreesRegressor(random_state=0),
    AdaBoostRegressor=AdaBoostRegressor(random_state=0),
    GradientBoostingRegressor=GradientBoostingRegressor(random_state=0),
    XGBRegressor=XGBRegressor(random_state=0),
)

# One empty grid per candidate: the quick search only compares defaults.
params_quick_search = {name: {} for name in models_quick_search}

Do a hyperparameter optimisation search using default hyperparameters

In [None]:
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

Checking the results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Since LinearRegression has only a few hyperparameters, the GradientBoostingRegressor model will also be explored to see whether the model can be improved.

In [9]:
models_search = {
    'LinearRegression': LinearRegression(),
}

params_search = {
    'LinearRegression': {
        'model__positive': [True, False],
        'model__n_jobs':[None, 1, 2]
    },
    
}

In [None]:
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5)  

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

### Quick Conclusion

To find a model with a mean score of at least 0.8, as specified in the business case, the data underwent cleaning and feature engineering, which were detailed in previous notebooks. Various models were tested, and it was observed that the model with the highest mean score was LinearRegression, achieving approximately 0.64, followed by GradientBoostingRegressor with a score of around 0.55. 

Due to LinearRegression having very few hyperparameters, GradientBoostingRegressor was also explored to see if its performance could be enhanced through hyperparameter tuning. In the first attempt to improve the model using hyperparameters, it was evident that LinearRegression still maintained the highest mean score, although there was no improvement in the score itself. 

In the second attempt, the LinearRegression model was removed, and more hyperparameters were introduced to assess potential improvements. However, the results indicated that LinearRegression continued to have the best mean score.

**Score for the LinearRegression and the quick search**
|estimator | min_score | mean_score | max_score | std_score | 
|:----     |----       |----        |----       |----       |
|LinearRegression |	0.494062 | 0.641766 | 0.764155 | 0.089603 |

**Attempt two to improve the model with the hyperparameters**
|no. |estimator | min_score | mean_score | max_score | std_score | 
|:---|:----     |----       |----        |----       |----       |
|1242 | GradientBoostingRegressor | 0.446847 | 0.556504 | 0.663426 | 0.072982 |
|1134 | GradientBoostingRegressor | 0.446847 | 0.556504 | 0.663426 |	0.072982 |	
|1260 | GradientBoostingRegressor | 0.44674  | 0.556464 | 0.663597 | 0.073082 |	
|1152 | GradientBoostingRegressor | 0.44674  | 0.556464 | 0.663597 |	0.073082 |	
|1251 | GradientBoostingRegressor | 0.44674 | 0.556446 | 0.663411 | 0.073031 |


------

## Collect best model

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

In [None]:
best_params = grid_search_pipelines[best_model].best_params_
best_params

In [None]:
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
print(best_regressor_pipeline)

In [None]:
# list_best_feat was never defined in this notebook, so derive it from the
# fitted best pipeline. The selection step sees the columns produced by the
# steps that precede the scaler, so recover the column names from those steps
# and keep the ones the selector marked as supported.
step_names = [name for name, _ in best_regressor_pipeline.steps]
columns_before_scaling = (Pipeline(best_regressor_pipeline.steps[:step_names.index('feat_scaling')])
                          .transform(X_train)
                          .columns)
list_best_feat = columns_before_scaling[
    best_regressor_pipeline['feat_selection'].get_support()].to_list()

# Keep only the selected features in both splits.
X_train = X_train.filter(list_best_feat)
X_test = X_test.filter(list_best_feat)

print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)
X_train.head(3)

In [44]:
def PipelineOptimization(model):
    """Build the reduced-feature regression pipeline around ``model``.

    Only 'Parental_Involvement' is ordinal-encoded and only 'Attendance' and
    'Tutoring_Sessions' are Yeo-Johnson transformed; scaling, model-based
    feature selection and the estimator follow.

    Args:
        model: Estimator instance, used both inside ``SelectFromModel`` and
            as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    encoder = OrdinalEncoder(encoding_method='arbitrary',
                             variables=['Parental_Involvement'])
    transformer = YeoJohnsonTransformer(variables=['Attendance', 'Tutoring_Sessions'])

    return Pipeline([
        ("Ordinal_Encoder", encoder),
        ("YeoJohnson", transformer),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ])

In [None]:
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5) 

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

### Evaluate Regressor on Train and Test Sets

In [42]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    """Print the regression metrics of ``pipeline`` for the train set, then the test set."""
    print("Model Evaluation \n")
    splits = (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    )
    for heading, features, target in splits:
        print(heading)
        regression_evaluation(features, target, pipeline)


def regression_evaluation(X, y, pipeline):
    """Print R2, MAE, MSE and RMSE of ``pipeline`` predictions against ``y``.

    Args:
        X: Features passed to ``pipeline.predict``.
        y: True target values.
        pipeline: Fitted object exposing ``predict``.
    """
    prediction = pipeline.predict(X)
    mse = mean_squared_error(y, prediction)  # computed once, reused for RMSE

    # The built-in round() works for NumPy scalars and for plain Python
    # floats alike; the .round() method only exists on NumPy scalars, so the
    # metric's return type no longer matters.
    print('R2 Score:', round(r2_score(y, prediction), 3))
    print('Mean Absolute Error:', round(mean_absolute_error(y, prediction), 3))
    print('Mean Squared Error:', round(mse, 3))
    print('Root Mean Squared Error:', round(np.sqrt(mse), 3))
    print("\n")


def regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline, alpha_scatter=0.5):
    """Plot actual vs. predicted values for the train and test sets side by side.

    Each panel shows a scatter of predictions against actual values plus a
    red actual-vs-actual reference line.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        pipeline: Fitted object exposing ``predict``.
        alpha_scatter: Transparency of the scatter points.
    """
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    panels = (
        ("Train Set", y_train, pred_train),
        ("Test Set", y_test, pred_test),
    )
    for ax, (title, actual, predicted) in zip(axes, panels):
        sns.scatterplot(x=actual, y=predicted, alpha=alpha_scatter, ax=ax)
        sns.lineplot(x=actual, y=actual, color='red', ax=ax)
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predictions")
        ax.set_title(title)

    plt.show()

In [None]:
regression_performance(X_train, y_train, X_test, y_test, best_regressor_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_regressor_pipeline)

## Conclusion

The graph above shows that data points with higher values are predicted to be much lower than the actual values. Additionally, it is evident that there are no predicted scores over 80. This phenomenon can be attributed to the distribution of the target variable, 'Exam_Score', which appears to be normally distributed between 50 and 75, but features a long tail. While it is possible to remove this tail, doing so would undermine the purpose of the prediction. This also helps explain the high value of the Root Mean Squared Error. 

Even though this is not the pipeline with the highest mean score, it is the pipeline with the least overfitting, which is why it was selected.

In [None]:
models_search

In [None]:
best_params

In [None]:
params_search = {'LinearRegression': {} }
params_search

In [13]:
def PipelineOptimization(model):
    """Build the regression pipeline without a feature-selection step.

    The steps run in this order: label missing categories as 'Missing',
    ordinal-encode the categorical columns, apply Yeo-Johnson to two numeric
    columns, scale with ``StandardScaler`` and fit the estimator.

    Args:
        model: Estimator instance used as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    imputed_vars = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
    encoded_vars = [
        'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality',
        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
        'Distance_from_Home', 'Gender',
    ]
    yeo_johnson_vars = ['Attendance', 'Tutoring_Sessions']

    imputer = CategoricalImputer(imputation_method='missing',
                                 fill_value='Missing',
                                 variables=imputed_vars)
    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=encoded_vars)
    transformer = YeoJohnsonTransformer(variables=yeo_johnson_vars)

    return Pipeline([
        ('categorical_imputer', imputer),
        ("Ordinal_Encoder", encoder),
        ("YeoJohnson", transformer),
        ("feat_scaling", StandardScaler()),
        ("model", model),
    ])

In [None]:
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5) 

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

In [None]:
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
print(pipeline_clf)

In [None]:
regression_performance(X_train, y_train, X_test, y_test, pipeline_clf)
regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline_clf)

----------

# Push files to the repo

We will generate the following files

- Train set
- Test set
- Modeling pipeline

In [33]:
import joblib
import os

version = 'v1'
file_path = f'outputs/ml_pipeline/predict_exam_score/{version}'

# exist_ok=True makes re-running this cell silent when the folder already
# exists; only genuine filesystem errors are caught and printed.
try:
    os.makedirs(name=file_path, exist_ok=True)
except OSError as e:
    print(e)

In [34]:
X_train.to_csv(f"{file_path}/X_train.csv", index=False)
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

In [35]:
X_test.to_csv(f"{file_path}/X_test.csv", index=False)
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

In [None]:
pipeline_clf

In [None]:
joblib.dump(value=pipeline_clf, filename=f"{file_path}/clf_pipeline.pkl")

**Below, other pipelines and attempts to improve the precision of the model can be seen**

- Missing values to most frequent
- Use algorithm KNeighborsRegressor
- Use PCA to build the model
- Create an ML pipeline with a Classification 

--------

## Create ML pipeline with Missing values to most frequent

In [18]:
from feature_engine.imputation import CategoricalImputer, DropMissingData

def PipelineOptimization(model):
    """Build the regression pipeline that imputes with the most frequent category.

    Same layout as the base pipeline, except that the categorical imputer
    uses ``imputation_method='frequent'`` instead of a constant label.

    Args:
        model: Estimator instance, used both inside ``SelectFromModel`` and
            as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    imputed_vars = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
    encoded_vars = [
        'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality',
        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
        'Distance_from_Home', 'Gender',
    ]
    yeo_johnson_vars = ['Attendance', 'Tutoring_Sessions']

    imputer = CategoricalImputer(imputation_method='frequent', variables=imputed_vars)
    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=encoded_vars)
    transformer = YeoJohnsonTransformer(variables=yeo_johnson_vars)

    return Pipeline([
        ('categorical_imputer', imputer),
        ("Ordinal_Encoder", encoder),
        ("YeoJohnson", transformer),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ])

In [None]:
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

### Quick Conclusion

In this section, we replaced the missing values with the most frequent value for each specific feature, instead of labeling them as 'Missing'. This approach aimed to improve the model's performance. We tested various models; however, further investigation with hyperparameters was determined to be unnecessary. The results showed that changing the NaN values to the most frequent values in the features had no significant impact. Since there were no major differences observed, the subsequent tests will use the method of replacing NaN values with 'Missing'.

The next step in this notebook is to explore a model called KNeighborsRegressor.

## Create Pipeline modified for KNeighborsRegressor

The KNeighborsRegressor is a non-parametric regression method that predicts values based on the k closest training data points. It stores the training data and uses a distance metric, usually Euclidean distance, to make predictions.

For each prediction, the average of the target values of the k nearest points is calculated. This can be done with uniform weighting or by considering distance. Choosing the right value for k is crucial: smaller values may result in high variance, while larger values can lead to high bias. Therefore, the function below examines the optimal k-value, along with the mean R² value.

In [13]:
def PipelineOptimization_KNN(model):
    """Build a pipeline whose feature-selection step is optional.

    Preprocessing (constant-label imputation, ordinal encoding, Yeo-Johnson,
    standard scaling) is always added. ``SelectFromModel`` is added only when
    ``model`` exposes ``feature_importances_`` or ``coef_``.

    NOTE(review): the attribute check runs on the estimator exactly as it is
    passed in; presumably an estimator that has not been fitted yet does not
    expose these attributes, in which case the selection step is skipped —
    confirm this is the intended behaviour for non-KNN estimators.

    Args:
        model: Estimator instance used as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    imputed_vars = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
    encoded_vars = [
        'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality',
        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
        'Distance_from_Home', 'Gender',
    ]

    preprocessing = [
        ('categorical_imputer', CategoricalImputer(imputation_method='missing',
                                                   fill_value='Missing',
                                                   variables=imputed_vars)),
        ("Ordinal_Encoder", OrdinalEncoder(encoding_method='arbitrary',
                                           variables=encoded_vars)),
        ("YeoJohnson", YeoJohnsonTransformer(['Attendance', 'Tutoring_Sessions'])),
        ("feat_scaling", StandardScaler()),
    ]

    # Only estimators exposing importances or coefficients get a selector.
    exposes_weights = hasattr(model, 'feature_importances_') or hasattr(model, 'coef_')
    selection = [("feat_selection", SelectFromModel(model))] if exposes_weights else []

    return Pipeline(preprocessing + selection + [("model", model)])

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np


def Predict_k_value(X_train, y_train, X_test, y_test, k_max):
    """Scan k = 1 .. k_max - 1 for a KNN regressor and report the best R2.

    For every k a ``PipelineOptimization_KNN`` pipeline is fitted on the
    training data and scored with R2 on the data passed as ``X_test`` /
    ``y_test``. The best score and its k are printed and the score curve is
    plotted.

    Args:
        X_train, y_train: Data used to fit each pipeline.
        X_test, y_test: Data used to score each pipeline.
        k_max: Exclusive upper bound for k; must be greater than 1.

    Raises:
        ValueError: If ``k_max`` leaves no k value to evaluate.
    """
    # Imported locally: the notebook's import cell only brings in
    # KNeighborsClassifier, so KNeighborsRegressor would otherwise be undefined.
    from sklearn.neighbors import KNeighborsRegressor

    k_values = range(1, k_max)
    if not k_values:
        raise ValueError("k_max must be greater than 1")

    knn_r_score = []
    for k in k_values:
        # Create and fit the KNeighborsRegressor pipeline for this k
        knn_pipeline = PipelineOptimization_KNN(
            KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='ball_tree'))
        knn_pipeline.fit(X_train, y_train)

        # Score the predictions for the evaluation data
        y_pred = knn_pipeline.predict(X_test)
        knn_r_score.append(r2_score(y_test, y_pred))

    best_score = max(knn_r_score)
    # Map the list position back to the k it belongs to (k starts at 1, the
    # list index at 0), so the reported k is the one that was actually used.
    best_k = k_values[knn_r_score.index(best_score)]

    print(f'The max value is {best_score} and as the k-value {best_k}')
    plt.plot(k_values, knn_r_score)
    plt.show()

Predict_k_value(X_train, y_train, X_test, y_test, 201)

### Quick Conclusion

As previously discussed, an alternative approach was attempted to improve the model. In this case, the KNeighborsRegressor model was utilized, and a function was created to determine the optimal k value. The most suitable k-value found was 19, however, the mean score for the R² value was approximately 0.47, which is much lower than the scores obtained in previous sections.

In the next section, a pipeline will be developed using PCA to enhance the model.

## Create ML Pipeline with PCA

In [8]:
df = (pd.read_csv("outputs/datasets/collection/StudentPerformance.csv")
      )

In [12]:
def PipelineOptimization(model):
    """Build the regression pipeline that scales with ``Normalizer``.

    Same layout as the base pipeline, except that the ``feat_scaling`` step
    is a ``Normalizer`` instead of a ``StandardScaler``.

    Args:
        model: Estimator instance, used both inside ``SelectFromModel`` and
            as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    imputed_vars = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
    encoded_vars = [
        'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality',
        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
        'Distance_from_Home', 'Gender',
    ]
    yeo_johnson_vars = ['Attendance', 'Tutoring_Sessions']

    imputer = CategoricalImputer(imputation_method='missing',
                                 fill_value='Missing',
                                 variables=imputed_vars)
    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=encoded_vars)
    transformer = YeoJohnsonTransformer(variables=yeo_johnson_vars)

    return Pipeline([
        ('categorical_imputer', imputer),
        ("Ordinal_Encoder", encoder),
        ("YeoJohnson", transformer),
        ("feat_scaling", Normalizer()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ])

In [None]:
pipeline = PipelineOptimization(model=LinearRegression())
pipeline_pca = Pipeline(pipeline.steps[:4])
df_pca = pipeline_pca.fit_transform(df.drop(['Exam_Score'], axis=1))

print(df_pca.shape,'\n', type(df_pca))

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

n_components = 19


def pca_components_analysis(df_pca, n_components):
    """Fit a PCA and report how much variance its components explain.

    Prints the total percentage explained by ``n_components`` components and
    plots the per-component and accumulated explained variance.

    Args:
        df_pca: Preprocessed feature matrix the PCA is fitted on.
        n_components: Number of principal components to fit.
    """
    # Only the fitted explained-variance ratios are needed, so the data is
    # not projected onto the components here.
    pca = PCA(n_components=n_components).fit(df_pca)

    ComponentsList = ["Component " + str(number)
                      for number in range(n_components)]
    dfExplVarRatio = pd.DataFrame(
        data=np.round(100 * pca.explained_variance_ratio_, 3),
        index=ComponentsList,
        columns=['Explained Variance Ratio (%)'])

    explained = dfExplVarRatio['Explained Variance Ratio (%)']
    dfExplVarRatio['Accumulated Variance'] = explained.cumsum()

    PercentageOfDataExplained = explained.sum()

    print(
        f"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \n")
    plt.figure(figsize=(12, 5))
    sns.lineplot(data=dfExplVarRatio,  marker="o")
    plt.xticks(rotation=90)
    plt.yticks(np.arange(0, 110, 10))
    plt.show()


pca_components_analysis(df_pca=df_pca, n_components=n_components)


In [None]:
n_components = 5
pca_components_analysis(df_pca=df_pca, n_components=n_components)

### Rewrite ML Pipeline for Modelling

In [22]:
from sklearn.decomposition import PCA


def PipelineOptimization(model):
    """Build the regression pipeline that uses PCA instead of feature selection.

    The steps run in this order: label missing categories as 'Missing',
    ordinal-encode the categorical columns, apply Yeo-Johnson to two numeric
    columns, scale with ``Normalizer``, reduce to four principal components
    and fit the estimator.

    Args:
        model: Estimator instance used as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    imputed_vars = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
    encoded_vars = [
        'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality',
        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
        'Distance_from_Home', 'Gender',
    ]
    yeo_johnson_vars = ['Attendance', 'Tutoring_Sessions']

    imputer = CategoricalImputer(imputation_method='missing',
                                 fill_value='Missing',
                                 variables=imputed_vars)
    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=encoded_vars)
    transformer = YeoJohnsonTransformer(variables=yeo_johnson_vars)

    return Pipeline([
        ('categorical_imputer', imputer),
        ("OrdinalCategoricalEncoder", encoder),
        ("YeoJohnson", transformer),
        ("feat_scaling", Normalizer()),
        # PCA takes the place of the feature-selection step
        ("PCA", PCA(n_components=4, random_state=0)),
        ("model", model),
    ])

In [None]:
print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)

In [None]:
quick_search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
quick_search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

In [None]:
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary

### Quick Conclusion

The model was not improved by training it with PCA. It can also be seen that all of the components are needed to get close to the mean score found in the first attempt. Normalizing the data instead of using StandardScaler did not improve the model either.

Next step: try to improve the model by using a classifier instead of a regressor.

-------

## Recreate Pipeline and targets for Classification

In [7]:
def PipelineOptimization(model):
    """Build the pipeline used for the classification experiments.

    The steps run in this order: label missing categories as 'Missing',
    ordinal-encode the categorical columns, apply Yeo-Johnson to two numeric
    columns, scale with ``StandardScaler``, select features with
    ``SelectFromModel`` and fit the estimator.

    Args:
        model: Estimator instance, used both inside ``SelectFromModel`` and
            as the final ``model`` step.

    Returns:
        The unfitted ``Pipeline``.
    """
    imputed_vars = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
    encoded_vars = [
        'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality',
        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
        'Distance_from_Home', 'Gender',
    ]
    yeo_johnson_vars = ['Attendance', 'Tutoring_Sessions']

    imputer = CategoricalImputer(imputation_method='missing',
                                 fill_value='Missing',
                                 variables=imputed_vars)
    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=encoded_vars)
    transformer = YeoJohnsonTransformer(variables=yeo_johnson_vars)

    return Pipeline([
        ('categorical_imputer', imputer),
        ("Ordinal_Encoder", encoder),
        ("YeoJohnson", transformer),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ])

def PipelineCleaning(q):
    """Return a one-step pipeline that discretises 'Exam_Score' into ``q`` equal-frequency bins."""
    discretiser = EqualFrequencyDiscretiser(q=q, variables=['Exam_Score'])
    return Pipeline([('efd', discretiser)])

In [None]:
def find_query_size(df, PipelineCleaning, min_q=2, max_q=13):
    """Find the number of bins that splits 'Exam_Score' most evenly.

    For every candidate number of bins the target is discretised with the
    pipeline returned by ``PipelineCleaning`` and the difference between the
    most and the least populated bin is measured. The candidate with the
    smallest difference wins; ties go to the smallest number of bins.

    Args:
        df: DataFrame containing the 'Exam_Score' column.
        PipelineCleaning: Callable taking the number of bins and returning an
            object whose ``fit_transform(df)`` yields the binned DataFrame.
        min_q: Smallest number of bins to try (inclusive).
        max_q: Largest number of bins to try (inclusive).

    Returns:
        int: The number of bins with the lowest max difference.

    Raises:
        ValueError: If the candidate range is empty.
    """
    candidates = range(min_q, max_q + 1)
    if not candidates:
        raise ValueError("min_q must not be greater than max_q")

    max_diff_list = []
    for q in candidates:
        df_target_bins = PipelineCleaning(q).fit_transform(df)
        # Count the rows per bin once and reuse the result for both extremes.
        bin_counts = df_target_bins['Exam_Score'].value_counts()
        max_diff_list.append(bin_counts.max() - bin_counts.min())

    lowest_diff = min(max_diff_list)
    # list.index returns the first match, so ties resolve to the fewest bins.
    best_q = candidates[max_diff_list.index(lowest_diff)]

    print(f'The nummer of {best_q} bins with the lowest max diffrens, that is {lowest_diff}.')

    return best_q

find_query_size(df, PipelineCleaning)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Discretise the target into 5 equal-frequency bins.
pipeline_taget_cleaning = PipelineCleaning(5)


df_target_bins = pipeline_taget_cleaning.fit_transform(df)

# Show how many rows ended up in each bin (the original statement,
# print(df_target_bins[]), was a syntax error).
print(df_target_bins['Exam_Score'].value_counts().sort_index())

# Bin edges learned by the discretiser, kept for building the label map.
target_span = pipeline_taget_cleaning['efd'].binner_dict_

sns.countplot(data=df_target_bins, x='Exam_Score')
plt.show()


X = df_target_bins.drop(['Exam_Score'], axis=1) # Defining the features
y = df_target_bins['Exam_Score'] # Defining the target for the prediction

X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.2,
     random_state=0
 )

print("* Train set:", X_train.shape, y_train.shape,
       "\n* Test set:",  X_test.shape, y_test.shape)
print(target_span)

In [None]:
df_target_bins.head(10)

In [13]:
# ML algorithms
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Candidate classifiers for the quick search, all with a fixed seed.
models_quick_search = dict(
    XGBClassifier=XGBClassifier(random_state=0),
    DecisionTreeClassifier=DecisionTreeClassifier(random_state=0),
    RandomForestClassifier=RandomForestClassifier(random_state=0),
    GradientBoostingClassifier=GradientBoostingClassifier(random_state=0),
    ExtraTreesClassifier=ExtraTreesClassifier(random_state=0),
    AdaBoostClassifier=AdaBoostClassifier(random_state=0),
)

# One empty grid per candidate: the quick search only compares defaults.
params_quick_search = {name: {} for name in models_quick_search}

In [None]:
from sklearn.metrics import make_scorer, recall_score
quick_search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
quick_search.fit(X_train, y_train,
                 scoring = make_scorer(recall_score, labels=[0], average=None),
                 n_jobs=-1,
                 cv=5)

In [None]:
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary

In [16]:
models_search = {
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=0),
}

params_search = {
    "ExtraTreesClassifier": {
        "model__max_depth": [10, 20, 30],
        "model__min_samples_split": [5, 10],
        "model__min_samples_leaf": [4, 10],
        "model__max_features": [0.5, "sqrt"],
        "model__max_leaf_nodes": [50, 100],
        "model__random_state": [42]

    }
}


In [None]:
search = HyperparameterOptimizationSearch(
    models=models_search, params=params_search)
search.fit(X_train, y_train,
           scoring=make_scorer(recall_score, labels=[0], average=None),
           n_jobs=-1, cv=5)

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary


### Quick Conclusion

The task was changed from a regression to a classification; the resulting classifier is optimised and evaluated in the sections below.

# Optimize ML


In [None]:
# The summary is sorted by mean score in descending order, so the first row
# (not the third) holds the best-scoring candidate; this also matches how the
# best model is collected in the earlier sections of the notebook.
best_model = grid_search_summary.iloc[0, 0]
best_model

In [None]:
grid_search_pipelines[best_model].best_params_

In [None]:
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline

In [None]:
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

# Assess Feature importance

In [None]:
# After data cleaning and feature engineering, the feature space may change.
# Number of leading pipeline steps whose output columns are used as the
# feature names seen by the selection step.
# NOTE(review): set to 2 (imputer + encoder); presumably the later
# transformation step keeps the same columns — confirm.
data_cleaning_feat_eng_steps = 2
columns_after_data_cleaning_feat_eng = (Pipeline(pipeline_clf.steps[:data_cleaning_feat_eng_steps])
                                        .transform(X_train)
                                        .columns)

# Columns kept by the feature-selection step (unordered at this point).
best_features = columns_after_data_cleaning_feat_eng[pipeline_clf['feat_selection'].get_support(
)].to_list()

# Create a DataFrame pairing each selected feature with its importance,
# sorted from most to least important.
df_feature_importance = (pd.DataFrame(data={
    'Feature': columns_after_data_cleaning_feat_eng[pipeline_clf['feat_selection'].get_support()],
    'Importance': pipeline_clf['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Reassign the best features so the list follows the importance order.
best_features = df_feature_importance['Feature'].to_list()

# Most important features: statement and bar plot.
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{best_features}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

## Evaluate Classifier on Train and Test Sets

In [24]:
from sklearn.metrics import classification_report, confusion_matrix


def confusion_matrix_and_report(X, y, pipeline, label_map):
    """Print a labelled confusion matrix and a classification report.

    The predictions are passed as ``y_true`` and the actual values as
    ``y_pred`` to ``confusion_matrix``, so the printed rows are labelled
    "Prediction ..." and the columns "Actual ...".

    Args:
        X: Features passed to ``pipeline.predict``.
        y: Actual class labels.
        pipeline: Fitted object exposing ``predict``.
        label_map: Class names, used for the table labels and as
            ``target_names`` of the report.
    """
    prediction = pipeline.predict(X)

    print('---  Confusion Matrix  ---')
    matrix = confusion_matrix(y_true=prediction, y_pred=y)
    column_labels = [["Actual " + sub for sub in label_map]]
    row_labels = [["Prediction " + sub for sub in label_map]]
    print(pd.DataFrame(matrix, columns=column_labels, index=row_labels))
    print("\n")

    print('---  Classification Report  ---')
    report = classification_report(y, prediction, target_names=label_map)
    print(report, "\n")


def clf_performance(X_train, y_train, X_test, y_test, pipeline, label_map):
    """Print the confusion matrix and report for the train set, then the test set."""
    splits = (
        ("#### Train Set #### \n", X_train, y_train),
        ("#### Test Set ####\n", X_test, y_test),
    )
    for position, (heading, features, target) in enumerate(splits):
        if position:
            # Blank line between the two sections, as in the printed layout.
            print()
        print(heading)
        confusion_matrix_and_report(features, target, pipeline, label_map)

In [None]:
target_span

In [None]:
label_map = ['<64.0', '64.0 to 66.0', '66.0 to 68.0', '68.0 to 70.0','+70.0']
label_map

In [None]:
clf_performance(X_train=X_train, y_train=y_train,
                        X_test=X_test, y_test=y_test,
                        pipeline=pipeline_clf,
                        label_map= label_map )

---