# STEP-BY-STEP TUTORIAL: HOW TO IMPLEMENT AN XGBOOST ML MODEL
# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●


# Part 1 

# IMPORTING THE DATA







### Import data from files

In [None]:
import xgboost as xgb
import pandas as pd


# Each reader below parses one file format; replace the placeholder file name with your own path.
# The results are not stored here -- assign one to a variable (e.g. df = pd.read_csv(...)) to keep it.
# NOTE(review): pandas documents read_html as returning a list of DataFrames (one per table) -- confirm.
pd.read_csv('filename.csv') # read a CSV file
pd.read_excel('filename.xlsx') # read an Excel workbook
pd.read_json('filename.json') # read a JSON file
pd.read_html('filename.html') # read the tables of an HTML file



### Import data from SQL Server 

In [4]:
import pypyodbc as odbc

# ODBC connection settings. The server and database names are placeholders to replace.
DRIVER_NAME = 'SQL Server'
SERVER_NAME = 'server name'
DATABASE_NAME = 'Database name'

# Assemble the ODBC connection string. The tripled braces around DRIVER_NAME
# render as literal { } around the driver name inside the f-string.
# Trusted_Connection=yes is passed and no user name / password appears in the
# string (presumably Windows integrated authentication -- confirm for your server).
connection_string = f"""
    DRIVER={{{DRIVER_NAME}}};
    SERVER={SERVER_NAME};
    DATABASE={DATABASE_NAME};
    Trusted_Connection=yes;
"""

# Open the connection and print the connection object to confirm it was created.
conn = odbc.connect(connection_string)
print(conn)


--------------------------------------------------------------------------------

import pandas as pd
# Query to run; replace the placeholder schema and table with your own.
query = 'SELECT * FROM [TableScheme].[Table]'

# Reuse the connection opened above (`conn`) instead of opening a second
# connection that is never closed.
df = pd.read_sql(query, conn)

# Display the DataFrame (the last expression of a notebook cell is rendered)
df

<pypyodbc.Connection object at 0x000002428443BF40>


### Import data from URL File

In [None]:
# read_csv is given a URL here instead of a local path; the result is not stored in a variable.
pd.read_csv('https://example.com/data.csv') # read a CSV file from a URL

# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●

# Part 2

# Data manipulation  & Data cleaning 

#### Number of rows and columns 

In [None]:
df.shape # tuple of (number of rows, number of columns)

#### Data Types 

In [None]:
# Print a summary of the DataFrame, including the data type of each column
df.info()

#### Unique Values 

In [None]:
# List the distinct values found in one column ('column name' is a placeholder)
df['column name'].unique()

# Output: an array containing each distinct value once, e.g. array([...], dtype=...)

#### Missing values 

In [None]:
df.isna().sum()   # number of missing values in each column

#### Column names 

In [None]:
df.columns  # the labels of all columns in the DataFrame

#### Description of values 

In [None]:
# Summary statistics of the DataFrame's columns
df.describe()

# OUTPUT (one row per statistic):

# count - number of non-missing values
# mean  - arithmetic mean
# std   - standard deviation
# min   - smallest value
# 25%   - first quartile
# 50%   - median
# 75%   - third quartile
# max   - largest value


#### Check for duplicated values 

In [None]:
# For every column, count the rows whose value in that column already
# appeared in an earlier row, and report the count.
for column_name in df.columns:
    duplicate_count = df.duplicated(subset=column_name).sum()
    print(f"Column '{column_name}' has {duplicate_count} duplicates")

#### Sorting the values 

In [None]:
# Both calls return a sorted result and leave df itself unchanged (no inplace=True, no assignment).
# In a notebook only the last expression of the cell is displayed.
# "Amount", "Churn" and "Total day charge" are example column names -- replace with your own.
df.sort_values(by="Amount", ascending=False).head() # single column, descending; first rows only
df.sort_values(by=["Churn", "Total day charge"], ascending=[True, False]).head()  # multiple columns: "Churn" ascending, then "Total day charge" descending

#### replacing values in dataset 

In [None]:
# Replace every occurrence of one value in a column with another value.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['column name']: that form is a chained
# assignment which operates on a temporary object and no longer updates df
# under pandas Copy-on-Write.
df['column name'] = df['column name'].replace('vale to be replaced', 'replace value')

#### Drop the column 

In [None]:
# Remove a single column by name
df.drop(columns='column name', inplace=True)

# Remove multiple columns at once: collect their names in a list ...
columns_to_drop = ['column_name_1', 'column_name_2']

# ... and drop all of them from the DataFrame in one call
df.drop(columns=columns_to_drop, inplace=True)

#### Rename columns 

In [None]:
# Map each current column name to its new name; df is modified in place
df.rename({'actual column name': 'new column name'}, axis='columns', inplace=True)

# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●

# Part 3




# Label encoding 


Machine learning models can only process numerical data, which means that string (categorical) data must be transformed into numerical values before the model can learn from it and make predictions.

In this case we encode the columns and also print their original values, so we can see later on how the data was transformed.


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode -- just examples, replace with the columns you need to encode
categorical_columns = ['categorical_column1', 'categorical_column2', 'categorical_column3']

# One encoder is refitted for every column, so after the loop it only
# remembers the classes of the last column; `encoded_labels` below keeps
# the mapping of every column.
label_encoder = LabelEncoder()

# column name -> {original value: encoded integer}
encoded_labels = {}

# Perform label encoding for each categorical column
for column in categorical_columns:
    # Fit once and encode in the same call (previously the encoder was fitted
    # twice and the classes transformed separately for every column).
    df[column] = label_encoder.fit_transform(df[column])
    # The encoded value of a class is its position in classes_, so enumerate()
    # yields the same mapping as transforming the classes, with plain ints.
    encoded_labels[column] = {
        original: code for code, original in enumerate(label_encoder.classes_)
    }


# Display the original values and their corresponding encoded labels for each categorical column
for column, encoding_map in encoded_labels.items():
    print(f"Original values for {column}: {encoding_map}")
    print()


# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●

# Part 4

# Model training

### Defining the features 

In [None]:


# X holds the input features: every column of df except the ones dropped here.
# 'Column names' is a placeholder -- the dropped columns should include the target
# column selected for y below, otherwise the target would also be used as a feature.
X = df.drop(columns=['Column names'])
# y is a Series containing the target variable (dependent variable) that you want to predict.
y = df['column name']


### Splitting the dataset 

In [None]:
from sklearn.model_selection import train_test_split
# Split data into training and testing sets: test_size=0.2 holds out 20% of the rows for testing,
# and the fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


###  Standard Scaling - NOT MANDATORY!! 

The purpose of scaling the features using StandardScaler is to ensure that all features have the same scale (mean = 0 and standard deviation = 1). This is particularly important for algorithms that are sensitive to the scale of the features, such as gradient descent-based algorithms (e.g., linear regression, logistic regression) and distance-based algorithms (e.g., k-nearest neighbors). Normalizing the features can improve the performance and convergence of these algorithms. Tree-based models such as XGBoost split on feature thresholds and are largely insensitive to feature scale, which is why this step is optional here.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the features: learn the scaling from the training set only,
# then apply that same scaling to the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○● 

# Part 5 

# Machine Learning models in XGBoost

## Classification Models:

### Binary Classification: 

Target Variable: The dataset should have a binary target variable with two classes (e.g., 0 and 1, 'Yes' and 'No').

Class Balance: Ensure that the dataset has a balanced distribution of the two classes to prevent bias towards the majority class.

Evaluation Metric: Common evaluation metrics include accuracy, precision, recall, F1-score, and ROC-AUC.


In [None]:
# XGBRFClassifier must be imported as well; it was used below without an import (NameError).
from xgboost import XGBClassifier, XGBRFClassifier

# Pick ONE of the two models below -- each assignment replaces `model`.
model = XGBClassifier() #(binary classification using decision trees)

model = XGBRFClassifier() #(binary classification using random forests)

### Multiclass Classification: 

Target Variable: The dataset should have a target variable with more than two classes (e.g., 3 or more).

Sufficient Samples: Make sure there are enough samples for each class to avoid overfitting and under-represented classes.

Data Format: The target variable should be categorical with integer labels representing different classes.

Evaluation Metric: Common evaluation metrics include accuracy, macro/micro F1-score, and confusion matrix.

In [None]:
# XGBRFClassifier must be imported as well; it was used below without an import (NameError).
from xgboost import XGBClassifier, XGBRFClassifier

# Pick ONE of the two models below -- each assignment replaces `model`.
model = XGBClassifier(objective='multi:softmax') #with objective='multi:softmax' (multiclass classification using decision trees)

model = XGBRFClassifier(objective='multi:softmax') #with objective='multi:softmax' (multiclass classification using random forests)

### Probability Estimation:
    
Similar to binary classification, but the output will be probabilities of the positive class (e.g., likelihood of 'Yes').

Probability Threshold: You can set a probability threshold to convert probabilities into class predictions (e.g., 0.5 for binary classification).    

In [None]:
from xgboost import XGBClassifier

# Classifier configured explicitly with the 'binary:logistic' objective
model = XGBClassifier(objective='binary:logistic') #with objective='binary:logistic' (binary classification with probability outputs)

## Regression Models:

Target Variable: The dataset should have a continuous (numeric) target variable representing the value to be predicted.

Data Format: The features should be numerical and contribute to predicting the target variable.

Evaluation Metric: Common evaluation metrics include mean squared error (MSE), mean absolute error (MAE), and R-squared.


In [None]:
# XGBRFRegressor must be imported as well; it was used below without an import (NameError).
from xgboost import XGBRegressor, XGBRFRegressor

# Pick ONE of the two models below -- each assignment replaces `model`.
model = XGBRegressor() #(regression using decision trees)

model = XGBRFRegressor() #(regression using random forests)

## Ranking Models:

Dataset Format: Suitable for learning-to-rank problems, where instances are ranked based on relevance or preference.

Features: The dataset may include features representing user-item interactions, user preferences, and item attributes.

Evaluation Metric: Ranking metrics like NDCG (Normalized Discounted Cumulative Gain) and MAP (Mean Average Precision) are commonly used.

In [None]:
from xgboost import XGBRanker

# Ranker created with default settings.
# NOTE(review): fitting a ranker normally needs query-group information in addition to X and y -- confirm against the XGBRanker docs.
model = XGBRanker() #(ranker for learning-to-rank problems)

## Anomaly Detection Models:

Dataset Format: The dataset should be labeled with binary classes (e.g., normal and anomaly).

Class Imbalance: Anomaly detection often deals with class imbalance, where anomalies are a small fraction of the total samples.

Evaluation Metric: Metrics like precision, recall, and F1-score are used to assess anomaly detection performance.


In [None]:
from xgboost import XGBClassifier

# NOTE: `anomaly_objective` is not defined in the visible part of this tutorial. Define your own
# custom objective before running this cell, otherwise the line below raises NameError.
# `xgb` is the alias created by `import xgboost as xgb`.
model = xgb.XGBClassifier(objective=anomaly_objective) #with custom objective function (anomaly detection with one-class classification)

## Time Series Forecasting Models:

Dataset Format: The dataset should be time-ordered, with the target variable being the variable to forecast at a future time.

Time-related Features: Additional time-related features like lags or time-based statistics can be useful for time series modeling.

Evaluation Metric: Common evaluation metrics include mean absolute error (MAE), mean squared error (MSE), and root mean squared error (RMSE).

In [None]:
from xgboost import XGBRegressor

# The regressor itself uses default settings; the lag / time-based features mentioned
# in the comment have to be added to the data beforehand (not shown in this cell).
model = XGBRegressor() #with custom features (time series forecasting with lag features)

## Ranking and Recommendation Models:

Dataset Format: Suitable for personalized ranking or recommendation, often involving implicit or explicit feedback.

Evaluation Metric: Ranking and recommendation models typically use metrics like NDCG, MAP, and AUC to evaluate their performance.

In [None]:
from xgboost import XGBClassifier
from xgboost import XGBRanker

# Pick ONE of the two models below -- the second assignment replaces the first.
model = XGBRanker()

# NOTE: no objective argument is passed here; supply your own via objective=... if a custom one is needed.
model = XGBClassifier() #with custom objective (for personalized recommendation systems)

# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●

# Part 6

# Parametrization

param_grid: It's a dictionary that defines the hyperparameter grid to search over during the grid search. Hyperparameters are model parameters that are set before training and can significantly impact the model's performance. In this case, the grid specifies the values to try for two hyperparameters: n_estimators (number of boosting rounds) and learning_rate (step size shrinkage).

GridSearchCV: It's a class from scikit-learn used for hyperparameter tuning using grid search. It performs an exhaustive search over the hyperparameter grid specified in param_grid. For each combination of hyperparameters, it uses cross-validation (specified by cv) to evaluate the model's performance.

cv: It's the number of cross-validation folds used during grid search. In this case, cv=10, so it performs 10-fold cross-validation, meaning it splits the data into 10 subsets, trains the model on 9 of them, and validates on the remaining one.

scoring: It's the evaluation metric used to compare different hyperparameter combinations. In this case, scoring='neg_mean_squared_error', which means it will use the negative mean squared error (MSE) as the evaluation metric. By default, scikit-learn optimizes for higher values of the scoring metric, so using negative MSE allows it to minimize the mean squared error.

grid_search.fit(X_train_scaled, y_train): It performs the grid search with cross-validation using the training data (X_train_scaled and y_train). The best hyperparameters are determined based on the results of the cross-validation.

grid_search.best_params_: After the grid search is completed, this attribute gives the best hyperparameters found during the search.

grid_search.best_estimator_: It provides the best model obtained with the best hyperparameters found during grid search.

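best_model.fit(X_train_scaled, y_train): It trains the best model on the entire training set using the best hyperparameters. With GridSearchCV's default refit=True, best_estimator_ has already been refitted on the full training data, so this call is optional and simply repeats that training.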


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
)

# Exhaustive search over the grid, scored by negative MSE with 10-fold cross-validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
)
grid_search.fit(X_train_scaled, y_train)

# Winning hyperparameter combination and the estimator built from it
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

# Fit the best model on the entire training set
best_model.fit(X_train_scaled, y_train)

### Parameters

#### n_estimators: Number of boosting rounds (trees).

Typical Range: 50 to 1000 or more.

Higher values can improve the performance of the model but may increase the training time.


#### learning_rate: Step size shrinkage used to prevent overfitting.

Typical Range: 0.01 to 0.3; even lower values are sometimes used.

Lower values require more boosting rounds (higher n_estimators) to achieve good performance.


#### max_depth: Maximum depth of the individual trees.

Typical Range: 3 to 10 or more.

Higher values can lead to more complex models, but be cautious of overfitting.


#### subsample: Fraction of samples used for fitting the individual trees.

Typical Range: 0.5 to 1.0.

Values less than 1.0 introduce stochasticity, which can help reduce overfitting.


#### colsample_bytree: Fraction of features used for fitting the individual trees.

Typical Range: 0.5 to 1.0.

Values less than 1.0 introduce stochasticity, which can help reduce overfitting.


# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●

# Part 7

# Evaluation of the model

### importance scores 

In [None]:
# Importance score of every feature, taken from the fitted best model
importance_scores = best_model.feature_importances_

# Pair each feature name with its score and order the rows from most to least important
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importance_scores}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart: one bar per feature, bar length = importance score
import matplotlib.pyplot as plt
plt.barh(
    y=feature_importance_df['Feature'],
    width=feature_importance_df['Importance'],
)
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()

In [None]:
# Predict the target for the scaled test features; the metrics below compare y_pred with y_test
y_pred = best_model.predict(X_test_scaled)

### Evaluating

#### R-squared (R2) Score: 
It measures the proportion of the variance in the dependent variable (target) that is predictable from the independent variables (features). It provides an indication of how well the model fits the data.

#### Mean Absolute Error (MAE):
It calculates the absolute difference between the actual and predicted values and then takes the average of those differences.

#### Root Mean Squared Error (RMSE):
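It is the square root of the mean squared error (MSE, the average of the squared differences between the actual and predicted values) and provides a more interpretable measure since it is in the same unit as the target variable.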

#### Mean Squared Logarithmic Error (MSLE): 
It calculates the mean squared logarithmic difference between the actual and predicted values. It is useful when the target variable has a wide range of values.

#### Explained Variance Score:
It measures the proportion of variance in the target variable that the model explains. It is another indicator of how well the model fits the data.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score, mean_squared_log_error

# Mean squared error: average of the squared differences between actual and predicted values
mse = mean_squared_error(y_test, y_pred)
# R-squared score
r2 = r2_score(y_test, y_pred)
# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)
# Root mean squared error: the square root of the MSE. It is derived from `mse`
# because the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. For the single target column used in
# this tutorial both forms give the same value.
rmse = mse ** 0.5
# Mean squared logarithmic error
# NOTE(review): scikit-learn documents that this metric rejects negative values -- confirm for your target.
msle = mean_squared_log_error(y_test, y_pred)
# Explained variance score
evs = explained_variance_score(y_test, y_pred)

# Print the metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Squared Logarithmic Error: {msle}")
print(f"Explained Variance Score: {evs}")

# ●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○●○● 

# Part 8

# Saving & loading the model

In [None]:
import xgboost as xgb

# Assuming you have already trained your model and it is stored in the 'model' variable (in our case best_model)
# Save the model to a file
best_model.save_model('xgboost_model.bin')

In [None]:
import xgboost as xgb

# Load the model from the file
loaded_model = xgb.Booster()
loaded_model.load_model('xgboost_model.bin')