# Machine Learning Models 🤖

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
train = pd.read_csv('../output/fam_df.csv', index_col=0)
train.head()

Unnamed: 0,F. Transform,Tag
0,18001.0,1
1,2077.438012,1
2,4324.447678,1
3,7462.545756,1
4,8746.229687,1


In [5]:
X = train.drop(columns=["Tag"])
y = train["Tag"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
models = {
    "forest200": RandomForestClassifier(n_estimators=200),
    "boosting_950": GradientBoostingClassifier(),
    "extra_300":ExtraTreesClassifier(n_estimators=300),
    "hist": HistGradientBoostingClassifier(),
    "decision_tree": DecisionTreeClassifier(),
}

In [7]:
for name, model  in models.items():
    print(f"Starting training")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    print(f"Training complete")

Starting training
Training forest200...
Training complete
Starting training
Training boosting_950...
Training complete
Starting training
Training extra_300...
Training complete
Starting training
Training hist...
Training complete
Starting training
Training decision_tree...
Training complete


In [12]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in models.items():
    y_pred = m.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))
    printMetric("Accuracy",accuracy_score(y_test, y_pred))
    printMetric("Precision",precision_score(y_test, y_pred, average='weighted'))
    printMetric("Recall",recall_score(y_test, y_pred, average='weighted'))
    printMetric("F1Score",f1_score(y_test, y_pred, average='weighted'))

Evaluating model forest200
	 RMSE: 0.4701
	 Accuracy: 0.8986
	 Precision: 0.8986
	 Recall: 0.8986
	 F1Score: 0.8986
Evaluating model boosting_950
	 RMSE: 0.8972
	 Accuracy: 0.5883
	 Precision: 0.5872
	 Recall: 0.5883
	 F1Score: 0.5871
Evaluating model extra_300
	 RMSE: 0.4708
	 Accuracy: 0.8983
	 Precision: 0.8983
	 Recall: 0.8983
	 F1Score: 0.8983
Evaluating model hist
	 RMSE: 0.9
	 Accuracy: 0.5874
	 Precision: 0.5863
	 Recall: 0.5874
	 F1Score: 0.5863
Evaluating model decision_tree
	 RMSE: 0.47
	 Accuracy: 0.8987
	 Precision: 0.8986
	 Recall: 0.8987
	 F1Score: 0.8986


## Making Predictions

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
forest200_model = linear_model.LinearRegression()

In [None]:
fitted_model = auto_model.fit(X_train, y_train)
print("intercept", fitted_model.intercept_) 
print("coefficients", fitted_model.coef_)

In [None]:
x = fitted_model.predict(X_test)

In [None]:
y_test

In [None]:
import pandas as pd 

predictionAuto_df = pd.DataFrame({'prediction': x,
                                'ground_truth': y_test,
                             'diff': x-y_test })

predictionAuto_df.head()