In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats
from wrangle import wrangle_data
import modeling as m

#Tools to build machine learning models and reports
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

#Suppresses warnings to improve the aesthetics of the output
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Fixed seed value; the same number (1969) is the random_state used when
# fitting the tree-based and logistic models in the tuning cells.
random_seed = 1969

In [3]:
# Load the project dataset through the local `wrangle` module; what
# wrangle_data() reads or cleans is defined there, not in this notebook.
df = wrangle_data()

In [4]:
# Show the column labels of the wrangled frame.
df.columns

Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Target', 'Temp Delta [K]'],
      dtype='object')

In [5]:
# One-hot encode the listed categorical column(s): pd.get_dummies replaces each
# listed column with indicator columns.
# NOTE: `df` is rebound here, so re-running this cell without re-running the
# load cell fails because 'Type' is no longer present.
dumb_cols = ['Type']
df = pd.get_dummies(data=df, columns=dumb_cols)

In [6]:
# Remove the 'UDI' and 'Product ID' columns before modelling.
df = df.drop(['UDI', 'Product ID'], axis='columns')

In [7]:
# Name of the label column; passed to the split helpers below.
target_col = "Target"

In [8]:
# Split the frame into train / validation / test partitions with the project's
# modeling helper, passing the target column as stratify_col. How the split is
# sized or seeded is defined in modeling.py, not in this notebook.
train, val, test = m.train_validate(df, stratify_col = target_col)

In [9]:
# Sanity check: the three partitions together hold exactly as many rows as df.
len(train) + len(val) + len(test) == len(df)

True

In [10]:
# Continuous feature columns that are handed to the scaling helper.
cont_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Temp Delta [K]',
]

In [11]:
# Scale the continuous columns of all three partitions with the project's helper.
# NOTE(review): scaler_model = 1 selects a scaler by code; which scaler that is,
# and whether it is fitted on train only, is defined in modeling.py - confirm there.
train_scaled, val_scaled, test_scaled = m.scale_cont_columns(train, val, test, cont_columns, scaler_model = 1)

In [12]:
# Unpack six objects from the project helper - going by their names, the
# features (X_*) and target (y_*) of each scaled partition; the exact logic
# lives in modeling.py.
X_train, y_train, X_val, y_val, X_test, y_test = m.train_val_test(train_scaled, val_scaled, test_scaled, target_col)

In [13]:
# Baseline reference value: sum of the training Target column divided by its
# row count (assuming Target is coded 0/1, this is the share of positive rows).
# Original note: still guess the most prevalent class, then calculate recall
# for zero.

baseline = train['Target'].sum() / train['Target'].shape[0]
baseline

0.03395833333333333

In [14]:
# Run the project's decision-tree helper; with metric = 3 the printed output
# reports recall on the training and validation sets.
m.dec_tree(X_train, y_train, X_val, y_val, metric = 3)

Recall of Decision Tree classifier on training set:   0.7423
Recall of Decision Tree classifier on validation set: 0.6019


In [15]:
# Run the project's random-forest helper; with metric = 3 the printed output
# reports recall on the training and validation sets.
m.rand_forest(X_train, y_train, X_val, y_val, metric = 3)

Recall of Random Forest classifier on training set:   0.6258
Recall of Random Forest classifier on validation set: 0.4722


In [16]:
# Run the project's KNN helper; with metric = 3 the printed output reports
# recall on the training and validation sets.
m.knn_mod(X_train, y_train, X_val, y_val, metric = 3)

Recall of KNN classifier on training set:   0.3374
Recall of KNN classifier on validation set: 0.2500


In [17]:
# Run the project's logistic-regression helper; with metric = 3 the printed
# output reports recall on the training and validation sets.
m.lr_mod(X_train, y_train, X_val, y_val, metric = 3)

Recall of Logistic Regression classifier on training set:   0.0123
Recall of Logistic Regression classifier on validation set: 0.0093


### Tuning the algorithm parameters

In [None]:
# Sweep max_depth (1..24) for a decision tree and record recall on the train
# and validation splits at every depth.
# NOTE: the "*_accuracy" keys hold recall scores; the key names are kept because
# the plotting cell reads df_tune.train_accuracy / df_tune.validate_accuracy.
metrics = []

for depth in range(1, 25):
    # Use the notebook-wide seed rather than repeating the literal 1969.
    trees = DecisionTreeClassifier(max_depth=depth, random_state=random_seed)
    trees = trees.fit(X_train, y_train)

    y_pred = trees.predict(X_train)
    y_pred_val = trees.predict(X_val)

    train_recall = recall_score(y_train, y_pred)
    val_recall = recall_score(y_val, y_pred_val)

    metrics.append({
        "max_depth": depth,
        "train_accuracy": train_recall,
        "validate_accuracy": val_recall,
    })

df_tune = pd.DataFrame(metrics)
# Gap between train and validation recall at each depth.
df_tune['difference'] = df_tune.train_accuracy - df_tune.validate_accuracy

In [None]:
# Predict with `trees` as left behind by the sweep loop above, i.e. the last
# model it fitted (the largest max_depth tried), not a model chosen from the
# tuning results.
y_pred = trees.predict(X_train)
y_pred_val = trees.predict(X_val)

In [None]:
# Text report of per-class precision, recall and F1 on the validation split,
# using whichever y_pred_val is currently held in the kernel.
print(classification_report(y_val, y_pred_val))

In [None]:

# Train vs. validation score for each max_depth held in df_tune.
plt.figure(figsize=(12,6))
plt.plot(df_tune.max_depth, df_tune.train_accuracy, marker = 'o', label= 'Train')
plt.plot(df_tune.max_depth, df_tune.validate_accuracy, marker = 'o', label= 'Validate')
plt.legend()
plt.title("Decision Tree scores while adjusting max_depth")
# Axis labels so the figure stands alone; the plotted columns are filled with
# recall_score values by the sweep cell.
plt.xlabel("max_depth number")
plt.ylabel("Recall")
plt.show()


In [None]:
# Sweep max_depth (1..19) for a random forest with min_samples_leaf fixed at 5
# and record recall on the train and validation splits at every depth.
# NOTE: the "*_accuracy" keys hold recall scores; the key names are kept because
# the plotting cell reads rand_forest.train_accuracy / rand_forest.validate_accuracy.
metrics = []

for depth in range(1, 20):
    # Use the notebook-wide seed rather than repeating the literal 1969.
    rf_5 = RandomForestClassifier(max_depth=depth, min_samples_leaf=5, random_state=random_seed)
    rf_5 = rf_5.fit(X_train, y_train)

    y_pred = rf_5.predict(X_train)
    y_pred_val = rf_5.predict(X_val)

    train_recall = recall_score(y_train, y_pred)
    val_recall = recall_score(y_val, y_pred_val)

    metrics.append({
        "max_depth": depth,
        "train_accuracy": train_recall,
        "validate_accuracy": val_recall,
    })

rand_forest = pd.DataFrame(metrics)
# Gap between train and validation recall at each depth.
rand_forest['difference'] = rand_forest.train_accuracy - rand_forest.validate_accuracy

In [None]:

# Train vs. validation score for each max_depth tried in the random-forest sweep.
plt.figure(figsize=(12,6))
plt.plot(rand_forest.max_depth, rand_forest.train_accuracy, marker = 'o', label= 'Train')
plt.plot(rand_forest.max_depth, rand_forest.validate_accuracy, marker = 'o', label= 'Validate')
#plt.plot(rand_forest.max_depth, rand_forest.difference, marker = '.', label = 'Difference')

plt.legend()
plt.title("Random Forest adjusting max_depth")
plt.xlabel("max_depth number")
# The plotted columns are filled with recall_score values, so label the axis as recall.
plt.ylabel("Recall")
plt.show()

In [None]:
# Sweep odd neighbour counts (1, 3, ..., 19) for KNN, scoring recall on the
# train and validation splits each time. Odd counts avoid tied votes when
# there are two classes.
# NOTE(review): the neighbour count is stored under the key "max_depth" and the
# result replaces the decision-tree sweep held in `df_tune`; both names are
# left as-is because the plotting cell reads df_tune.max_depth.
metrics = []

for i in range(1, 21, 2):
    knn_5 = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)

    y_pred, y_pred_val = knn_5.predict(X_train), knn_5.predict(X_val)

    in_sample_accuracy = recall_score(y_train, y_pred)
    out_of_sample_accuracy = recall_score(y_val, y_pred_val)

    output = {
        "max_depth" : i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy,
    }
    metrics.append(output)

df_tune = pd.DataFrame(metrics)
df_tune['difference'] = df_tune['train_accuracy'] - df_tune['validate_accuracy']

In [None]:

# Train vs. validation score for each neighbour count in the KNN sweep
# (the count is stored in df_tune under the column name "max_depth").
plt.figure(figsize=(12,6))
plt.plot(df_tune.max_depth, df_tune.train_accuracy, marker = 'o', label= 'Train')
plt.plot(df_tune.max_depth, df_tune.validate_accuracy, marker = 'o', label= 'Validate')
#plt.plot(df_tune.max_depth, df_tune.difference, marker = '.', label = 'Difference')
plt.legend()
# The sweep uses range(1, 21, 2), i.e. odd values from 1 to 19.
plt.title("KNN using odd n_neighbors from 1 to 19")
plt.xlabel("Number of n_neighbors")
# The plotted columns are filled with recall_score values, so label the axis as recall.
plt.ylabel("Recall")
plt.show()

‘lbfgs’ - [‘l2’, None]

‘liblinear’ - [‘l1’, ‘l2’]

‘newton-cg’ - [‘l2’, None]

‘newton-cholesky’ - [‘l2’, None] #Not available in the scikit-learn version used here (this solver was only added in scikit-learn 1.2)

‘sag’ - [‘l2’, None]

‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, None]

In [None]:
# Try every solver/penalty pairing listed below for logistic regression and
# record recall on the train and validation splits.
# NOTE: the "*_accuracy" keys hold recall scores; the key names are kept because
# the plotting cell reads logit_df.train_accuracy / logit_df.validate_accuracy.
import sklearn  # imported here only for the version check below

# scikit-learn 1.2 introduced penalty=None and deprecated the string 'none'
# (removed in 1.4); older releases accept only the string. Use whichever
# spelling the installed version understands so the sweep runs on both.
sk_major, sk_minor = (int(part) for part in sklearn.__version__.split('.')[:2])
no_penalty = None if (sk_major, sk_minor) >= (1, 2) else 'none'

# Penalties tried for each solver. An explicit table means an unlisted solver
# raises KeyError instead of silently reusing the previous solver's penalties.
penalties_by_solver = {
    'lbfgs': ['l2', no_penalty],
    'newton-cg': ['l2', no_penalty],
    'sag': ['l2', no_penalty],
    'liblinear': ['l1', 'l2'],
    'saga': ['elasticnet', 'l1', 'l2', no_penalty],
}
solver_list = ['lbfgs', 'newton-cg', 'sag', 'liblinear', 'saga']
metrics = []

for solver in solver_list:
    for penalty in penalties_by_solver[solver]:
        # l1_ratio is only supplied for elasticnet; every other penalty gets None.
        l1_ratio = 0.5 if penalty == 'elasticnet' else None
        # Use the notebook-wide seed rather than repeating the literal 1969.
        logit = LogisticRegression(penalty=penalty, solver=solver, l1_ratio=l1_ratio, random_state=random_seed)
        logit.fit(X_train, y_train)

        y_pred = logit.predict(X_train)
        y_pred_val = logit.predict(X_val)

        train_recall = recall_score(y_train, y_pred)
        val_recall = recall_score(y_val, y_pred_val)

        # Keep the label text 'none' for the unpenalised runs regardless of how
        # the option is spelled for the installed scikit-learn.
        penalty_label = 'none' if penalty is None else penalty

        metrics.append({
            "solver_penalty": f'{solver}_{penalty_label}',
            "train_accuracy": train_recall,
            "validate_accuracy": val_recall,
        })

logit_df = pd.DataFrame(metrics)
# Gap between train and validation recall for each combination.
logit_df['difference'] = logit_df.train_accuracy - logit_df.validate_accuracy

In [None]:
# Train vs. validation recall for every solver/penalty combination in logit_df,
# drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(logit_df['solver_penalty'], logit_df['train_accuracy'], marker='o', label='Train')
ax.plot(logit_df['solver_penalty'], logit_df['validate_accuracy'], marker='o', label='Validate')
#ax.plot(logit_df['solver_penalty'], logit_df['difference'], marker='.', label='Difference')

ax.legend()
ax.set_title("Logistic Regression Solvers and penalties")
ax.set_xlabel("Solver and Penalty")
# Tilt the combination labels so they do not overlap.
plt.xticks(rotation=45)
ax.set_ylabel("Recall")
plt.show()

In [None]:
#Possible next steps (not yet implemented):
#Up sampling
#Class weights