# Classify Raisins with Hyperparameter Tuning Project

- [View Solution Notebook](./solution.html)
- [View Project Page](https://www.codecademy.com/projects/practice/mle-hyperparameter-tuning-project)

### 1. Explore the Dataset

In [1]:
# 1. Setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

raisins = pd.read_csv('Raisin_Dataset.csv')
raisins.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Raisin_Dataset.csv'

In [None]:
# 2. Create predictor and target variables, X and y
X = raisins.drop('Class', axis=1)
y = raisins['Class']

In [None]:
# 3. Examine the dataset
print("Number of features:", X.shape[1])
print("Total number of samples:", len(y))
print("Samples belonging to class '1':", y.sum())

Number of features: 7
Total number of samples: 900
Samples belonging to class '1': 450


In [None]:
# 4. Split the data set into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 19)

### 2. Grid Search with Decision Tree Classifier

In [None]:
# 5. Create a Decision Tree model
tree = DecisionTreeClassifier()

In [None]:
# 6. Dictionary of parameters for GridSearchCV
parameters = {'min_samples_split': [2,3,4], 'max_depth': [3,5,7]}

In [None]:
# 7. Create a GridSearchCV model
grid = GridSearchCV(tree, parameters)

# Fit the GridSearchCV model to the training data
grid.fit(X_train, y_train)

In [None]:
# 8. Print the model and hyperparameters obtained by GridSearchCV
print(grid.best_estimator_)

# Print best score
print(grid.best_score_)
# Print the accuracy of the final model on the test data
print(grid.score(X_test, y_test))

DecisionTreeClassifier(max_depth=5, min_samples_split=3)
0.8666666666666668
0.8133333333333334


In [None]:
# 9. Print a table summarizing the results of GridSearchCV
df = pd.concat([pd.DataFrame(grid.cv_results_['params']), pd.DataFrame(grid.cv_results_['mean_test_score'], columns=['Score'])], axis=1)
print(df)

   max_depth  min_samples_split     Score
0          3                  2  0.860741
1          3                  3  0.860741
2          3                  4  0.860741
3          5                  2  0.863704
4          5                  3  0.866667
5          5                  4  0.862222
6          7                  2  0.848889
7          7                  3  0.844444
8          7                  4  0.851852


### 2. Random Search with Logistic Regression

In [None]:
# 10. The logistic regression model
lr = LogisticRegression(solver = 'liblinear', max_iter = 1000)

In [None]:
# 11. Define distributions to choose hyperparameters from
from scipy.stats import uniform
distributions = {'penalty': ['l1', 'l2'], 'C': uniform(loc=0, scale=100)}

In [None]:
# 12. Create a RandomizedSearchCV model
clf = RandomizedSearchCV(lr, distributions, n_iter=8)

# Fit the random search model
clf.fit(X_train, y_train)

In [None]:
# 13. Print best esimator and best score
print(clf.best_estimator_)
print (clf.best_score_)

# Print a table summarizing the results of RandomSearchCV
df = pd.concat([pd.DataFrame(clf.cv_results_['params']), pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['Accuracy'])] ,axis=1)
print(df.sort_values('Accuracy', ascending = False))