Introduction To Scikit-Learn

In [None]:
#  reading the data and getting it ready 
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt



In [None]:
heart=pd.read_csv("heart.csv")
heart

In [None]:
#  creating a features matrix
x=heart.drop("target",axis=1)

#  creating a label matrix 
y=heart["target"]


What is a hyperparameter ?

* It is a fine tuning parameter for a working ml model eg, like adjusting the speed of a fan .

Choosing the right model and hyperparameters 


In [None]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100)
#  we are using the default hyperparameters of RandomForest()
# clf.get_params()

In [None]:
#  Fit the model to the training data
from sklearn.model_selection import train_test_split
x_train , x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [None]:
clf.fit(x_train,y_train);

In [None]:
#  predicting the data
y_preds=clf.predict(x_test)
y_preds

In [None]:
y_test

In [None]:
#  evaluating the model on training data 
clf.score(x_train,y_train)

In [None]:

clf.score(x_test,y_test) 

The RandomForestClassifier obtains a 88.5% accuracy in test data without changing any hyperparameters of RandomForestClassifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
print(classification_report(y_test,y_preds))

In [None]:
confusion_matrix(y_test,y_preds)

In [None]:
accuracy_score(y_test,y_preds)

Improving the Model 

In [None]:
np.random.seed(42)
for i in range(10,100,10):
    print(f"trying model with {i} estimators \n")
    clf=RandomForestClassifier(n_estimators=i).fit(x_train,y_train)
    print(f"\n Model accuracy on test set:{clf.score(x_test,y_test)} ")

In [None]:
import pickle 
pickle.dump(clf,open("RandomForestClassifiermodel1.pkl","wb"))

In [None]:
load_model=pickle.load(open("RandomForestClassifiermodel1.pkl","rb"))
load_model.score(x_test,y_test)

Getting our data ready 

Three main things to be done 

* Splitting data into features and labels (usually `x` & `y`)

* Filling(imputing) our  missing data or to drop missing data

* Converting non-numerical values to numerical values (feature encoding)

In [None]:
heart.head()

In [None]:
x=heart.drop("target",axis=1)
x.head()

In [None]:
y=heart["target"]
y.head()

In [None]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)
xtrain

Performing feature encoding on a data set

In [None]:
car=pd.read_csv("carext.csv")
car.head()
car


In [None]:
car.dtypes

In [None]:
x=car.drop("Price",axis=1)
y=car["Price"]

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,train_size=0.2)
xtest.head()

The below code throws an error as there are strings present on the dataset and a ml model can't work on a string hence an error is observed 

```
from sklearn.ensemble import RandomForestRegressor
reg=RandomForestRegressor()
reg.fit(xtrain,ytrain)
reg.score(xtest,ytest)
```

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalfeatures=["Make","Colour","Doors"]
onehot=OneHotEncoder()
transformer=ColumnTransformer([("onehot",onehot,categoricalfeatures)],remainder="passthrough")
transformedx=transformer.fit_transform(x)
transformedx

In [None]:
pd.DataFrame(transformedx)

In [3]:
dummies=pd.get_dummies(car[["Make","Colour","Doors"]])
dummies

NameError: name 'car' is not defined

The  OneHotEncoder method and pd.get_dummies method give the same result however it is upto the coder which method he wants to use  

CamelCase --> first letter of every word is capital

In [None]:
dummies.head()

In [None]:
# refitting the model 
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()

np.random.seed(42)
xtrain,xtest,ytrain,ytest=train_test_split(transformedx,y,test_size=0.2)
model.fit(xtrain,ytrain)
model.score(xtest,ytest)

the above model is able to predict to an accuracy of about 33 % which is not acceptable and needs the hyperparameters to be fine tuned 

Dealing with missing values in a data set. There are 2 ways:
* Filling the missing value of  data set with some value also known as imputation
* Removing the missing value altogether 

In [None]:
# reading the missing dataset 
carmiss=pd.read_csv("carmissext.csv")
carmiss.head()
carmiss

In [None]:
carmiss.isna().sum()

In [None]:
x=carmiss.drop("Price",axis=1)
y=carmiss["Price"]
# pd.Series(["honda","audi","bmw"])

As we are using sklearn version 1.2 which can handle Nan values by itself and does not raises an exception hence the code written in cell below the print works fine 

In [None]:
print(sklearn.__version__)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalfeatures=["Make","Colour","Doors"]
onehot=OneHotEncoder()
transformer=ColumnTransformer([("onehot",onehot,categoricalfeatures)],remainder="passthrough")
transformedx=transformer.fit_transform(x)
transformedx

In [None]:
carmiss.isna().sum()

Option 1: Filling the Missing data using Pandas

In [None]:
carmiss["Make"].fillna("missing",inplace=True)
carmiss["Colour"].fillna("missing",inplace=True)
carmiss["Doors"].fillna(4,inplace=True)
carmiss["Odometer (KM)"].fillna(carmiss["Odometer (KM)"].mean(),
inplace=True)
# carmiss["Price"].fillna(carmiss["Price"])
carmiss.isna().sum()

Removing Rows with missing price values 

In [None]:
carmiss.dropna(inplace=True)
carmiss.isna().sum()

In [None]:
len(carmiss)

In [None]:
x=carmiss.drop("Price",axis=1)
y=carmiss["Price"]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalfeatures=["Make","Colour","Doors"]
onehot=OneHotEncoder()
transformer=ColumnTransformer([("onehot",onehot,categoricalfeatures)],remainder="passthrough")
transformedx=transformer.fit_transform(x)
transformedx

Filling missing data with scikit learn

In [None]:
carmiss=pd.read_csv("carmiss2.csv")
carmiss.head()

In [None]:
carmiss.isna().sum()

In [None]:
carmiss.dropna(subset=["Price"],inplace=True)
carmiss.isna().sum()

In [None]:
x=carmiss.drop("Price",axis=1)
y=carmiss["Price"]

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
#  cat stands for categorical 
#  num stands for numerical 

catimputer=SimpleImputer(strategy="constant",fill_value="missing")
doorimputer=SimpleImputer(strategy="constant",fill_value=4)
numimputer=SimpleImputer(strategy="mean")

catfeatures=["Make","Colour"]
doorfeatures=["Doors"]
numfeatures=["Odometer (KM)"]

imputr=ColumnTransformer([
    ("catimputer",catimputer,catfeatures),
    ("doorimputer",doorimputer,doorfeatures),
    ("numimputer",numimputer,numfeatures)
])
filledx=imputr.fit_transform(x)
filledx



In [None]:
carfilled=pd.DataFrame(filledx,columns=["make","colour","doors","odometer"])
carfilled


In [None]:
carfilled.isna().sum()

In [None]:
y

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalfeatures=["make","colour","doors"]
onehot=OneHotEncoder()
transformer=ColumnTransformer([("onehot",onehot,categoricalfeatures)],remainder="passthrough")
transformedx=transformer.fit_transform(carfilled)
transformedx

In [None]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(transformedx,y,test_size=0.2)

In [None]:
clf=RandomForestRegressor()
clf.fit(xtrain,ytrain)

In [None]:
clf.score(xtrain,ytrain)

the RandomForestRegressor used gives us an accuracy of about 22 % which is unacceptable

In [None]:
clf.score(xtest,ytest)

Choosing the righ model \ algorithm \ estimators:

Some things to note:

* sklearn refer ml models\ algorithms as estimators
* Classification problem - predicting a category (heart disease or not) 
    * `clf` is used for classifier as shortform
* Regression problem - predicting a number (price of a house)    

In [None]:
data=pd.read_csv("housing.csv")
data.head()
data=data.dropna()
data.isna().sum()

In [None]:
from sklearn.linear_model import Ridge
np.random.seed(42)
x=data.drop("median_house_value",axis=1)
y=data["median_house_value"]
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)
model=Ridge()
model.fit(xtrain,ytrain)


In [None]:
model.score(xtrain,ytrain)

In [None]:
model.score(xtest,ytest)

In [None]:
model.score(xtest,ytest)

In [None]:
xtest

The Ridge model provides an accuracy of 64 % which is not acceptable 

#  Using RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
model= RandomForestRegressor()
model.fit(xtrain,ytrain)


In [None]:
model.score(xtest,ytest)

The RandomForestRegressor gives us an accuracy of about 83 % which is acceptable but can use some hyperparameter tuning to boost performance

# Classification Problem
On heart dataset

In [None]:
len(heart)

In [None]:
from sklearn.svm import LinearSVC

np.random.seed(42)
x=heart.drop(["target"],axis=1)
y=heart["target"]
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)
clf=LinearSVC(max_iter=10000000)
clf.fit(x,y)
clf.score(xtrain,ytrain)

In [None]:
clf.score(xtest,ytest)

the LinearSVC model gives us an accuracy of around 73 % which needs to be improved after increasing the vlaue of max iter we get around 88 % accuracy which is acceptable 

# Using RandomForestClassifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
x=heart.drop(["target"],axis=1)
y=heart["target"]
np.random.seed(42)
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)
clf.fit(xtrain,ytrain)
clf.score(xtrain,ytrain)

In [None]:
clf.score(xtest,ytest)

we observed an overfitting of data when evaluating the score of training data where as we observe an accuracy of 85 % in RandomForestClassifier 

eventhough the accuracy of LinearSVC is more than RandomForestClassifier by about 3% the time taken to train the LinearSVC is about 1 min 30 sec where as that of RandomForestClassifier is about 0.7 s hence we would prefer the RandomForestClassifier model

# Making predictions with help of a ML model

In [None]:
y_preds=clf.predict(xtest)
np.mean(y_preds==ytest) # --> just a fancy way for using clf.score() function


In [None]:
clf.score(xtest,ytest)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest,y_preds)

# Making predictions with `predict_proba()`

In [None]:
clf.predict_proba(xtest[:5])

in the above output the `predict_proba()` returns an array of which the left column denotes the probablity of 0 value where as right column denotes the probablity of 1 value of target. The probablity can never be greater than 1 in any case 

In [None]:
clf.predict(xtest[:5])

In [None]:
data.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
model=RandomForestRegressor()
x=data.drop("median_house_value",axis=1)
y=data["median_house_value"]
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)
model.fit(xtrain,ytrain)
model.score(xtrain,ytrain)
ypreds=model.predict(xtest)



In [None]:
model.score(xtest,ytest)

In [None]:
# model.predict_proba(xtest[:5])

In [None]:
# ypreds=model.predict(xtest)

In [None]:
from sklearn.metrics import mean_absolute_error
ytest,ypreds


In [None]:
len(ypreds),len(ytest)

In [None]:
mean_absolute_error(ytest,ypreds)

In [None]:
ytest[:5]

# Evaluating an ML model :
Three ways to do :
* `score()`
* `scoring` parameter
* problem specific metric


* clf=RandomForestClassifier(n_estimators=100000000)


In [None]:
heart=pd.read_csv("heart.csv")
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
x= heart.drop(["target"],axis=1)
y=heart["target"]

In [None]:
x=x.dropna()
y=y.dropna()


In [None]:
np.random.seed(42)
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)
clf.fit(xtrain,ytrain)
clf.score(xtest,ytest)

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf,x,y)

the `cross_val_score()` returns an array of score trained and tested on different portions of data  the greater the value of `cv` in function more we can clearly figure out the true potential of our model

In [None]:
cross_val_score(clf,x,y,cv=100)


In [None]:
clf.score(xtest,ytest),np.mean(cross_val_score(clf,x,y,cv=10))

The accuracy of the `clf` model is `np.mean(cross_val_score(clf,x,y,cv=10))` 

# area under reciever operating  charctersistic curve (AUC \ ROC)

* Simply it is the comparision between true positive rate (tpr) and false positive rate (fpr)

* True positive rate : model predicts `1` when target is `1`

* False positive rate : model predicts `1` when target is `0`

* True negative rate : model predicts `0` when target is `0`

* False negative rate : model predicts `0` when target is `1`


In [None]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)
clf.fit(xtrain,ytrain)



In [None]:
from sklearn.metrics import roc_curve
ypreds=clf.predict_proba(xtest)
ypreds[:10]

In [None]:
ypos=ypreds[:,1]
ypos[:10]

In [None]:
#  calculating tpr ,fpr & threshold
fpr,tpr,thrs=roc_curve(ytest,ypos)


In [None]:
import matplotlib.pyplot as plt
# plt.plot(fpr,tpr)
def PlotROCCurve(fpr,tpr):
    """ 
    This function plots the ROC curve if tpr and fpr is provided
    """
    #  plotting the model 
    plt.plot(fpr,tpr,color="green",label="ROC")

    # plotting the layman guess 

    plt.plot([0,1],[0,1],color="red",linestyle="--",label="Guessing")

    # customizing the graph
    plt.xlabel("false positive rate (fpr) ")
    plt.ylabel("true positve rate (tpr)")
    plt.title("Recviever Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()
    
PlotROCCurve(fpr,tpr)    



In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(ytest,ypos) # --> a score of 94.8 % is obtained 

The `roc_auc_score()` provides the area enclosed by the curve obtained in the `PlotROCCurve()` function 

In [None]:
fpr,tpr,thrs=roc_curve(ytest,ypos)

In [None]:
PlotROCCurve(fpr,tpr) # -->graph obtained is same as above

In [None]:
#  plotting the perfect ROC graph
fpr,tpr,thrs=roc_curve(ytest,ytest)

PlotROCCurve(fpr,tpr)

In [None]:
#  the above graph obtained has an `roc_auc_score()` of 1.0 i.e. it is perfect
roc_auc_score(ytest,ytest)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
ypreds=clf.predict(xtest)
confusion_matrix(ypreds,ytest)

The anti principal diagonal elements(off diagonal) of a `confusion_matrix` are `fpr`
and `fnr ` respectively

In [None]:
pd.crosstab(ytest,
ypreds,
rownames=["actual value"],
colnames=["predicted values"])


In [None]:
#  creating a visual confusion matrix with help of seaborn
import seaborn as sns

sns.set(font_scale=1.5)

cmat=confusion_matrix(ytest,ypreds)

sns.heatmap(cmat);

In [None]:
sklearn.__version__

In [None]:
clf

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(estimator=clf,X=x,y=y);

In [None]:
ConfusionMatrixDisplay.from_predictions(ytest,ypreds);

In [None]:
from sklearn.metrics import classification_report
print(classification_report(ytest,ypreds))

In [None]:
from sklearn.ensemble import RandomForestClassifier
housing=pd.read_csv("housing.csv")
x=housing.drop(housing["target"],axis=1)
y=housing["target"
housing.head()

40 8:36