# Cross Validation

* _Leave one out CV ( LOOCV )_
    * computational cost increases with dataset size (rarely used nowadays)
    * prone to overfitting
* _Leave P out CV ( LPOCV )_ - P value will be given
* _K-Fold CV_ - the data is split into k folds, and each fold is used once as the validation set
* _Stratified K-Fold CV_ - follows K-Fold CV, but each fold keeps the same proportion of classes (e.g. of 0s and 1s in binary classification) as the full dataset
* _Time series CV_ - Time Series Application
    * Product sentiment analysis

### GridSearchCV, RandomizedSearchCV

In [31]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold,RandomizedSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,classification_report,recall_score,roc_auc_score,roc_curve
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# any that scikit-learn emits during the searches below -- consider narrowing it.
warnings.filterwarnings("ignore")

### creating a dataset

In [21]:
# Synthetic binary-classification problem: 1000 samples, 5 features, fixed seed
x, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_classes=2,
    random_state=40,
)
X = pd.DataFrame(x)  # tabular view of the feature matrix
X

Unnamed: 0,0,1,2,3,4
0,0.370111,-1.037805,1.608539,-0.392693,-0.360583
1,-0.143981,0.018132,-0.891866,0.533915,0.448818
2,-0.505874,-0.249010,-1.916002,0.131998,0.165209
3,0.244301,-0.264726,1.342619,-0.661488,-0.563662
4,0.075090,0.113369,-0.318921,0.844558,0.675016
...,...,...,...,...,...
995,1.013373,-0.239873,2.857226,1.140579,0.806412
996,-0.641775,0.577578,-0.525285,-2.561730,-1.999714
997,-0.123521,0.561108,-0.928851,0.692547,0.574872
998,-0.120211,-0.110192,-1.501752,1.530211,1.252588


### Train and test splits

In [22]:
# Hold out 33% of the samples as a test set (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=32
)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(670, 5) (330, 5) (670,) (330,)


### Creating model , creating params

In [23]:
# Base estimator and the candidate values for each hyperparameter
logistic = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
c_values = [0.0001, 0.001, 0.01, 0.1, 10, 50, 100, 500, 1000]
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
# Every pairing of a class-0 weight with a class-1 weight (8 x 8 = 64 dicts)
class_weights = [
    {0: weight_0, 1: weight_1}
    for weight_0 in [0, 1, 10, 20, 30, 50, 70, 100]
    for weight_1 in [0, 1, 10, 20, 30, 50, 70, 100]
]

In [24]:
# Build the search space as a list of sub-grids, one per solver, so that each
# solver is only paired with penalties it supports. A single flat dict also
# generates unsupported pairs (e.g. lbfgs with l1) and elasticnet without an
# l1_ratio; those fits fail, contribute no usable score, and only waste time.
solver_penalties = {
    'lbfgs': ['l2'],
    'liblinear': ['l1', 'l2'],
    'newton-cg': ['l2'],
    'newton-cholesky': ['l2'],
    'sag': ['l2'],
    'saga': ['l1', 'l2', 'elasticnet'],
}
params = []
for solver_name in solver:
    supported = [p for p in penalty if p in solver_penalties.get(solver_name, [])]
    plain = [p for p in supported if p != 'elasticnet']
    if plain:
        params.append(dict(penalty=plain, C=c_values, solver=[solver_name],
                           class_weight=class_weights))
    if 'elasticnet' in supported:
        # elasticnet needs an explicit L1/L2 mixing ratio to be fit at all
        params.append(dict(penalty=['elasticnet'], l1_ratio=[0.5], C=c_values,
                           solver=[solver_name], class_weight=class_weights))


### Hyperparameter Tuning

* GridSearchCV

In [25]:
# Exhaustive grid search over `params`, scored by accuracy on stratified folds
grid_model = GridSearchCV(
    estimator=logistic,
    param_grid=params,
    scoring='accuracy',
    cv=StratifiedKFold(),
    n_jobs=-1,
)
grid_model

In [26]:
# Run the grid search on the training split only; the test split stays unseen
grid_model.fit(x_train,y_train)

In [27]:
# Hyperparameter combination that achieved the best cross-validated score
grid_model.best_params_

{'C': 0.0001,
 'class_weight': {0: 100, 1: 100},
 'penalty': 'l2',
 'solver': 'lbfgs'}

In [29]:
# Cross-validated accuracy of that best combination (scoring='accuracy')
grid_model.best_score_

np.float64(0.8835820895522388)

In [28]:
# Predict labels for the held-out test split with the tuned grid-search model
y_pred = grid_model.predict(x_test)

In [32]:
# Test-set metrics for the grid-search model: compute first, then report
score = accuracy_score(y_test, y_pred)
con_mat = confusion_matrix(y_test, y_pred)
clas_rep = classification_report(y_test, y_pred)

# end="\n\n" reproduces the blank line that separates the report sections
print(f"Accuracy Score : {score}", end="\n\n")
print(f"confusion Matrix : \n \n {con_mat}", end="\n\n")
print(f"Classification Report : \n \n {clas_rep}")

Accuracy Score : 0.8939393939393939

confusion Matrix : 
 
 [[150  12]
 [ 23 145]]

Classification Report : 
 
               precision    recall  f1-score   support

           0       0.87      0.93      0.90       162
           1       0.92      0.86      0.89       168

    accuracy                           0.89       330
   macro avg       0.90      0.89      0.89       330
weighted avg       0.90      0.89      0.89       330



### Randomized Search CV

In [33]:
# Fresh estimator plus the candidate values to sample from in the random search
logistic = LogisticRegression()
solver = [
    'lbfgs', 'liblinear', 'newton-cg',
    'newton-cholesky', 'sag', 'saga',
]
c_values = [0.0001, 0.001, 0.01, 0.1, 10, 50, 100, 500, 1000]
penalty = ['l1', 'l2', 'elasticnet']
# One dict per (class-0 weight, class-1 weight) pair: 8 x 8 = 64 options
class_weights = [
    {0: w_class0, 1: w_class1}
    for w_class0 in [0, 1, 10, 20, 30, 50, 70, 100]
    for w_class1 in [0, 1, 10, 20, 30, 50, 70, 100]
]

In [34]:
# Describe the search space as per-solver sub-grids so that only valid
# solver/penalty pairs can be drawn. With a single flat dict most random draws
# pair a solver with a penalty it does not support (or pick elasticnet without
# an l1_ratio); those candidates fail to fit and use up the sampling budget.
solver_penalties = {
    'lbfgs': ['l2'],
    'liblinear': ['l1', 'l2'],
    'newton-cg': ['l2'],
    'newton-cholesky': ['l2'],
    'sag': ['l2'],
    'saga': ['l1', 'l2', 'elasticnet'],
}
params = []
for solver_name in solver:
    supported = [p for p in penalty if p in solver_penalties.get(solver_name, [])]
    plain = [p for p in supported if p != 'elasticnet']
    if plain:
        params.append(dict(penalty=plain, C=c_values, solver=[solver_name],
                           class_weight=class_weights))
    if 'elasticnet' in supported:
        # elasticnet needs an explicit L1/L2 mixing ratio to be fit at all
        params.append(dict(penalty=['elasticnet'], l1_ratio=[0.5], C=c_values,
                           solver=[solver_name], class_weight=class_weights))

In [36]:
# Randomized search: scores a random sample of n_iter candidates drawn from
# `params` instead of every combination.
# random_state fixes which candidates are drawn, so best_params_ and
# best_score_ do not change from one kernel restart to the next.
randomCV = RandomizedSearchCV(
    estimator=logistic,
    param_distributions=params,
    n_iter=10,  # scikit-learn's default, spelled out for the reader
    cv=5,
    scoring='accuracy',
    random_state=42,
)
randomCV

In [37]:
# Run the randomized search on the training split only
randomCV.fit(x_train,y_train)

In [38]:
# Best hyperparameter combination among the sampled candidates
randomCV.best_params_

{'solver': 'lbfgs', 'penalty': 'l2', 'class_weight': {0: 10, 1: 20}, 'C': 500}

In [39]:
# Cross-validated accuracy of that best sampled combination (scoring='accuracy')
randomCV.best_score_

np.float64(0.8567164179104477)

In [40]:
# Predict the held-out test split with the randomized-search model
# (overwrites the y_pred produced by the grid-search section)
y_pred = randomCV.predict(x_test)

In [41]:
# Test-set metrics for the randomized-search model
score = accuracy_score(y_test, y_pred)
con_mat = confusion_matrix(y_test, y_pred)
clas_rep = classification_report(y_test, y_pred)

# Report the three results, separated by a blank line
print(f"Accuracy Score : {score}", end="\n\n")
print(f"confusion Matrix : \n \n {con_mat}", end="\n\n")
print(f"Classification Report : \n \n {clas_rep}")

Accuracy Score : 0.8666666666666667

confusion Matrix : 
 
 [[129  33]
 [ 11 157]]

Classification Report : 
 
               precision    recall  f1-score   support

           0       0.92      0.80      0.85       162
           1       0.83      0.93      0.88       168

    accuracy                           0.87       330
   macro avg       0.87      0.87      0.87       330
weighted avg       0.87      0.87      0.87       330

