## ---> _Data Preparation_

### Importing required libraries

In [15]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from numpy import mean,std

### Generate data

In [3]:
# Synthetic classification data: 1000 samples with 20 features
# (15 informative + 5 redundant); the fixed seed makes it reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=123,
)

### Do a train test split

In [4]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

### Initialize scaling object (min-max normalization)

In [5]:
# Min-max scaler: rescales every feature to the [0, 1] range (the library default, spelled out).
scaler = MinMaxScaler(feature_range=(0, 1))

### Initialize model object

In [6]:
# Logistic-regression classifier with library-default hyperparameters;
# it is appended to `steps` below as the final pipeline stage.
logModel = LogisticRegression()

### Create steps object

In [7]:
# Ordered (name, estimator) pairs for the pipeline: scale first, then classify.
steps = [
    ("minmaxscaler", scaler),
    ("model", logModel),
]

### Initialize pipeline

In [9]:
# Chain the scaler and the classifier into a single estimator.
pipe = Pipeline(steps=steps)

### Initialize repeated stratified K-fold cross-validator

In [8]:
# 10 stratified folds repeated 3 times (30 train/validation splits), seeded for reproducibility.
rskf = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=123,
)

### Evaluate the pipeline with cross-validation

In [13]:
# Cross-validate the whole pipeline (scaler + model) on the training split only,
# so the scaler is fitted within each fold rather than on all the data.
# n_jobs=-1 asks for all available CPU cores.
scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    cv=rskf,
    n_jobs=-1,
)

### Print out the mean and std of scores

In [17]:
# Summarize the cross-validation scores: central value and spread.
accuracy_mean = mean(scores)
accuracy_std = std(scores)
print("Mean of accuracy is : {!s}".format(accuracy_mean))
print("Standard deviation of accuracy is : {!s}".format(accuracy_std))

Mean of accuracy is : 0.8029166666666667
Standard deviation of accuracy is : 0.03286388172785166


## ---> _Data Cleaning_

### Load required libraries

In [18]:
import pandas as pd

#### import dataset

In [34]:
# Load the oil-spill data. The file is read with header=None, so pandas assigns
# integer column labels. read_csv (with no index_col) already returns a fresh
# 0..n-1 row index, so the former .reset_index(drop=True) was a redundant copy.
df = pd.read_csv("./Datasets/oil-spill-dataset/oil-spill.csv", header=None)

#### find out the unique values in each column

In [35]:
# Number of distinct values in each column (missing values are not counted).
counts = df.nunique(axis=0, dropna=True)

#### Collect the columns that have only one unique value

In [40]:
# Collect the *labels* of the constant columns (exactly one distinct value).
# `counts` is indexed by column label, so reading the labels from its index
# stays correct even when labels are not 0..n-1 (named headers, or columns
# already removed). Positions from enumerate() only matched the labels by
# coincidence, and DataFrame.drop(axis=1) looks columns up by label.
to_del = counts[counts == 1].index.tolist()
print(to_del)

[22]


#### Drop the columns that have only one unique value

In [41]:
# Drop the constant columns and show the shape before and after.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" makes
# this cell safe to re-run: a second run no longer raises KeyError for columns
# that have already been removed.
print(df.shape)
df = df.drop(to_del, axis=1, errors="ignore")
print(df.shape)

(937, 50)
(937, 49)
