In [None]:
import numpy as np
import matplotlib.pyplot as plt
# make_blobs is exported directly from sklearn.datasets; the old
# sklearn.datasets.samples_generator module path was deprecated and has been
# removed from current scikit-learn releases.
from sklearn.datasets import make_blobs
# scikit-learn includes various random sample generators that can be used to build
# artificial datasets of controlled size and complexity

make_blobs creates multiclass datasets by allocating each class one or more normally-distributed clusters of points.
make_blobs provides greater control over the centers and standard deviations of each cluster, and is used to demonstrate clustering.

In [None]:
# Print the built-in documentation for make_blobs
help(make_blobs)

In [None]:
# Small warm-up sample: 10 two-feature points drawn around 2 centers (fixed seed)
X, y = make_blobs(
    n_samples=10,
    n_features=2,
    centers=2,
    random_state=0,
)

In [None]:
# Draw 50 points around two centers with a cluster standard deviation of 0.60;
# the fixed random_state makes the sample reproducible.
X, y = make_blobs(
    n_samples=50,
    centers=2,
    cluster_std=0.60,
    random_state=0,
)

# Scatter the two features against each other, coloured by class label:
#   s    -- marker size, in points ** 2
#   c    -- per-point values (here the labels y) mapped to colours through cmap
#   cmap -- name of the colormap used for that mapping
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap='autumn')

A linear discriminative classifier would attempt to draw a straight line separating the two sets of data, and thereby create a model for classification.

# x positions at which each candidate line is evaluated
xfit = np.linspace(-1, 3.5)

# The labelled sample points
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap='autumn')

# A new point, drawn as a red 'x', whose label depends on the separator chosen
plt.plot([0.6], [2.1], 'x', color='r', markersize=10, markeredgewidth=2)

# Three (slope, intercept) pairs, each drawn as a black line
for m, b in ((1, 0.6), (0.5, 1.6), (-0.2, 2.9)):
    plt.plot(xfit, m * xfit + b, 'k')

plt.xlim(-1, 3.5)

These are three very different separators which, nevertheless, perfectly discriminate between these samples. Depending on which you choose, a new data point (e.g., the one marked as 'x' in this plot) will be assigned a different label. Evidently our simple intuition of "drawing a line between classes" is not enough, and we need to think a bit deeper.

#Maximizing the margin: support vector machines offer one way to improve on this. The intuition is this: rather than simply drawing a zero-width line between the classes, we can draw around each line a margin of some width, up to the nearest point. Here is an example of how this might look.

In [None]:
# x positions at which each candidate line is evaluated
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap='autumn')

# Each triple is (slope, intercept, half-width of the band drawn around the line)
for m, b, d in ((1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)):
    yfit = m * xfit + b
    plt.plot(xfit, yfit, 'k')
    # fill_between fills the area between two horizontal curves, here the
    # line shifted down and up by d, producing a translucent blue band
    plt.fill_between(xfit, yfit - d, yfit + d, color='b', alpha=0.4)

plt.xlim(-1, 3.5)

Let's see the result of an actual fit to this data: we will use Scikit-learn's Support Vector Classifier to train an SVM model on this data. We will use a linear kernel and set the C parameter to its default value.

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and C=1, trained on X, y
model = SVC(C=1, kernel='linear')
model.fit(X, y)

In [None]:
#plot the decision function
def plot_svc_decision_function(model,ax=None,plot_support=True):
    """Plot the decision boundary, margins and support vectors of a 2D SVC.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``decision_function`` and, when ``plot_support`` is
        true, a ``support_vectors_`` array with two columns.
    ax : matplotlib axes, optional
        Axes to draw on. When omitted, the current axes (``plt.gca()``,
        i.e. the last active axes) are used.
    plot_support : bool, default True
        Whether to circle the support vectors.

    The axes limits found on entry are restored before returning, so the
    overlay never rescales the existing plot.
    """
    if ax is None:
        # gca = "get current axes": a handle to the last active axes
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # create a 30 x 30 grid spanning the visible area to evaluate the model
    x = np.linspace(xlim[0], xlim[1], 30)
    y = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(y, x)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)

    # plot the decision boundary (level 0) and the margins (levels -1 and 1)
    ax.contour(X, Y, P, colors='k', levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

    # plot support vectors as hollow rings
    if plot_support:
        # facecolors must be the string 'none' (not the Python None, which
        # means "use the default fill") so the ring does not hide the data
        # point underneath; an explicit edge colour keeps the ring visible.
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=5,
                   facecolors='none', edgecolors='k')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

In [None]:
# Scatter the labelled points, then overlay the fitted model's decision
# boundary, margins and support vectors on the same axes
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap='autumn')
plot_svc_decision_function(model);

Two datasets are included, related to red and white Vinho Verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests.

#fixed acidity: most acids involved with wine are fixed or nonvolatile (do not evaporate readily)

#volatile acidity: the amount of acetic acid in wine, which at too high of levels can lead to an unpleasant, vinegar taste

#citric acid:found in small quantities, citric acid can add 'freshness' and flavor to wines

#residual sugar: the amount of sugar remaining after fermentation stops, it's rare to find wines with less than 1 gram/liter and wines with greater than 45 grams/liter are considered sweet

#chlorides:the amount of salt in the wine

#free sulfur dioxide:the free form of SO2 exists in equilibrium between molecular SO2 (as a dissolved gas) and bisulfite ion; it prevents microbial growth and the oxidation of wine

#total sulfur dioxide: amount of free and bound forms of SO2; in low concentrations, SO2 is mostly undetectable in wine, but at free SO2 concentrations over 50 ppm, SO2 becomes evident in the nose and taste of wine

#density: the density of wine is close to that of water, depending on the percent alcohol and sugar content

#pH:describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic); most wines are between 3-4 on the pH scale

#sulphates: a wine additive which can contribute to sulfur dioxide gas (SO2) levels, which acts as an antimicrobial and antioxidant

#alcohol:the percent alcohol content of the wine

#quality: output variable (based on sensory data, score between 0 and 10)

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the wine-quality table from a CSV file given by relative path,
# then preview its first rows
d = pd.read_csv("winequality.csv")
d.head()

In [None]:
# Summary of the frame: columns, dtypes and non-null counts
d.info()

In [None]:
# Number of missing values in each column
d.isnull().sum()

In [None]:
# Sulphates level for each quality score
sns.barplot(data=d, x='quality', y='sulphates')

In [None]:
# Volatile acidity for each quality score
sns.barplot(data=d, x='quality', y='volatile acidity')

In [None]:
# Alcohol content for each quality score
sns.barplot(data=d, x='quality', y='alcohol')

In [None]:
# Show how many rows fall under each quality score, before the quality
# column is categorized
d['quality'].value_counts()

In [None]:
# Categorize wine quality into two classes.
# pd.cut bins values into discrete intervals: it segments and sorts the data
# values into bins, which is useful for going from a continuous variable to a
# categorical one.
# The edges span the whole 0-10 scoring scale, so no score can fall outside
# the bins and silently become NaN:
#   [0, 6.5]  -> 'bad'
#   (6.5, 10] -> 'good'
bins = (0, 6.5, 10)
group_names = ['bad', 'good']
categories = pd.cut(d['quality'], bins, labels=group_names,
                    include_lowest=True)
# NOTE: this overwrites the numeric quality column in place, so the cell
# cannot be re-run without reloading the data first.
d['quality'] = categories

In [None]:
# Row counts per quality label after categorizing
d['quality'].value_counts()

In [None]:
# Alcohol content per quality label
# (author's reading of the plot: more alcohol, better red wine)
sns.barplot(data=d, x='quality', y='alcohol')

In [None]:
# Volatile acidity per quality label
# (author's reading of the plot: less volatile acidity, better red wine)
sns.barplot(data=d, x='quality', y='volatile acidity')

In [None]:
# Separate the target (quality) from the feature columns (everything else)
y = d['quality']
X = d.drop(columns=['quality'])

In [None]:
# Display the target column
y

In [None]:
# Encode the dependent variable (the quality column) as integer class codes
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# y is overwritten with the encoded values, so this cell is not idempotent
y = labelencoder.fit_transform(y)

In [None]:
# Display the encoded target
y

In [None]:
# Split into a training set and a testing set: 20% of the rows are held out
# for testing, and random_state fixes the split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Standardize the features so the classifier works on comparable scales.
# The scaler is fitted on the training set only; the test set is transformed
# with those same fitted parameters.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Fit an RBF-kernel SVM to the training set and score it on the test set
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix
classifier = SVC(kernel = 'rbf',random_state = 0)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first
accuracy_score(y_test, y_pred)

In [None]:
# k-fold cross-validation of the classifier on the training set
from sklearn.model_selection import cross_val_score

# cv determines the cross-validation splitting strategy. It accepts:
#   - None, to use the default 5-fold cross-validation
#   - an int, the number of folds
#   - a CV splitter
#   - an iterable yielding (train, test) splits as arrays of indices
# NOTE(review): cv=100 makes each validation fold very small, and the grid
# search later in this notebook uses cv=10 -- confirm 100 is intended.
accuracies = cross_val_score(classifier, X_train, y_train, cv=100)

# The model's average accuracy over all folds
accuracies.mean()

In [None]:
# Grid search for the best model and parameters: an exhaustive search over
# the specified parameter values for an estimator, scored here by accuracy
# with 10-fold cross-validation on the training set.
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel (C only) and an RBF kernel (C and gamma)
parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
]

grid_search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10)
grid_search.fit(X_train, y_train)

# Best score found, and the parameter combination that achieved it
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

In [None]:
# Parameter combination selected by the grid search
best_parameters

In [None]:
# Best score reported by the grid search
best_accuracy

In [None]:
# Fitting Kernel SVM to the Training set with best parameters
from sklearn.svm import SVC
# Build the classifier from the grid-search result instead of hard-coding
# kernel and gamma, so that the selected C, kernel and gamma are all applied
# and the cell stays correct if the search result changes.
classifier = SVC(random_state = 0, **best_parameters)
classifier.fit(X_train, y_train)

#Predicting the Test Set
y_pred = classifier.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first
accuracy_score(y_test, y_pred)