In [2]:
import pandas as pd
# Load the red-wine quality CSV straight from GitHub (needs network access).
data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv"
)

In [8]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [12]:
# The target takes several distinct integer values, so this is a
# multi-class classification problem.

data["quality"].unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [11]:
# Class balance of the target: number of wines per quality score.
data["quality"].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [4]:
# Column labels of the frame: eleven feature columns plus the `quality` target.
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [13]:
# info() lists each column's dtype and non-null count; every column reports
# 1599 non-null entries out of 1599 rows, so there are no missing values.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [14]:
# Summary statistics, transposed so each feature is one row.
# A max far above the 75% quantile is a hint that the feature has outliers.

data.describe().transpose()



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


## We need to standardize the data

We standardize because the features are measured in different units and on very different scales.

See the following link for other scalers; a different scaler may give better results - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

In [15]:
# Separate the features from the target. Nothing is scaled here: the
# train/test split must always happen before any preprocessing is fitted.

X = data.drop(columns="quality")
y = data.loc[:, "quality"]

In [16]:
# Features only: the `quality` column has been dropped.
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [17]:
# Target only: the integer quality score for each wine.
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

> ## Splitting before preprocessing

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a test set; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the quality classes are heavily imbalanced — consider passing
# stratify=y so the rare classes are represented in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
# Sanity-check the split sizes: test set first, then training set.
print(X_test.shape, X_train.shape, sep="\n")

(528, 11)
(1071, 11)


> ## Preprocessing

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
# Scaler with default settings; it has not been fitted to any data yet.
scaler = StandardScaler()

> We can also call `fit_transform` on the training data to fit and transform in a single step!

In [22]:
scaler.fit(X_train)   # learn each feature's mean and std dev from the training split only

In [24]:
# Per-feature means learned by fit(), one entry per column of X_train.

print(scaler.mean_)

[ 8.30345472  0.53246499  0.26933707  2.54691877  0.08772736 15.91223156
 46.76330532  0.99677933  3.31453782  0.65881419 10.41521942]


In [27]:
# Standardize the training features using the statistics learned by fit().
X_train_tf = scaler.transform(X_train)

> ## Now we will do model building  (Training then Testing)

In [28]:
from sklearn.svm import SVC

# Support-vector classifier left at scikit-learn's default hyperparameters.
model = SVC()

In [29]:
# Train the classifier on the standardized training features.
model.fit(X_train_tf, y_train)

In [30]:
# Accuracy on the same data the model was trained on (not a generalization estimate).
model.score(X_train_tf, y_train)

0.6778711484593838

> The model classifies only about 68% of the training data correctly

In [34]:
# Apply the scaler that was fitted on the training split; it is not refitted on test data.
X_test_tf = scaler.transform(X_test)

In [38]:
# Predicted quality labels for the standardized test set; show the first ten.

y_pred = model.predict(X_test_tf)
y_pred[:10]

array([5, 5, 6, 5, 6, 5, 5, 5, 6, 6], dtype=int64)

In [36]:
# True quality labels of the first ten test rows, to compare against the predictions.

y_test.head(10)

803     6
124     5
350     6
682     5
1326    6
976     5
1493    5
706     5
613     5
1587    6
Name: quality, dtype: int64

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [39]:
# Test-set accuracy of the SVC: fraction of predictions that match the true labels.
accuracy_score(y_test, y_pred)

0.5984848484848485

> ## We can try other techniques to see whether accuracy improves

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
# Second candidate model: logistic regression with default hyperparameters.
model2 = LogisticRegression()

In [43]:
# Train on the same standardized training features used for the SVC.
model2.fit(X_train_tf, y_train)

In [46]:
# Predicted quality labels from the logistic regression model on the test set.
y_pred2 = model2.predict(X_test_tf)

In [47]:
# Test-set accuracy of the logistic regression model.
accuracy_score(y_test, y_pred2)

0.571969696969697

# Understanding the mathematics is important before applying any technique



https://raw.githubusercontent.com/srinivasav22/Graduate-Admission-Prediction/master/Admission_Predict_Ver1.1.csv

Create an SVR model
Evaluate it using the R² (R-squared) score
Tune hyperparameters using GridSearchCV