# Red wine quality

In [1]:
import pandas as pd
import numpy as np

# Download the red-wine table: fields are separated by ';' and the first
# line of the file supplies the column names.
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
    header=0,
)

# Rows 0-9 are the samples under study; every later row forms the pool
# they are compared against further down.
RedWine = df.iloc[:10]
RedRest = df.iloc[10:]
RedWine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


# Red wine: min-max normalized values, z-score normalized values, mean-subtracted normalized values

In [2]:
# Suppress every warning from here on so the printed arrays stay readable.
import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing

# Columns handed to both scalers.
cols = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]

# z-score scaling, fitted on and applied to the ten red-wine rows.
std_scale = preprocessing.StandardScaler()
RedWine_std = std_scale.fit_transform(RedWine[cols])

# min-max scaling, fitted on and applied to the same ten rows.
minmax_scale = preprocessing.MinMaxScaler()
RedWine_minmax = minmax_scale.fit_transform(RedWine[cols])

print('minmax',RedWine_minmax)
print('z-score',RedWine_std)

minmax [[0.02564103 0.7        0.         0.14285714 0.33333333 0.125
  0.19047619 0.94117647 1.         0.29411765 0.         0.        ]
 [0.12820513 1.         0.         0.28571429 1.         1.
  0.58333333 0.64705882 0.11428571 0.64705882 0.36363636 0.        ]
 [0.12820513 0.8        0.07142857 0.2244898  0.81818182 0.375
  0.42857143 0.70588235 0.28571429 0.55882353 0.36363636 0.        ]
 [1.         0.         1.         0.14285714 0.3030303  0.5
  0.5        1.         0.         0.35294118 0.36363636 0.5       ]
 [0.02564103 0.7        0.         0.14285714 0.33333333 0.125
  0.19047619 0.94117647 1.         0.29411765 0.         0.        ]
 [0.02564103 0.63333333 0.         0.12244898 0.3030303  0.25
  0.26190476 0.94117647 1.         0.29411765 0.         0.        ]
 [0.15384615 0.53333333 0.10714286 0.08163265 0.12121212 0.375
  0.48809524 0.52941176 0.4        0.         0.         0.        ]
 [0.         0.61666667 0.         0.         0.         0.375
  0.03571429

In [3]:
# Mean-subtracted values: shift each column of the ten red-wine rows by that
# column's own mean, then preview the first five rows.
diff = RedWine.sub(RedWine.mean(), axis='columns')
diff.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.55,0.069,-0.104,-0.43,-0.001,-3.8,-14.9,0.00072,0.155,-0.029,-0.3,-0.5
1,-0.15,0.249,-0.104,0.27,0.021,10.2,18.1,-0.00028,-0.155,0.091,0.1,-0.5
2,-0.15,0.129,-0.064,-0.03,0.015,0.2,5.1,-8e-05,-0.095,0.061,0.1,-0.5
3,3.25,-0.351,0.456,-0.43,-0.002,2.2,11.1,0.00092,-0.195,-0.009,0.1,0.5
4,-0.55,0.069,-0.104,-0.43,-0.001,-3.8,-14.9,0.00072,0.155,-0.029,-0.3,-0.5


# Red wine: Manhattan distance, Euclidean distance, cosine distance

In [4]:
from sklearn.metrics.pairwise import euclidean_distances
import pandas as pd

# Plain value matrices: the ten query rows and the pool of all remaining rows.
np_10 = RedWine.values
np_after10 = RedRest.values

# Result table: one row per query sample, each cell holding a list of row numbers.
index = range(10)
columns = ['Nearest', 'Farthest']
euclidean10 = pd.DataFrame(index=index, columns=columns)

for i in index:
    # Euclidean distance from query row i to every pool row (a 1 x len(pool) matrix).
    dist = euclidean_distances(np_10[i:i+1], np_after10)
    # Pool positions ordered from the smallest distance to the largest.
    ranking = dist.argsort()[0]
    # The pool starts at row 10 of the full table, so pool position p is row p + 10.
    # 'Farthest' keeps ascending order, i.e. the most distant row comes last.
    euclidean10.at[i, 'Nearest'] = (ranking[:10] + 10).tolist()
    euclidean10.at[i, 'Farthest'] = (ranking[-10:] + 10).tolist()
    

In [5]:
# Show, for each of the ten red-wine query rows, the row numbers of its ten
# nearest and ten farthest pool rows under Euclidean distance.
euclidean10

Unnamed: 0,Nearest,Farthest
0,"[123, 262, 1379, 1380, 1012, 959, 715, 62, 849...","[672, 515, 684, 15, 109, 651, 354, 1244, 1079,..."
1,"[752, 1314, 1173, 1174, 1357, 81, 796, 68, 130...","[672, 515, 15, 684, 109, 651, 354, 1244, 1079,..."
2,"[196, 224, 58, 1361, 63, 17, 121, 118, 19, 1352]","[672, 515, 684, 15, 109, 651, 354, 1244, 1079,..."
3,"[788, 787, 232, 692, 1393, 737, 424, 422, 140,...","[672, 515, 684, 15, 109, 651, 354, 1244, 1079,..."
4,"[123, 262, 1379, 1380, 1012, 959, 715, 62, 849...","[672, 515, 684, 15, 109, 651, 354, 1244, 1079,..."
5,"[686, 847, 1312, 931, 933, 1328, 66, 1463, 217...","[672, 515, 684, 15, 109, 651, 354, 1244, 1079,..."
6,"[1502, 424, 422, 1352, 1353, 1183, 12, 1230, 1...","[14, 515, 684, 15, 109, 651, 354, 1244, 1079, ..."
7,"[1143, 1116, 1117, 1115, 1554, 1557, 1578, 867...","[672, 515, 15, 684, 109, 651, 354, 1244, 1079,..."
8,"[69, 762, 1451, 1530, 983, 980, 742, 1455, 29,...","[672, 515, 684, 15, 109, 651, 354, 1244, 1079,..."
9,"[11, 166, 587, 1397, 621, 1445, 1441, 138, 311...","[1060, 813, 1014, 1287, 986, 915, 984, 979, 10..."


In [6]:
from sklearn.metrics.pairwise import manhattan_distances

# Result table with one row per query sample; `index`, `columns`, `np_10` and
# `np_after10` are the objects created in the Euclidean-distance cell.
manhattan10 = pd.DataFrame(index=index, columns=columns)

for i in index:
    # Manhattan (L1) distance from query row i to every pool row.
    dist2 = manhattan_distances(np_10[i:i+1], np_after10)
    # Pool positions ordered from the smallest distance to the largest.
    ranking = dist2.argsort()[0]
    # Shift by 10 to turn pool positions into row numbers of the full table.
    manhattan10.at[i, 'Nearest'] = (ranking[:10] + 10).tolist()
    manhattan10.at[i, 'Farthest'] = (ranking[-10:] + 10).tolist()

In [7]:
# Show, for each of the ten red-wine query rows, the row numbers of its ten
# nearest and ten farthest pool rows under Manhattan distance.
manhattan10

Unnamed: 0,Nearest,Farthest
0,"[123, 262, 1012, 28, 1379, 1380, 959, 62, 715,...","[1131, 651, 396, 400, 14, 15, 354, 1244, 1079,..."
1,"[752, 1357, 796, 81, 1314, 1174, 1173, 1304, 4...","[651, 1131, 396, 400, 14, 15, 354, 1244, 1079,..."
2,"[196, 58, 63, 1361, 224, 1353, 1352, 246, 17, ...","[651, 1131, 396, 400, 14, 15, 354, 1244, 1079,..."
3,"[787, 788, 232, 84, 1502, 1393, 402, 692, 19, ...","[651, 1131, 14, 396, 400, 15, 354, 1244, 1079,..."
4,"[123, 262, 1012, 28, 1379, 1380, 959, 62, 715,...","[1131, 651, 396, 400, 14, 15, 354, 1244, 1079,..."
5,"[686, 931, 933, 1328, 847, 66, 237, 1312, 217,...","[1131, 651, 396, 400, 14, 15, 354, 1244, 1079,..."
6,"[1502, 422, 424, 12, 1183, 1353, 1352, 1393, 1...","[1131, 651, 400, 396, 14, 15, 354, 1244, 1079,..."
7,"[1117, 1116, 1115, 1555, 1143, 1527, 1554, 155...","[1131, 651, 400, 396, 14, 15, 354, 1244, 1079,..."
8,"[742, 1024, 1029, 29, 1530, 1236, 69, 171, 172...","[651, 1131, 396, 400, 14, 15, 354, 1244, 1079,..."
9,"[11, 587, 1397, 166, 1445, 1441, 621, 124, 138...","[1080, 1060, 1020, 1021, 986, 979, 984, 1244, ..."


In [8]:
from sklearn.metrics.pairwise import cosine_distances

# Result table with one row per query sample; `index`, `columns`, `np_10` and
# `np_after10` are the objects created in the Euclidean-distance cell.
cosine10 = pd.DataFrame(index=index, columns=columns)

for i in index:
    # Cosine distance from query row i to every pool row.
    dist3 = cosine_distances(np_10[i:i+1], np_after10)
    # Pool positions ordered from the smallest distance to the largest.
    ranking = dist3.argsort()[0]
    # Shift by 10 to turn pool positions into row numbers of the full table.
    cosine10.at[i, 'Nearest'] = (ranking[:10] + 10).tolist()
    cosine10.at[i, 'Farthest'] = (ranking[-10:] + 10).tolist()


In [9]:
# Show, for each of the ten red-wine query rows, the row numbers of its ten
# nearest and ten farthest pool rows under cosine distance.
cosine10

Unnamed: 0,Nearest,Farthest
0,"[123, 385, 47, 933, 931, 847, 262, 1379, 1380,...","[1080, 1014, 911, 1021, 1020, 1287, 986, 915, ..."
1,"[752, 796, 314, 616, 615, 1314, 319, 317, 1173...","[1060, 1080, 1014, 1020, 1021, 1287, 986, 915,..."
2,"[196, 224, 17, 422, 424, 1353, 1352, 1230, 58,...","[912, 1080, 1014, 1020, 1021, 1287, 986, 915, ..."
3,"[402, 664, 17, 788, 787, 272, 1319, 19, 232, 353]","[1080, 911, 1014, 1021, 1020, 986, 1287, 915, ..."
4,"[123, 385, 47, 933, 931, 847, 262, 1379, 1380,...","[1080, 1014, 911, 1021, 1020, 1287, 986, 915, ..."
5,"[686, 1328, 931, 933, 1312, 857, 847, 551, 348...","[1080, 1014, 911, 1021, 1020, 1287, 986, 915, ..."
6,"[135, 140, 1502, 936, 935, 1283, 692, 1353, 13...","[912, 1080, 1014, 1020, 1021, 1287, 986, 915, ..."
7,"[1549, 1143, 1555, 1115, 1116, 1117, 1557, 155...","[1077, 1079, 986, 1081, 1287, 1235, 915, 911, ..."
8,"[948, 950, 949, 29, 51, 1236, 69, 762, 929, 443]","[1493, 1496, 192, 1184, 1188, 189, 188, 1235, ..."
9,"[11, 1288, 1289, 1445, 1441, 860, 864, 40, 39,...","[912, 1080, 1014, 1021, 1020, 1287, 986, 915, ..."


# White wine quality

In [10]:
import pandas as pd
import numpy as np

# Download the white-wine table: fields are separated by ';' and the first
# line of the file supplies the column names.
df2 = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
    sep=';',
    header=0,
)

# Rows 0-9 are the samples under study; every later row forms the pool
# they are compared against further down.
WhiteWine = df2.iloc[:10]
WhiteRest = df2.iloc[10:]
WhiteWine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6


# White wine: min-max normalized values, z-score normalized values, mean-subtracted normalized values

In [11]:
# Suppress every warning from here on so the printed arrays stay readable.
import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing

# Columns handed to both scalers.
cols = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality']

# z-score normalized values.
# The scaler must be fitted on, and applied to, the white-wine rows: fitting
# and transforming RedWine here would store red-wine z-scores under the name
# WhiteWine_std and print them as if they described the white-wine sample.
std_scale = preprocessing.StandardScaler().fit(WhiteWine[cols])
WhiteWine_std = std_scale.transform(WhiteWine[cols])

# min-max normalized values, fitted on and applied to the same ten white-wine rows.
minmax_scale = preprocessing.MinMaxScaler().fit(WhiteWine[cols])
WhiteWine_minmax = minmax_scale.transform(WhiteWine[cols])

print('minmax',WhiteWine_minmax)
print('z-score',WhiteWine_std)

minmax [[0.42105263 0.5        0.74074074 1.         0.07142857 0.93939394
  0.82022472 1.         0.         0.55555556 0.         0.        ]
 [0.05263158 0.8        0.66666667 0.00520833 0.35714286 0.
  0.39325843 0.02777778 1.         1.         0.31818182 0.        ]
 [1.         0.6        0.88888889 0.28125    0.42857143 0.48484848
  0.         0.18055556 0.86666667 0.44444444 0.59090909 0.        ]
 [0.52631579 0.1        0.59259259 0.36458333 1.         1.
  1.         0.25       0.63333333 0.         0.5        0.        ]
 [0.52631579 0.1        0.59259259 0.36458333 1.         1.
  1.         0.25       0.63333333 0.         0.5        0.        ]
 [1.         0.6        0.88888889 0.28125    0.42857143 0.48484848
  0.         0.18055556 0.86666667 0.44444444 0.59090909 0.        ]
 [0.         1.         0.         0.28645833 0.07142857 0.48484848
  0.43820225 0.15277778 0.6        0.77777778 0.36363636 0.        ]
 [0.42105263 0.5        0.74074074 1.         0.07142857 0

In [12]:
# Mean-subtracted values: shift each column of the ten white-wine rows by that
# column's own mean, then preview the first five rows.
diff = WhiteWine.sub(WhiteWine.mean(), axis='columns')
diff.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.15,0.0,0.017,12.31,-0.0043,12.0,26.5,0.00499,-0.19,0.002,-0.92,0.0
1,-0.85,0.03,-0.003,-6.79,-0.0003,-19.0,-11.5,-0.00201,0.11,0.042,-0.22,0.0
2,0.95,0.01,0.057,-1.49,0.0007,-3.0,-46.5,-0.00091,0.07,-0.008,0.38,0.0
3,0.05,-0.04,-0.023,0.11,0.0087,14.0,42.5,-0.00041,4.440892e-16,-0.048,0.18,0.0
4,0.05,-0.04,-0.023,0.11,0.0087,14.0,42.5,-0.00041,4.440892e-16,-0.048,0.18,0.0


# White wine: Manhattan distance, Euclidean distance, cosine distance

In [13]:
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances

# Plain value matrices: the ten query rows and the pool of all remaining rows.
np2_10 = WhiteWine.values
np2_after10 = WhiteRest.values

# Result table: one row per query sample, each cell holding a list of row numbers.
index = range(10)
columns = ['Nearest', 'Farthest']
euclidean2_10 = pd.DataFrame(index=index, columns=columns)

for i in index:
    # Euclidean distance from query row i to every pool row (a 1 x len(pool) matrix).
    dist2_1 = euclidean_distances(np2_10[i:i+1], np2_after10)
    # Pool positions ordered from the smallest distance to the largest.
    ranking2 = dist2_1.argsort()[0]
    # The pool starts at row 10 of the full table, so pool position p is row p + 10.
    # 'Farthest' keeps ascending order, i.e. the most distant row comes last.
    euclidean2_10.at[i, 'Nearest'] = (ranking2[:10] + 10).tolist()
    euclidean2_10.at[i, 'Farthest'] = (ranking2[-10:] + 10).tolist()


In [14]:
# Show, for each of the ten white-wine query rows, the row numbers of its ten
# nearest and ten farthest pool rows under Euclidean distance.
euclidean2_10

Unnamed: 0,Nearest,Farthest
0,"[1487, 103, 2269, 191, 182, 14, 813, 773, 2005...","[740, 3094, 3095, 3901, 3710, 325, 1931, 2127,..."
1,"[956, 950, 955, 2732, 28, 563, 632, 1621, 1627...","[227, 2378, 3152, 2654, 3050, 2127, 325, 1931,..."
2,"[1726, 3890, 2737, 3779, 2149, 4194, 4192, 438...","[227, 2378, 3152, 3050, 2654, 325, 1931, 2127,..."
3,"[3395, 4751, 1492, 4451, 2241, 2236, 2253, 224...","[1544, 2673, 3119, 740, 3095, 3094, 1417, 3901..."
4,"[3395, 4751, 1492, 4451, 2241, 2236, 2253, 224...","[1544, 2673, 3119, 740, 3095, 3094, 1417, 3901..."
5,"[1726, 3890, 2737, 3779, 2149, 4194, 4192, 438...","[227, 2378, 3152, 3050, 2654, 325, 1931, 2127,..."
6,"[3602, 4826, 4825, 4588, 3648, 4718, 308, 3776...","[227, 2378, 3152, 2654, 3050, 325, 1931, 2127,..."
7,"[1487, 103, 2269, 191, 182, 14, 813, 773, 2005...","[740, 3094, 3095, 3901, 3710, 325, 1931, 2127,..."
8,"[956, 950, 955, 2732, 28, 563, 632, 1621, 1627...","[227, 2378, 3152, 2654, 3050, 2127, 325, 1931,..."
9,"[2723, 3891, 3882, 2356, 2088, 2467, 3526, 479...","[227, 2378, 3152, 2654, 3050, 325, 1931, 2127,..."


In [15]:
from sklearn.metrics.pairwise import manhattan_distances

# Result table with one row per query sample; `index`, `columns`, `np2_10` and
# `np2_after10` are the objects created in the white-wine Euclidean-distance cell.
manhattan2_10 = pd.DataFrame(index=index, columns=columns)

for i in index:
    # Manhattan (L1) distance from query row i to every pool row.
    dist2_2 = manhattan_distances(np2_10[i:i+1], np2_after10)
    # Pool positions ordered from the smallest distance to the largest.
    ranking2 = dist2_2.argsort()[0]
    # Shift by 10 to turn pool positions into row numbers of the full table.
    manhattan2_10.at[i, 'Nearest'] = (ranking2[:10] + 10).tolist()
    manhattan2_10.at[i, 'Farthest'] = (ranking2[-10:] + 10).tolist()

In [16]:
# Show, for each of the ten white-wine query rows, the row numbers of its ten
# nearest and ten farthest pool rows under Manhattan distance.
manhattan2_10

Unnamed: 0,Nearest,Farthest
0,"[103, 1487, 182, 191, 2269, 1996, 1994, 2005, ...","[3094, 740, 3119, 1417, 3050, 3710, 3901, 325,..."
1,"[956, 950, 955, 1275, 28, 1416, 1418, 4821, 21...","[1842, 2334, 659, 2654, 2127, 3050, 1417, 325,..."
2,"[1726, 2981, 4194, 4192, 3779, 2737, 4388, 101...","[3152, 2334, 659, 2654, 2127, 1417, 3050, 325,..."
3,"[3395, 4751, 1492, 3033, 4451, 1571, 1568, 224...","[2888, 325, 3094, 3095, 740, 3119, 1931, 3710,..."
4,"[3395, 4751, 1492, 3033, 4451, 1571, 1568, 224...","[2888, 325, 3094, 3095, 740, 3119, 1931, 3710,..."
5,"[1726, 2981, 4194, 4192, 3779, 2737, 4388, 101...","[3152, 2334, 659, 2654, 2127, 1417, 3050, 325,..."
6,"[3602, 4718, 4825, 4826, 4588, 308, 2949, 3648...","[3152, 2334, 659, 2654, 2127, 1417, 3050, 325,..."
7,"[103, 1487, 182, 191, 2269, 1996, 1994, 2005, ...","[3094, 740, 3119, 1417, 3050, 3710, 3901, 325,..."
8,"[956, 950, 955, 1275, 28, 1416, 1418, 4821, 21...","[1842, 2334, 659, 2654, 2127, 3050, 1417, 325,..."
9,"[3882, 3891, 2723, 2088, 4790, 2467, 2356, 352...","[1842, 2334, 659, 2654, 2127, 3050, 1417, 325,..."


In [17]:
from sklearn.metrics.pairwise import cosine_distances

# Result table with one row per query sample; `index`, `columns`, `np2_10` and
# `np2_after10` are the objects created in the white-wine Euclidean-distance cell.
cosine2_10 = pd.DataFrame(index=index, columns=columns)

for i in index:
    # Cosine distance from query row i to every pool row.
    dist2_3 = cosine_distances(np2_10[i:i+1], np2_after10)
    # Pool positions ordered from the smallest distance to the largest.
    ranking2 = dist2_3.argsort()[0]
    # Shift by 10 to turn pool positions into row numbers of the full table.
    cosine2_10.at[i, 'Nearest'] = (ranking2[:10] + 10).tolist()
    cosine2_10.at[i, 'Farthest'] = (ranking2[-10:] + 10).tolist()

In [18]:
# Show, for each of the ten white-wine query rows, the row numbers of its ten
# nearest and ten farthest pool rows under cosine distance.
cosine2_10

Unnamed: 0,Nearest,Farthest
0,"[1529, 1524, 410, 414, 1487, 229, 225, 103, 41...","[1536, 1544, 2984, 2673, 3119, 740, 3095, 3094..."
1,"[24, 2545, 407, 557, 4434, 1509, 754, 955, 950...","[4745, 2673, 2984, 3869, 3861, 740, 3095, 3094..."
2,"[3393, 4527, 4517, 4297, 3760, 3390, 514, 434,...","[526, 2781, 2673, 3119, 2984, 740, 3095, 3094,..."
3,"[760, 748, 3330, 2669, 3530, 3534, 1506, 4330,...","[1544, 1536, 2673, 3119, 2984, 740, 3095, 3094..."
4,"[760, 748, 3330, 2669, 3530, 3534, 1506, 4330,...","[1544, 1536, 2673, 3119, 2984, 740, 3095, 3094..."
5,"[3393, 4527, 4517, 4297, 3760, 3390, 514, 434,...","[526, 2781, 2673, 3119, 2984, 740, 3095, 3094,..."
6,"[1493, 3555, 1494, 4825, 4826, 2837, 27, 4380,...","[1536, 1544, 2673, 3119, 2984, 740, 3095, 3094..."
7,"[1529, 1524, 410, 414, 1487, 229, 225, 103, 41...","[1536, 1544, 2984, 2673, 3119, 740, 3095, 3094..."
8,"[24, 2545, 407, 557, 4434, 1509, 754, 955, 950...","[4745, 2673, 2984, 3869, 3861, 740, 3095, 3094..."
9,"[1889, 439, 2117, 1589, 2723, 1046, 299, 3891,...","[1544, 1536, 3119, 2673, 2984, 740, 3095, 3094..."
