# Exploratory Data Analysis: 

These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are much more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines.

In [32]:
#read in file as dataframe 
# import pyspark modules
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.mllib.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
import pandas as pd
import os
import pyspark.sql.functions as F

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)


In [3]:
lines = pd.read_csv("winequalityN.csv") 

In [40]:
#convert pandas df into pyspark df
df = sqlCtx.createDataFrame(lines)

In [41]:
df.show(4)

+-----+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+------------------+----+---------+-------+-------+
| type|fixed acidity|volatile acidity|citric acid|residual sugar|           chlorides|free sulfur dioxide|total sulfur dioxide|           density|  pH|sulphates|alcohol|quality|
+-----+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+------------------+----+---------+-------+-------+
|white|          7.0|            0.27|       0.36|          20.7|               0.045|               45.0|               170.0|             1.001| 3.0|     0.45|    8.8|      6|
|white|          6.3|             0.3|       0.34|           1.6|               0.049|               14.0|               132.0|0.9940000000000001| 3.3|     0.49|    9.5|      6|
|white|          8.1|            0.28|        0.4|           6.9|                0.05|               30.0|    

In [42]:
# count number of observations in df 
df.count()

6497

In [7]:
df.printSchema()

root
 |-- type: string (nullable = true)
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: long (nullable = true)



In [8]:
df.describe(["density", "quality"]).show()

+-------+-------------------+------------------+
|summary|            density|           quality|
+-------+-------------------+------------------+
|  count|               6497|              6497|
|   mean| 0.9946966338309992| 5.818377712790519|
| stddev|0.00299867300371893|0.8732552715311251|
|    min|            0.98711|                 3|
|    max|            1.03898|                 9|
+-------+-------------------+------------------+



In [16]:
min_quality = df.agg({'quality':'min'}).collect()[0][0]
max_quality = df.agg({'quality':'max'}).collect()[0][0]

max_quality

9

Quality Ranges from 3 to 9 in our data, presumably out of a 10 point scale

#Deal with outliers 
from pyspark.sql.functions import mean, stddev

Calculate values used for outlier filtering
mean_val = df.agg({'log_SalesClosePrice': 'mean'}).collect()[0][0]
stddev_val = df.agg({'log_SalesClosePrice': 'stddev'}).collect()[0][0]

Create three standard deviation (μ ± 3σ) lower and upper bounds for data
low_bound = mean_val - (3 * stddev_val)
hi_bound = mean_val + (3 * stddev_val)

Filter the data to fit between the lower and upper bounds
df = df.where((df['log_SalesClosePrice'] < hi_bound) & (df['log_SalesClosePrice'] > low_bound))

In [10]:
df.columns

['type',
 'fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

Possible ideas: 
1) Build a model to predict (classification) wine quality: high, medium, low'
1-3 is low, 4-6 is medium, 7-10 is high 
There is a lot more "medium" wines, than "low" or "high" quality 
2) build a model to preidct wine type: red or white

Drop Duplicates

In [57]:
# row count
print('rows={}'.format(df.count()))

rows=6497


In [56]:
# distinct row count
print('rows={}'.format(df.distinct().count()))

rows=5329


In [44]:
df_no_dups = df.dropDuplicates()
df2 = df_no_dups

Inspect Number of wines in each category 

In [45]:
low = df2.filter((df2.quality > 0) & (df2.quality <= 3)).count()
low

30

In [46]:
medium = df2.filter((df2.quality >= 4) & (df2.quality <= 6)).count()
medium

4288

In [47]:
high = df2.filter((df2.quality > 6 ) & (df2.quality < 10)).count()
high

1011

In [51]:
total = df2.count()
#percentage low
low_per = low/total
low_per

0.00562957402889848

In [52]:
medium_per = medium/total
medium_per

0.8046537811972228

In [53]:
high_per = high/total
high_per

0.18971664477387878

So, approximatly:

    Low: 0.005 %
    Medium: 0.805 %
    High: 0.190 %
We have to figure out a way to deal with low's small amount 

Idenitfy columns with a lot of missing values 

In [58]:
df2.agg(*[
    (1 - F.count(c) / F.count('*')).alias(c + '_missing')
    for c in df.columns
]).show()



+------------+---------------------+------------------------+-------------------+----------------------+-----------------+---------------------------+----------------------------+---------------+----------+-----------------+---------------+---------------+
|type_missing|fixed acidity_missing|volatile acidity_missing|citric acid_missing|residual sugar_missing|chlorides_missing|free sulfur dioxide_missing|total sulfur dioxide_missing|density_missing|pH_missing|sulphates_missing|alcohol_missing|quality_missing|
+------------+---------------------+------------------------+-------------------+----------------------+-----------------+---------------------------+----------------------------+---------------+----------+-----------------+---------------+---------------+
|         0.0|                  0.0|                     0.0|                0.0|                   0.0|              0.0|                        0.0|                         0.0|            0.0|       0.0|              0.0|     

It appears there are no missing values in the data 

In [60]:
cols = [c for c in df2.columns if c != 'type']   # exclude id from features
bounds = {} # will store lower and upper bounds for each feature

In [61]:
for col in cols:
    quantiles = df2.approxQuantile(col, [0.25, 0.75], 0.05)
    IQR = quantiles[1] - quantiles[0] 

    bounds[col] = [
     quantiles[0] - 1.5 * IQR,
     quantiles[1] + 1.5 * IQR
    ]

In [62]:
# append outlier indicators to data table

outliers = df2.select(*['type'] + [
 (
 (df2[c] < bounds[c][0]) | (df2[c] > bounds[c][1]))
    .alias(c + '_outlier') for c in cols
])

In [None]:
outliers.show(4)

In [72]:
for col in outliers.columns:
    new = outliers.filter(outliers[col] == True)

In [81]:
outliers.select('quality_outlier', 'alcohol_outlier').show(2)

+---------------+---------------+
|quality_outlier|alcohol_outlier|
+---------------+---------------+
|          false|          false|
|          false|          false|
+---------------+---------------+
only showing top 2 rows



#Deal with outliers from pyspark.sql.functions import mean, stddev
#Calculate values used for outlier filtering 
mean_val = df.agg({'log_SalesClosePrice': 'mean'}).collect()[0][0] stddev_val = df.agg({'log_SalesClosePrice': 'stddev'}).collect()[0][0]

#Create three standard deviation (μ ± 3σ) lower and upper bounds for data 
low_bound =  mean_val - (3 * stddev_val) hi_bound = mean_val + (3 * stddev_val)

#Filter the data to fit between the lower and upper bounds 
df = df.where((df['log_SalesClosePrice'] < hi_bound) & (df['log_SalesClosePrice'] > low_bound))